VidSimplify / manimator /utils /content_preprocessor.py
Adityahulk
adding pdf parsing logic correctly
3ccc955
"""
Content Preprocessor for Long Inputs
Handles very long content (PDFs, large text) by:
1. Chunking content into logical sections
2. Numbering sections for explicit coverage
3. Ensuring proportional representation in the video
"""
import logging
import re
from typing import List, Tuple
logger = logging.getLogger(__name__)
def chunk_content(content: str, max_words_per_chunk: int = 150) -> List[str]:
    """
    Split content into logical chunks based on paragraphs and sentences.

    Paragraphs (separated by blank lines) are packed greedily into chunks.
    A paragraph that alone exceeds the limit is split on sentence
    boundaries, and a sentence that alone exceeds the limit is split on
    word boundaries, so no returned chunk is longer than the limit.

    Args:
        content: The full text content
        max_words_per_chunk: Maximum words per chunk. Values below 1 are
            treated as 1.

    Returns:
        List of content chunks (empty if the content has no words)
    """
    # A non-positive limit would make the word-level split loop forever
    # (zero step), so clamp it to the smallest meaningful value.
    limit = max(1, max_words_per_chunk)

    # Paragraphs are separated by one or more blank lines.
    paragraphs = [p.strip() for p in re.split(r'\n\s*\n', content.strip())]
    paragraphs = [p for p in paragraphs if p]

    chunks: List[str] = []
    current_chunk: List[str] = []
    current_word_count = 0

    for para in paragraphs:
        para_words = len(para.split())

        if para_words > limit:
            # Commit whatever was accumulated so the oversized paragraph
            # keeps its position in the document order.
            if current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_word_count = 0
            chunks.extend(_split_long_paragraph(para, limit))
        elif current_chunk and current_word_count + para_words > limit:
            # Paragraph fits on its own but not in the current chunk.
            chunks.append(' '.join(current_chunk))
            current_chunk = [para]
            current_word_count = para_words
        else:
            current_chunk.append(para)
            current_word_count += para_words

    # Don't forget the last chunk
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks


def _split_long_paragraph(paragraph: str, limit: int) -> List[str]:
    """Split one oversized paragraph into pieces of at most *limit* words."""
    pieces: List[str] = []
    pending: List[str] = []
    pending_words = 0

    # Sentence boundary = whitespace that follows '.', '!' or '?'.
    for sentence in re.split(r'(?<=[.!?])\s+', paragraph):
        words = sentence.split()

        if len(words) > limit:
            # The sentence alone is too long (e.g. unpunctuated PDF text):
            # fall back to fixed-size word windows.
            if pending:
                pieces.append(' '.join(pending))
            windows = [words[i:i + limit] for i in range(0, len(words), limit)]
            pieces.extend(' '.join(window) for window in windows[:-1])
            # Keep the tail open so following sentences can top it up.
            pending = [' '.join(windows[-1])]
            pending_words = len(windows[-1])
        elif pending and pending_words + len(words) > limit:
            pieces.append(' '.join(pending))
            pending = [sentence]
            pending_words = len(words)
        else:
            pending.append(sentence)
            pending_words += len(words)

    if pending:
        pieces.append(' '.join(pending))

    return pieces
def preprocess_long_content(content: str) -> Tuple[str, int]:
    """
    Turn long content into a numbered, sectioned document for the LLM.

    Content of 1000 words or fewer is passed through untouched. Anything
    longer is chunked and rewritten as a header followed by
    "=== SECTION i OF n ===" blocks, one per chunk.

    Args:
        content: The raw content from PDF/text input

    Returns:
        Tuple of (processed_content, section_count); section_count is 0
        when the content was returned unchanged.
    """
    word_count = len(content.split())

    # Short input needs no structuring.
    if word_count <= 1000:
        return content, 0

    logger.info(f"πŸ“„ Preprocessing very long content: {word_count} words")

    # (word-count floor, chunk size) pairs, checked from largest floor down:
    # the longer the document, the smaller each chunk, giving more sections.
    size_tiers = ((5000, 120), (3000, 150), (2000, 180))
    max_words = next(
        (size for floor, size in size_tiers if word_count > floor),
        200,
    )

    chunks = chunk_content(content, max_words_per_chunk=max_words)
    section_count = len(chunks)
    logger.info(f"πŸ“„ Split into {section_count} sections (avg ~{word_count // section_count} words each)")

    # Header instructing the model, followed by a blank separator line.
    lines = [
        f"# STRUCTURED CONTENT ({section_count} SECTIONS)",
        "# YOU MUST CREATE A VOICEOVER BLOCK FOR EACH SECTION BELOW",
        f"# Video should cover ALL {section_count} sections proportionally",
        "",
    ]

    # Each section: marker line, chunk text, blank separator.
    for number, chunk in enumerate(chunks, start=1):
        lines.extend((f"=== SECTION {number} OF {section_count} ===", chunk, ""))

    return '\n'.join(lines), section_count
def get_script_mode_prompt_for_long_content(goal: str, section_count: int) -> str:
    """
    Generate the user prompt for very long (chunked) content.

    This prompt explicitly instructs the LLM to cover ALL sections
    with DETAILED, HIGH-QUALITY animations - not rushed content.

    Args:
        goal: The (structured) content to animate; embedded verbatim.
        section_count: Number of sections in the content. Clamped to the
            range 1..12 before being written into the prompt.

    Returns:
        The formatted prompt string.
    """
    max_sections = 12
    seconds_per_section = 30

    # Cap sections to a reasonable number for quality, and never go below
    # one: a count of 0 (or a negative value) would otherwise ask the model
    # for "0 DISTINCT SECTIONS" and a 0-second video.
    effective_sections = max(1, min(section_count, max_sections))

    return f"""Create a DETAILED animated video from this document.
CONTENT TO ANIMATE:
{goal}
CRITICAL REQUIREMENTS:
1. CREATE {effective_sections} DISTINCT SECTIONS - each with its own voiceover block
2. EACH SECTION MUST BE 20-40 SECONDS with rich animations
3. USE VARIED ANIMATIONS: FadeIn, Write, GrowFromCenter, LaggedStart, Indicate, Circumscribe
4. DO NOT RUSH - build visuals progressively in each section
5. CLEAN TRANSITIONS between sections using FadeOut before new content
6. USE THE ACTUAL TEXT from each section as voiceover content
DO NOT:
- Create only 1-2 voiceover blocks
- Rush through in 5 seconds
- Skip middle content
- Use only Write() for everything
VIDEO DURATION: Approximately {effective_sections * seconds_per_section} seconds total
Each section should have:
- A title/header animation
- Multiple visual elements built progressively
- Emphasis animations (Indicate, Circumscribe)
- Clean transition to next section
"""