""" Content Preprocessor for Long Inputs Handles very long content (PDFs, large text) by: 1. Chunking content into logical sections 2. Numbering sections for explicit coverage 3. Ensuring proportional representation in the video """ import logging import re from typing import List, Tuple logger = logging.getLogger(__name__) def chunk_content(content: str, max_words_per_chunk: int = 150) -> List[str]: """ Split content into logical chunks based on paragraphs and sentences. Args: content: The full text content max_words_per_chunk: Target words per chunk (will be approximate) Returns: List of content chunks """ # First, split by double newlines (paragraphs) paragraphs = re.split(r'\n\s*\n', content.strip()) paragraphs = [p.strip() for p in paragraphs if p.strip()] chunks = [] current_chunk = [] current_word_count = 0 for para in paragraphs: para_words = len(para.split()) # If paragraph itself is too long, split by sentences if para_words > max_words_per_chunk: # Commit current chunk first if current_chunk: chunks.append(' '.join(current_chunk)) current_chunk = [] current_word_count = 0 # Split paragraph by sentences sentences = re.split(r'(?<=[.!?])\s+', para) temp_chunk = [] temp_count = 0 for sentence in sentences: sent_words = len(sentence.split()) if temp_count + sent_words > max_words_per_chunk and temp_chunk: chunks.append(' '.join(temp_chunk)) temp_chunk = [sentence] temp_count = sent_words else: temp_chunk.append(sentence) temp_count += sent_words if temp_chunk: chunks.append(' '.join(temp_chunk)) else: # Normal paragraph - add to current chunk if current_word_count + para_words > max_words_per_chunk and current_chunk: chunks.append(' '.join(current_chunk)) current_chunk = [para] current_word_count = para_words else: current_chunk.append(para) current_word_count += para_words # Don't forget the last chunk if current_chunk: chunks.append(' '.join(current_chunk)) return chunks def preprocess_long_content(content: str) -> Tuple[str, int]: """ Preprocess long content by chunking and adding section markers. For very long content (>1000 words), this creates a structured format with numbered sections that the LLM MUST cover proportionally. Args: content: The raw content from PDF/text input Returns: Tuple of (processed_content, section_count) """ word_count = len(content.split()) # For shorter content, return as-is if word_count <= 1000: return content, 0 logger.info(f"📄 Preprocessing very long content: {word_count} words") # Calculate appropriate chunk size based on content length # Longer content = smaller chunks to ensure coverage if word_count > 5000: max_words = 120 # Very long - more sections elif word_count > 3000: max_words = 150 elif word_count > 2000: max_words = 180 else: max_words = 200 chunks = chunk_content(content, max_words_per_chunk=max_words) section_count = len(chunks) logger.info(f"📄 Split into {section_count} sections (avg ~{word_count // section_count} words each)") # Create structured content with numbered sections structured_parts = [] structured_parts.append(f"# STRUCTURED CONTENT ({section_count} SECTIONS)") structured_parts.append(f"# YOU MUST CREATE A VOICEOVER BLOCK FOR EACH SECTION BELOW") structured_parts.append(f"# Video should cover ALL {section_count} sections proportionally") structured_parts.append("") for i, chunk in enumerate(chunks, 1): structured_parts.append(f"=== SECTION {i} OF {section_count} ===") structured_parts.append(chunk) structured_parts.append("") return '\n'.join(structured_parts), section_count def get_script_mode_prompt_for_long_content(goal: str, section_count: int) -> str: """ Generate the user prompt for very long (chunked) content. This prompt explicitly instructs the LLM to cover ALL sections with DETAILED, HIGH-QUALITY animations - not rushed content. """ # Cap sections to a reasonable number for quality effective_sections = min(section_count, 12) return f"""Create a DETAILED animated video from this document. CONTENT TO ANIMATE: {goal} CRITICAL REQUIREMENTS: 1. CREATE {effective_sections} DISTINCT SECTIONS - each with its own voiceover block 2. EACH SECTION MUST BE 20-40 SECONDS with rich animations 3. USE VARIED ANIMATIONS: FadeIn, Write, GrowFromCenter, LaggedStart, Indicate, Circumscribe 4. DO NOT RUSH - build visuals progressively in each section 5. CLEAN TRANSITIONS between sections using FadeOut before new content 6. USE THE ACTUAL TEXT from each section as voiceover content DO NOT: - Create only 1-2 voiceover blocks - Rush through in 5 seconds - Skip middle content - Use only Write() for everything VIDEO DURATION: Approximately {effective_sections * 30} seconds total Each section should have: - A title/header animation - Multiple visual elements built progressively - Emphasis animations (Indicate, Circumscribe) - Clean transition to next section """