Spaces:
Running
Running
"""
Content Preprocessor for Long Inputs

Handles very long content (PDFs, large text) by:
1. Chunking content into logical sections
2. Numbering sections for explicit coverage
3. Ensuring proportional representation in the video
"""
import logging
import re
from typing import List, Tuple

# Module-level logger, named after this module so its records can be
# filtered/configured per module by the application's logging setup.
logger = logging.getLogger(__name__)
def _split_into_units(paragraph: str, limit: int) -> List[Tuple[str, int]]:
    """Break a paragraph into ``(text, word_count)`` units of at most *limit* words.

    The paragraph is first split after sentence-ending punctuation
    (``.``, ``!`` or ``?`` followed by whitespace). A sentence that still
    exceeds *limit* (for example extracted text with no punctuation at all)
    is hard-wrapped on word boundaries so that no unit is ever over the limit.
    """
    units: List[Tuple[str, int]] = []
    for sentence in re.split(r'(?<=[.!?])\s+', paragraph):
        words = sentence.split()
        if len(words) <= limit:
            # Fits as-is: keep the sentence text untouched.
            units.append((sentence, len(words)))
            continue
        # Oversized sentence: cut it into consecutive runs of `limit` words.
        for start in range(0, len(words), limit):
            piece = words[start:start + limit]
            units.append((' '.join(piece), len(piece)))
    return units


def _pack_units(units: List[Tuple[str, int]], limit: int) -> List[str]:
    """Greedily merge consecutive units into chunks of at most *limit* words."""
    packed: List[str] = []
    pending: List[str] = []
    pending_words = 0
    for text, word_count in units:
        # Adding this unit would overflow the chunk being built: commit it first.
        if pending and pending_words + word_count > limit:
            packed.append(' '.join(pending))
            pending, pending_words = [], 0
        pending.append(text)
        pending_words += word_count
    if pending:
        packed.append(' '.join(pending))
    return packed


def chunk_content(content: str, max_words_per_chunk: int = 150) -> List[str]:
    """
    Split content into logical chunks based on paragraphs and sentences.

    Paragraphs (separated by blank lines) are merged greedily, joined by a
    single space, until adding the next one would exceed the word limit.
    A paragraph that is itself over the limit is split by sentences, and a
    sentence that is itself over the limit is split on word boundaries, so
    every returned chunk has at most ``max_words_per_chunk`` words.

    Args:
        content: The full text content
        max_words_per_chunk: Upper bound on words per chunk. Values below 1
            are treated as 1.

    Returns:
        List of content chunks in document order; empty if ``content``
        contains no non-whitespace text.
    """
    # A non-positive limit cannot be honoured; clamp so word-wrapping always
    # advances by at least one word.
    limit = max(1, max_words_per_chunk)

    # First, split by blank lines (paragraphs) and drop empty fragments.
    paragraphs = [
        stripped
        for stripped in (p.strip() for p in re.split(r'\n\s*\n', content.strip()))
        if stripped
    ]

    chunks: List[str] = []
    pending: List[str] = []
    pending_words = 0

    for para in paragraphs:
        para_words = len(para.split())

        if para_words > limit:
            # Commit whatever was accumulated so document order is preserved.
            if pending:
                chunks.append(' '.join(pending))
                pending, pending_words = [], 0
            # Oversized paragraph: split by sentences (and words if needed),
            # then re-pack those units into chunks of its own.
            chunks.extend(_pack_units(_split_into_units(para, limit), limit))
            continue

        # Normal paragraph: start a new chunk only if this one would overflow.
        if pending and pending_words + para_words > limit:
            chunks.append(' '.join(pending))
            pending, pending_words = [], 0
        pending.append(para)
        pending_words += para_words

    # Don't forget the last chunk
    if pending:
        chunks.append(' '.join(pending))

    return chunks
def preprocess_long_content(content: str) -> Tuple[str, int]:
    """
    Turn long content into a numbered, sectioned document for the LLM.

    Content of 1000 words or fewer is returned untouched with a section
    count of 0. Longer content is split with ``chunk_content`` (using a
    smaller chunk size the longer the input is) and re-assembled under a
    three-line header followed by one ``=== SECTION i OF n ===`` marker per chunk.

    Args:
        content: The raw content from PDF/text input

    Returns:
        Tuple of (processed_content, section_count)
    """
    total_words = len(content.split())

    # Short inputs need no restructuring.
    if total_words <= 1000:
        return content, 0

    # NOTE(review): the leading "π" in the two log messages below looks like a
    # mis-decoded emoji; kept verbatim - confirm against the original file.
    logger.info(f"π Preprocessing very long content: {total_words} words")

    # (word-count floor, chunk size) pairs, checked from largest floor down:
    # the longer the document, the smaller the chunks, hence more sections.
    size_tiers = ((5000, 120), (3000, 150), (2000, 180))
    chunk_size = next(
        (size for floor, size in size_tiers if total_words > floor),
        200,
    )

    sections = chunk_content(content, max_words_per_chunk=chunk_size)
    section_count = len(sections)
    logger.info(f"π Split into {section_count} sections (avg ~{total_words // section_count} words each)")

    # Header block telling the LLM how many sections it has to cover.
    lines = [
        f"# STRUCTURED CONTENT ({section_count} SECTIONS)",
        "# YOU MUST CREATE A VOICEOVER BLOCK FOR EACH SECTION BELOW",
        f"# Video should cover ALL {section_count} sections proportionally",
        "",
    ]
    # Each section: marker line, the chunk text, then a blank separator.
    for number, text in enumerate(sections, start=1):
        lines.extend((f"=== SECTION {number} OF {section_count} ===", text, ""))

    return '\n'.join(lines), section_count
def get_script_mode_prompt_for_long_content(goal: str, section_count: int) -> str:
    """
    Build the user prompt used for very long (chunked) content.

    The prompt tells the LLM to produce one voiceover block per section,
    with detailed animations, and states an approximate total duration.

    Args:
        goal: Text inserted verbatim under the "CONTENT TO ANIMATE:" heading.
        section_count: Number of sections in the content; the number the
            prompt asks for is capped at 12.

    Returns:
        The prompt text, terminated by a newline.
    """
    # Never request more than 12 sections, however many the content has.
    requested_sections = min(section_count, 12)
    # The stated duration budgets 30 seconds per requested section.
    total_seconds = requested_sections * 30

    prompt_lines = (
        "Create a DETAILED animated video from this document.",
        "CONTENT TO ANIMATE:",
        f"{goal}",
        "CRITICAL REQUIREMENTS:",
        f"1. CREATE {requested_sections} DISTINCT SECTIONS - each with its own voiceover block",
        "2. EACH SECTION MUST BE 20-40 SECONDS with rich animations",
        "3. USE VARIED ANIMATIONS: FadeIn, Write, GrowFromCenter, LaggedStart, Indicate, Circumscribe",
        "4. DO NOT RUSH - build visuals progressively in each section",
        "5. CLEAN TRANSITIONS between sections using FadeOut before new content",
        "6. USE THE ACTUAL TEXT from each section as voiceover content",
        "DO NOT:",
        "- Create only 1-2 voiceover blocks",
        "- Rush through in 5 seconds",
        "- Skip middle content",
        "- Use only Write() for everything",
        f"VIDEO DURATION: Approximately {total_seconds} seconds total",
        "Each section should have:",
        "- A title/header animation",
        "- Multiple visual elements built progressively",
        "- Emphasis animations (Indicate, Circumscribe)",
        "- Clean transition to next section",
    )
    # Trailing newline matches the template's closing line break.
    return "\n".join(prompt_lines) + "\n"