Spaces:

aditya2001
/

VidSimplify

Running

File size: 5,751 Bytes

3ccc955

"""
Content Preprocessor for Long Inputs

Handles very long content (PDFs, large text) by:
1. Chunking content into logical sections
2. Numbering sections for explicit coverage
3. Ensuring proportional representation in the video
"""

import logging
import re
from typing import List, Tuple

logger = logging.getLogger(__name__)


def chunk_content(content: str, max_words_per_chunk: int = 150) -> List[str]:
    """
    Split content into logical chunks based on paragraphs and sentences.

    Paragraphs (separated by blank lines) are packed greedily into chunks.
    A paragraph that exceeds the limit on its own is split at sentence
    boundaries (whitespace following ``.``, ``!`` or ``?``), and a sentence
    that still exceeds the limit is split on word boundaries so that no
    chunk is ever larger than the limit. Pieces inside a chunk are joined
    with a single space.

    Args:
        content: The full text content
        max_words_per_chunk: Maximum words per chunk. Values below 1 are
            treated as 1.

    Returns:
        List of content chunks, in document order. Empty or
        whitespace-only content yields an empty list.
    """
    # Guard against zero/negative limits, which would make the word-level
    # split below impossible.
    limit = max(1, max_words_per_chunk)

    # First, split by blank lines (paragraphs) and drop empty fragments.
    paragraphs = [
        p.strip() for p in re.split(r'\n\s*\n', content.strip()) if p.strip()
    ]

    chunks: List[str] = []
    pending: List[str] = []  # pieces of the chunk currently being built
    pending_words = 0

    def flush() -> None:
        """Commit the chunk under construction, if any."""
        nonlocal pending, pending_words
        if pending:
            chunks.append(' '.join(pending))
            pending = []
            pending_words = 0

    def add(piece: str, piece_words: int) -> None:
        """Append a piece, starting a new chunk first if it would overflow."""
        nonlocal pending_words
        if pending and pending_words + piece_words > limit:
            flush()
        pending.append(piece)
        pending_words += piece_words

    for para in paragraphs:
        para_words = len(para.split())

        if para_words <= limit:
            # Normal paragraph - pack it alongside its neighbours.
            add(para, para_words)
            continue

        # Oversized paragraph: it never shares a chunk with other
        # paragraphs, so commit whatever precedes it first.
        flush()
        for sentence in re.split(r'(?<=[.!?])\s+', para):
            words = sentence.split()
            if len(words) <= limit:
                add(sentence, len(words))
                continue
            # Sentence alone exceeds the limit (e.g. text without any
            # sentence punctuation): fall back to fixed-size word slices.
            for start in range(0, len(words), limit):
                piece = words[start:start + limit]
                add(' '.join(piece), len(piece))
        flush()

    # Don't forget the last chunk
    flush()

    return chunks


def preprocess_long_content(content: str) -> Tuple[str, int]:
    """
    Turn long content into a numbered, sectioned document.

    Content of 1000 words or fewer is passed through untouched with a
    section count of 0. Longer content is chunked (smaller chunks for
    longer inputs) and each chunk is wrapped in a
    ``=== SECTION i OF n ===`` marker beneath a three-line instruction
    header.

    Args:
        content: The raw content from PDF/text input

    Returns:
        Tuple of (processed_content, section_count); section_count is 0
        when the content was returned unchanged.
    """
    total_words = len(content.split())

    # Short inputs need no restructuring.
    if total_words <= 1000:
        return content, 0

    logger.info(f"📄 Preprocessing very long content: {total_words} words")

    # The longer the document, the smaller each chunk, so that more
    # sections are produced. Thresholds are checked from largest down.
    chunk_limit = 200
    for threshold, limit in ((5000, 120), (3000, 150), (2000, 180)):
        if total_words > threshold:
            chunk_limit = limit
            break

    sections = chunk_content(content, max_words_per_chunk=chunk_limit)
    section_count = len(sections)

    logger.info(f"📄 Split into {section_count} sections (avg ~{total_words // section_count} words each)")

    # Instruction header, followed by a blank separator line.
    lines = [
        f"# STRUCTURED CONTENT ({section_count} SECTIONS)",
        "# YOU MUST CREATE A VOICEOVER BLOCK FOR EACH SECTION BELOW",
        f"# Video should cover ALL {section_count} sections proportionally",
        "",
    ]

    # Each section: marker line, body, blank separator.
    for index, section in enumerate(sections, start=1):
        lines.extend((f"=== SECTION {index} OF {section_count} ===", section, ""))

    return '\n'.join(lines), section_count


def get_script_mode_prompt_for_long_content(goal: str, section_count: int) -> str:
    """
    Generate the user prompt for very long (chunked) content.

    This prompt explicitly instructs the LLM to cover ALL sections
    with DETAILED, HIGH-QUALITY animations - not rushed content.

    Args:
        goal: The content to animate; inserted verbatim into the prompt.
        section_count: Number of sections the content was split into.

    Returns:
        The formatted prompt. The number of requested sections is
        ``section_count`` clamped to the range 1..12, and the stated
        total duration is 30 seconds per requested section.
    """
    # Cap sections to a reasonable number for quality, and never ask for
    # fewer than one: a section_count of 0 or less would otherwise yield
    # "CREATE 0 DISTINCT SECTIONS" and a zero/negative duration.
    # NOTE: when section_count exceeds the cap, the prompt requests fewer
    # voiceover blocks than there are sections in the content.
    effective_sections = max(1, min(section_count, 12))

    return f"""Create a DETAILED animated video from this document.

CONTENT TO ANIMATE:
{goal}

CRITICAL REQUIREMENTS:

1. CREATE {effective_sections} DISTINCT SECTIONS - each with its own voiceover block
2. EACH SECTION MUST BE 20-40 SECONDS with rich animations
3. USE VARIED ANIMATIONS: FadeIn, Write, GrowFromCenter, LaggedStart, Indicate, Circumscribe
4. DO NOT RUSH - build visuals progressively in each section
5. CLEAN TRANSITIONS between sections using FadeOut before new content
6. USE THE ACTUAL TEXT from each section as voiceover content

DO NOT:
- Create only 1-2 voiceover blocks
- Rush through in 5 seconds
- Skip middle content
- Use only Write() for everything

VIDEO DURATION: Approximately {effective_sections * 30} seconds total

Each section should have:
- A title/header animation
- Multiple visual elements built progressively
- Emphasis animations (Indicate, Circumscribe)
- Clean transition to next section
"""