File size: 5,751 Bytes
3ccc955
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
"""
Content Preprocessor for Long Inputs

Handles very long content (PDFs, large text) by:
1. Chunking content into logical sections
2. Numbering sections for explicit coverage
3. Ensuring proportional representation in the video
"""

import logging
import re
from typing import List, Tuple

logger = logging.getLogger(__name__)


def chunk_content(content: str, max_words_per_chunk: int = 150) -> List[str]:
    """
    Break text into chunks of roughly ``max_words_per_chunk`` words.

    Paragraphs (separated by blank lines) are the primary unit. Consecutive
    paragraphs that fit within the budget are merged into one chunk. A
    paragraph that exceeds the budget on its own is split at sentence
    boundaries instead, and never shares a chunk with its neighbours.
    A single sentence longer than the budget is kept whole, so the limit
    is a target rather than a hard cap.

    Args:
        content: The full text content
        max_words_per_chunk: Target words per chunk (will be approximate)

    Returns:
        List of content chunks, in document order
    """
    def pack(pieces):
        # Greedy grouping: start a new bucket whenever adding the next
        # piece would push a non-empty bucket over the word budget.
        packed = []
        bucket = []
        bucket_words = 0
        for piece in pieces:
            n_words = len(piece.split())
            if bucket and bucket_words + n_words > max_words_per_chunk:
                packed.append(' '.join(bucket))
                bucket, bucket_words = [], 0
            bucket.append(piece)
            bucket_words += n_words
        if bucket:
            packed.append(' '.join(bucket))
        return packed

    # Blank lines (possibly containing whitespace) delimit paragraphs.
    blocks = [raw.strip() for raw in re.split(r'\n\s*\n', content.strip())]
    blocks = [block for block in blocks if block]

    result = []
    pending = []  # run of consecutive paragraphs that each fit the budget

    for block in blocks:
        if len(block.split()) <= max_words_per_chunk:
            pending.append(block)
            continue

        # Oversize paragraph: flush whatever preceded it so ordering is
        # preserved, then pack its sentences on their own.
        result.extend(pack(pending))
        pending = []
        result.extend(pack(re.split(r'(?<=[.!?])\s+', block)))

    # Trailing run of short paragraphs.
    result.extend(pack(pending))

    return result


def preprocess_long_content(content: str) -> Tuple[str, int]:
    """
    Preprocess long content by chunking and adding section markers.

    Content of 1000 words or fewer is returned unchanged with a section
    count of 0. Longer content is split with ``chunk_content`` (using a
    smaller chunk size the longer the document is) and re-emitted as a
    short instruction header followed by ``=== SECTION i OF n ===`` blocks,
    so the LLM can be told to cover every numbered section.

    Args:
        content: The raw content from PDF/text input

    Returns:
        Tuple of (processed_content, section_count). ``section_count`` is 0
        when the content was short enough to pass through untouched.
    """
    word_count = len(content.split())

    # Short content needs no structure; 0 signals "not chunked".
    if word_count <= 1000:
        return content, 0

    logger.info("📄 Preprocessing very long content: %d words", word_count)

    # Longer content = smaller chunks, i.e. more sections, to ensure coverage.
    if word_count > 5000:
        max_words = 120
    elif word_count > 3000:
        max_words = 150
    elif word_count > 2000:
        max_words = 180
    else:
        max_words = 200

    chunks = chunk_content(content, max_words_per_chunk=max_words)
    # More than 1000 words implies at least one non-blank paragraph, so at
    # least one chunk comes back and the average below cannot divide by zero.
    section_count = len(chunks)

    logger.info(
        "📄 Split into %d sections (avg ~%d words each)",
        section_count,
        word_count // section_count,
    )

    # Header telling the model how many sections it has to cover.
    structured_parts = [
        f"# STRUCTURED CONTENT ({section_count} SECTIONS)",
        "# YOU MUST CREATE A VOICEOVER BLOCK FOR EACH SECTION BELOW",
        f"# Video should cover ALL {section_count} sections proportionally",
        "",
    ]

    # Each chunk becomes a numbered section followed by a blank separator line.
    for i, chunk in enumerate(chunks, 1):
        structured_parts.extend(
            (f"=== SECTION {i} OF {section_count} ===", chunk, "")
        )

    return '\n'.join(structured_parts), section_count


def get_script_mode_prompt_for_long_content(goal: str, section_count: int) -> str:
    """
    Generate the user prompt for very long (chunked) content.

    This prompt explicitly instructs the LLM to cover ALL sections
    with DETAILED, HIGH-QUALITY animations - not rushed content.

    Args:
        goal: The content to animate; embedded verbatim in the prompt
        section_count: Number of sections the content was split into

    Returns:
        The prompt text. The number of sections requested is
        ``section_count`` clamped to the range 1..12, and the stated
        total duration is 30 seconds per requested section.
    """
    # Cap sections to a reasonable number for quality, and never ask for
    # fewer than one: a count of 0 (or a negative value) would otherwise
    # produce "CREATE 0 DISTINCT SECTIONS" and a 0-second video.
    effective_sections = max(1, min(section_count, 12))

    return f"""Create a DETAILED animated video from this document.

CONTENT TO ANIMATE:
{goal}

CRITICAL REQUIREMENTS:

1. CREATE {effective_sections} DISTINCT SECTIONS - each with its own voiceover block
2. EACH SECTION MUST BE 20-40 SECONDS with rich animations
3. USE VARIED ANIMATIONS: FadeIn, Write, GrowFromCenter, LaggedStart, Indicate, Circumscribe
4. DO NOT RUSH - build visuals progressively in each section
5. CLEAN TRANSITIONS between sections using FadeOut before new content
6. USE THE ACTUAL TEXT from each section as voiceover content

DO NOT:
- Create only 1-2 voiceover blocks
- Rush through in 5 seconds
- Skip middle content
- Use only Write() for everything

VIDEO DURATION: Approximately {effective_sections * 30} seconds total

Each section should have:
- A title/header animation
- Multiple visual elements built progressively
- Emphasis animations (Indicate, Circumscribe)
- Clean transition to next section
"""