Spaces:
Running
Running
File size: 5,751 Bytes
3ccc955 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
"""
Content Preprocessor for Long Inputs
Handles very long content (PDFs, large text) by:
1. Chunking content into logical sections
2. Numbering sections for explicit coverage
3. Ensuring proportional representation in the video
"""
import logging
import re
from typing import List, Tuple
logger = logging.getLogger(__name__)
def chunk_content(content: str, max_words_per_chunk: int = 150) -> List[str]:
    """
    Break *content* into roughly even chunks, respecting paragraph and
    sentence boundaries.

    Paragraphs (separated by blank lines) are packed into chunks until the
    word budget is reached; a paragraph that is itself over budget is split
    at sentence boundaries instead.

    Args:
        content: The full text content
        max_words_per_chunk: Target words per chunk (will be approximate)

    Returns:
        List of content chunks
    """
    paragraphs = [p.strip() for p in re.split(r'\n\s*\n', content.strip()) if p.strip()]

    chunks: List[str] = []
    pending: List[str] = []
    pending_words = 0

    def flush() -> None:
        # Commit whatever has accumulated so far as one chunk.
        nonlocal pending, pending_words
        if pending:
            chunks.append(' '.join(pending))
            pending = []
            pending_words = 0

    for para in paragraphs:
        n_words = len(para.split())
        if n_words > max_words_per_chunk:
            # Oversized paragraph: close the running chunk, then pack this
            # paragraph's sentences into chunks of their own. Leftover
            # sentences become a final (possibly short) chunk rather than
            # carrying over into the next paragraph.
            flush()
            for sentence in re.split(r'(?<=[.!?])\s+', para):
                n_sent = len(sentence.split())
                if pending and pending_words + n_sent > max_words_per_chunk:
                    flush()
                pending.append(sentence)
                pending_words += n_sent
            flush()
        else:
            # Normal paragraph: start a new chunk only when adding it
            # would overflow a non-empty chunk.
            if pending and pending_words + n_words > max_words_per_chunk:
                flush()
            pending.append(para)
            pending_words += n_words

    flush()
    return chunks
def preprocess_long_content(content: str) -> Tuple[str, int]:
    """
    Preprocess long content by chunking and adding section markers.

    For very long content (>1000 words), this creates a structured format
    with numbered sections that the LLM MUST cover proportionally.

    Args:
        content: The raw content from PDF/text input

    Returns:
        Tuple of (processed_content, section_count); section_count is 0
        when the content was short enough to pass through unchanged.
    """
    word_count = len(content.split())

    # Short content needs no restructuring.
    if word_count <= 1000:
        return content, 0

    # Bug fix: the original log prefix was mojibake ("π"), residue of a
    # mis-encoded emoji; replaced with a plain ASCII tag. Lazy %-style
    # arguments avoid string formatting when INFO logging is disabled.
    logger.info("[preprocess] Preprocessing very long content: %d words", word_count)

    # Calculate appropriate chunk size based on content length.
    # Longer content => smaller chunks to ensure coverage.
    if word_count > 5000:
        max_words = 120  # Very long - more sections
    elif word_count > 3000:
        max_words = 150
    elif word_count > 2000:
        max_words = 180
    else:
        max_words = 200

    chunks = chunk_content(content, max_words_per_chunk=max_words)
    section_count = len(chunks)
    # section_count >= 1 here: >1000 words guarantees at least one chunk.
    logger.info(
        "[preprocess] Split into %d sections (avg ~%d words each)",
        section_count,
        word_count // section_count,
    )

    # Emit numbered sections under an instruction header the LLM must follow.
    structured_parts = [
        f"# STRUCTURED CONTENT ({section_count} SECTIONS)",
        "# YOU MUST CREATE A VOICEOVER BLOCK FOR EACH SECTION BELOW",
        f"# Video should cover ALL {section_count} sections proportionally",
        "",
    ]
    for i, chunk in enumerate(chunks, 1):
        structured_parts.append(f"=== SECTION {i} OF {section_count} ===")
        structured_parts.append(chunk)
        structured_parts.append("")
    return '\n'.join(structured_parts), section_count
def get_script_mode_prompt_for_long_content(goal: str, section_count: int) -> str:
    """
    Build the user prompt used for very long (chunked) content.

    The prompt instructs the LLM to cover every numbered section with
    detailed, high-quality animations instead of rushing through.
    """
    # Quality degrades past ~12 sections, so clamp the request.
    capped = section_count if section_count < 12 else 12
    return f"""Create a DETAILED animated video from this document.
CONTENT TO ANIMATE:
{goal}
CRITICAL REQUIREMENTS:
1. CREATE {capped} DISTINCT SECTIONS - each with its own voiceover block
2. EACH SECTION MUST BE 20-40 SECONDS with rich animations
3. USE VARIED ANIMATIONS: FadeIn, Write, GrowFromCenter, LaggedStart, Indicate, Circumscribe
4. DO NOT RUSH - build visuals progressively in each section
5. CLEAN TRANSITIONS between sections using FadeOut before new content
6. USE THE ACTUAL TEXT from each section as voiceover content
DO NOT:
- Create only 1-2 voiceover blocks
- Rush through in 5 seconds
- Skip middle content
- Use only Write() for everything
VIDEO DURATION: Approximately {capped * 30} seconds total
Each section should have:
- A title/header animation
- Multiple visual elements built progressively
- Emphasis animations (Indicate, Circumscribe)
- Clean transition to next section
"""
|