Spaces:

aditya2001
/

VidSimplify

Running

VidSimplify / manimator /utils /content_preprocessor.py

Adityahulk

adding pdf parsing logic correctly

3ccc955 13 days ago

5.75 kB

	"""
	Content Preprocessor for Long Inputs

	Handles very long content (PDFs, large text) by:
	1. Chunking content into logical sections
	2. Numbering sections for explicit coverage
	3. Ensuring proportional representation in the video
	"""

	import logging
	import re
	from typing import List, Tuple

	logger = logging.getLogger(__name__)


	def chunk_content(content: str, max_words_per_chunk: int = 150) -> List[str]:
	    """
	    Split content into logical chunks based on paragraphs and sentences.

	    Short paragraphs are packed together until adding another one would
	    exceed the limit. A paragraph longer than the limit is split into
	    sentences, and a single sentence that is still longer than the limit
	    (for example unpunctuated text extracted from a PDF) is split on word
	    boundaries so it cannot produce an arbitrarily large chunk.

	    Args:
	        content: The full text content
	        max_words_per_chunk: Target words per chunk. Chunks never exceed
	            this when it is at least 1; for values below 1 no word-level
	            splitting is done and each sentence becomes its own chunk.

	    Returns:
	        List of content chunks in document order. Empty or whitespace-only
	        content yields an empty list.
	    """
	    # Paragraphs are separated by blank lines (possibly containing spaces).
	    paragraphs = [
	        p.strip() for p in re.split(r'\n\s*\n', content.strip()) if p.strip()
	    ]

	    chunks = []
	    pending = []        # short paragraphs waiting to be emitted together
	    pending_words = 0

	    for para in paragraphs:
	        para_words = len(para.split())

	        if para_words <= max_words_per_chunk:
	            # Normal paragraph: start a new chunk only if this one would
	            # push the accumulated chunk over the limit.
	            if pending and pending_words + para_words > max_words_per_chunk:
	                chunks.append(' '.join(pending))
	                pending, pending_words = [], 0
	            pending.append(para)
	            pending_words += para_words
	            continue

	        # Oversized paragraph: emit what was accumulated first so the
	        # document order is preserved.
	        if pending:
	            chunks.append(' '.join(pending))
	            pending, pending_words = [], 0

	        group = []          # sentences (or sentence pieces) being packed
	        group_words = 0

	        for sentence in re.split(r'(?<=[.!?])\s+', para):
	            words = sentence.split()
	            if 1 <= max_words_per_chunk < len(words):
	                # A single sentence over the limit: fall back to fixed-size
	                # word windows. This normalizes whitespace inside that
	                # sentence only; shorter sentences are kept verbatim.
	                pieces = [
	                    ' '.join(words[start:start + max_words_per_chunk])
	                    for start in range(0, len(words), max_words_per_chunk)
	                ]
	            else:
	                pieces = [sentence]

	            for piece in pieces:
	                piece_words = len(piece.split())
	                if group and group_words + piece_words > max_words_per_chunk:
	                    chunks.append(' '.join(group))
	                    group, group_words = [], 0
	                group.append(piece)
	                group_words += piece_words

	        # The tail of a long paragraph is emitted on its own rather than
	        # merged with the following paragraph.
	        if group:
	            chunks.append(' '.join(group))

	    # Don't forget the last accumulated chunk
	    if pending:
	        chunks.append(' '.join(pending))

	    return chunks


	def preprocess_long_content(content: str) -> Tuple[str, int]:
	    """
	    Turn long content into a numbered, sectioned document.

	    Content of 1000 words or fewer is returned unchanged with a section
	    count of 0. Longer content is chunked (smaller chunks for longer
	    documents) and re-emitted with a header plus one
	    "=== SECTION i OF n ===" marker per chunk.

	    Args:
	        content: The raw content from PDF/text input

	    Returns:
	        Tuple of (processed_content, section_count)
	    """
	    word_count = len(content.split())

	    # Short inputs pass through untouched; 0 signals "not sectioned".
	    if word_count <= 1000:
	        return content, 0

	    logger.info(f"📄 Preprocessing very long content: {word_count} words")

	    # Longer content = smaller chunks to ensure coverage. The table is
	    # ordered from the largest threshold down; the first match wins.
	    chunk_limit = 200
	    for threshold, limit in ((5000, 120), (3000, 150), (2000, 180)):
	        if word_count > threshold:
	            chunk_limit = limit
	            break

	    sections = chunk_content(content, max_words_per_chunk=chunk_limit)
	    section_count = len(sections)

	    logger.info(f"📄 Split into {section_count} sections (avg ~{word_count // section_count} words each)")

	    # Header lines first, then a marker / body / blank line per section.
	    lines = [
	        f"# STRUCTURED CONTENT ({section_count} SECTIONS)",
	        f"# YOU MUST CREATE A VOICEOVER BLOCK FOR EACH SECTION BELOW",
	        f"# Video should cover ALL {section_count} sections proportionally",
	        "",
	    ]
	    for i, section_text in enumerate(sections, 1):
	        lines.extend((f"=== SECTION {i} OF {section_count} ===", section_text, ""))

	    return '\n'.join(lines), section_count


	def get_script_mode_prompt_for_long_content(goal: str, section_count: int) -> str:
	    """
	    Build the user prompt for very long (chunked) content.

	    The prompt embeds the content and tells the LLM how many distinct
	    sections to create and roughly how long the video should be
	    (30 seconds per section).

	    Args:
	        goal: The content to animate, inserted verbatim into the prompt
	        section_count: Number of sections in the content; values above 12
	            are capped at 12

	    Returns:
	        The formatted prompt string
	    """
	    # Cap sections to a reasonable number for quality
	    section_cap = 12
	    effective_sections = section_cap if section_count > section_cap else section_count

	    return f"""Create a DETAILED animated video from this document.

	CONTENT TO ANIMATE:
	{goal}

	CRITICAL REQUIREMENTS:

	1. CREATE {effective_sections} DISTINCT SECTIONS - each with its own voiceover block
	2. EACH SECTION MUST BE 20-40 SECONDS with rich animations
	3. USE VARIED ANIMATIONS: FadeIn, Write, GrowFromCenter, LaggedStart, Indicate, Circumscribe
	4. DO NOT RUSH - build visuals progressively in each section
	5. CLEAN TRANSITIONS between sections using FadeOut before new content
	6. USE THE ACTUAL TEXT from each section as voiceover content

	DO NOT:
	- Create only 1-2 voiceover blocks
	- Rush through in 5 seconds
	- Skip middle content
	- Use only Write() for everything

	VIDEO DURATION: Approximately {effective_sections * 30} seconds total

	Each section should have:
	- A title/header animation
	- Multiple visual elements built progressively
	- Emphasis animations (Indicate, Circumscribe)
	- Clean transition to next section
	"""