lsatone

Running on Zero

App Files Files Community

lsatone / backend /keyframes /keyframes_engaging.py

3v324v23

Update Comic123 with local comic folder files

83e35a7 2 months ago

raw

history blame contribute delete

9.27 kB

	"""
	Select the most engaging frames for comic generation
	Focuses on visual quality and storytelling, not showing emotion labels
	"""

	import os
	import cv2
	import srt
	import json  # Used to write frames/frame_metadata.json
	from typing import List, Dict, Tuple
	import numpy as np
	from backend.enhanced_emotion_matcher import EnhancedEmotionMatcher
	from backend.eye_state_detector import EyeStateDetector
	from backend.emotion_aware_comic import FacialExpressionAnalyzer

	def generate_keyframes_engaging(video_path: str, story_subs: List, max_frames: int = 48) -> bool:
	    """
	    Select and save the most engaging frame for each story moment.

	    For each of the first ``max_frames`` subtitles, the best-scoring frame
	    returned by ``find_most_engaging_frame`` is passed through
	    ``enhance_for_comic`` and written to ``frames/final/frame_NNN.png``.
	    When no frame could be scored, ``get_decent_frame`` is tried as a
	    fallback; if that also yields nothing, the moment is skipped.

	    Side effects:
	    - Existing ``.png`` files in ``frames/final`` are deleted first.
	    - A mapping of saved filename -> subtitle midpoint (seconds) is written
	      to ``frames/frame_metadata.json``.

	    Args:
	        video_path: Path of the video to sample frames from.
	        story_subs: Subtitle entries exposing ``content`` plus ``start`` and
	            ``end`` values that provide ``total_seconds()``.
	        max_frames: Maximum number of subtitles to process.

	    Returns:
	        True if at least one frame was saved; False if the video could not
	        be opened or no frame was saved.
	    """

	    print("🎬 Selecting most engaging frames for comic generation...")
	    print(f"📊 Processing {len(story_subs)} story moments")

	    # Initialize analyzers (used internally, not shown to user)
	    emotion_matcher = EnhancedEmotionMatcher()
	    face_analyzer = FacialExpressionAnalyzer()
	    eye_detector = EyeStateDetector()

	    # Ensure output directory exists
	    final_dir = "frames/final"
	    os.makedirs(final_dir, exist_ok=True)

	    # Clear existing frames
	    for f in os.listdir(final_dir):
	        if f.endswith('.png'):
	            os.remove(os.path.join(final_dir, f))

	    # Open video
	    cap = cv2.VideoCapture(video_path)
	    if not cap.isOpened():
	        print(f"❌ Failed to open video: {video_path}")
	        cap.release()
	        return False

	    # Track frame filename -> original timestamp
	    frame_metadata = {}
	    selected_count = 0

	    # Only these moments are processed; progress output is bounded by them.
	    moments = story_subs[:max_frames]

	    try:
	        fps = cap.get(cv2.CAP_PROP_FPS)
	        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

	        print(f"📹 Analyzing video: {fps:.1f} fps, {total_frames} frames")
	        print("🔍 Finding best frames for each story moment...")

	        for idx, sub in enumerate(moments):
	            # Don't show emotion analysis to user, just use it internally
	            text_emotions = emotion_matcher.analyze_text_emotion(sub.content)
	            target_mood = max(text_emotions.items(),
	                              key=lambda x: x[1] if x[0] != 'intensity' else 0)[0]

	            # Progress indicator (simple, not technical)
	            if idx % 5 == 0:
	                print(f" Processing moments {idx+1}-{min(idx+5, len(moments))}...")

	            # Find the most engaging frame for this moment
	            best_frame = find_most_engaging_frame(
	                cap, sub, fps,
	                face_analyzer, eye_detector,
	                target_mood, text_emotions
	            )

	            if best_frame is not None:
	                image = best_frame['image']
	            else:
	                # Fallback: get a decent frame from the middle
	                image = get_decent_frame(cap, sub, fps)
	                if image is None:
	                    continue

	            # Consistent, gap-free naming: the index only advances on success
	            filename = f"frame_{selected_count:03d}.png"
	            output_path = os.path.join(final_dir, filename)

	            # Apply any visual enhancements for comic style
	            enhanced_frame = enhance_for_comic(image)
	            if not cv2.imwrite(output_path, enhanced_frame):
	                # Do not record metadata for a file that was never written
	                print(f"⚠️ Could not write frame for moment {idx + 1}: {output_path}")
	                continue

	            # Store original timestamp (midpoint of subtitle)
	            start_seconds = sub.start.total_seconds()
	            end_seconds = sub.end.total_seconds()
	            frame_metadata[filename] = start_seconds + (end_seconds - start_seconds) / 2

	            selected_count += 1
	    finally:
	        # Release the capture even if analysis raised part-way through
	        cap.release()

	    # Save metadata for regeneration (critical for video-based regenerate)
	    with open("frames/frame_metadata.json", "w", encoding="utf-8") as f:
	        json.dump(frame_metadata, f, indent=2)

	    print(f"\n✅ Selected {selected_count} engaging frames for comic")
	    print(f"📁 Frames saved to: {final_dir}")
	    print("💾 Frame metadata saved to: frames/frame_metadata.json")

	    return selected_count > 0


	def find_most_engaging_frame(cap, subtitle, fps, face_analyzer, eye_detector,
	                             target_mood, text_emotions):
	    """
	    Find the highest-scoring frame around a subtitle's time span.

	    The search window is the subtitle interval widened by 0.5 s on each side
	    (clamped at 0). At most 15 evenly spaced frames in that window are read
	    from ``cap`` and scored with ``calculate_engagement_score``.

	    Args:
	        cap: Opened ``cv2.VideoCapture``; its read position is changed.
	        subtitle: Object whose ``start`` / ``end`` provide ``total_seconds()``.
	        fps: Frames per second used to convert times to frame indices.
	        face_analyzer, eye_detector, target_mood, text_emotions: Passed
	            through unchanged to ``calculate_engagement_score``.

	    Returns:
	        A dict with keys ``'image'`` (a copy of the frame), ``'score'`` and
	        ``'frame_num'``, or None when the window is empty or no frame in it
	        could be read.
	    """

	    # Extend search window slightly for better options
	    search_start = max(0, subtitle.start.total_seconds() - 0.5)
	    search_end = subtitle.end.total_seconds() + 0.5

	    start_frame = int(search_start * fps)
	    end_frame = int(search_end * fps)

	    span = end_frame - start_frame
	    if span <= 0:
	        # Nothing to sample (e.g. fps reported as 0)
	        return None

	    # Ceil division keeps the number of scored frames at or below the cap;
	    # floor division would allow almost twice as many for some window sizes.
	    num_samples = min(15, span)
	    frame_step = -(-span // num_samples)

	    best_frame = None
	    best_score = -1

	    for frame_num in range(start_frame, end_frame, frame_step):
	        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
	        ret, frame = cap.read()

	        if not ret or frame is None:
	            continue

	        # Calculate engagement score
	        score = calculate_engagement_score(
	            frame, face_analyzer, eye_detector,
	            target_mood, text_emotions
	        )

	        # Strict comparison: the earliest frame wins ties
	        if score > best_score:
	            best_score = score
	            best_frame = {
	                'image': frame.copy(),
	                'score': score,
	                'frame_num': frame_num
	            }

	    return best_frame


	def calculate_engagement_score(frame, face_analyzer, eye_detector,
	                               target_mood, text_emotions):
	    """
	    Calculate how engaging/suitable this frame is for the comic.

	    The frame is written to a temporary PNG because both analyzers are
	    called with a file path. The score is the sum of:
	    - Eye state: 'open' +3.0, 'partially_open' +1.5, 'unknown' +1.0,
	      anything else +0.0
	    - Mood match: +2.0 * value when ``target_mood`` is present in the
	      detected expression with a value above 0.3
	    - Expressiveness: +1.0 when any detected expression value exceeds 0.5
	    - Sharpness: +0.5 * ``calculate_sharpness(frame)``
	    - Confidence: +0.5 when the eye detector's 'confidence' exceeds 0.7

	    Args:
	        frame: Image array accepted by ``cv2.imwrite``.
	        face_analyzer: Object providing ``analyze_expression(path)``.
	        eye_detector: Object providing ``check_eyes_state(path)``.
	        target_mood: Expression key to look for in the analyzer result.
	        text_emotions: Accepted for interface compatibility; not used here.

	    Returns:
	        The accumulated score as a float.
	    """
	    import tempfile

	    score = 0.0

	    # A unique temp file avoids collisions between concurrent calls, which
	    # a fixed filename in the working directory could not guarantee.
	    fd, temp_path = tempfile.mkstemp(prefix="temp_frame_analysis_", suffix=".png")
	    os.close(fd)

	    try:
	        cv2.imwrite(temp_path, frame)

	        # 1. Eye quality (most important for comics)
	        eye_state = eye_detector.check_eyes_state(temp_path)
	        state = eye_state['state']
	        if state == 'open':
	            score += 3.0
	        elif state == 'partially_open':
	            score += 1.5
	        elif state == 'unknown':
	            score += 1.0  # No face, might be okay
	        # closed or half_closed: no bonus

	        # 2. Expression quality (internal matching)
	        face_emotions = face_analyzer.analyze_expression(temp_path)

	        # Check if expression matches mood
	        if target_mood in face_emotions and face_emotions[target_mood] > 0.3:
	            score += 2.0 * face_emotions[target_mood]

	        # General expressiveness (any strong emotion is interesting).
	        # default= keeps an empty result from raising ValueError.
	        max_emotion = max(face_emotions.values(), default=0.0)
	        if max_emotion > 0.5:
	            score += 1.0

	        # 3. Image quality
	        score += calculate_sharpness(frame) * 0.5

	        # 4. Composition (face detection confidence)
	        if eye_state.get('confidence', 0) > 0.7:
	            score += 0.5

	    finally:
	        # Clean up; the file may already be gone
	        try:
	            os.remove(temp_path)
	        except FileNotFoundError:
	            pass

	    return score


	def calculate_sharpness(frame):
	    """
	    Return a sharpness estimate for a BGR frame.

	    The value is the variance of the Laplacian of the grayscale image,
	    divided by 500 and capped at 1.0.
	    """
	    grayscale = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
	    edge_variance = cv2.Laplacian(grayscale, cv2.CV_64F).var()

	    # Scale so that a variance of 500 or more maps to the cap of 1.0
	    scaled = edge_variance / 500.0
	    return 1.0 if scaled > 1.0 else scaled


	def enhance_for_comic(frame):
	    """
	    Return a contrast-enhanced copy of a BGR frame.

	    CLAHE (clip limit 2.0, 8x8 tiles) is applied to the L channel in LAB
	    space; the other two channels are passed through unchanged.
	    """
	    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(frame, cv2.COLOR_BGR2LAB))
	    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
	    recombined = cv2.merge([equalizer.apply(lightness), chan_a, chan_b])
	    return cv2.cvtColor(recombined, cv2.COLOR_LAB2BGR)


	def get_decent_frame(cap, subtitle, fps):
	    """
	    Return the first sufficiently sharp frame from within a subtitle.

	    Frames are tried at 50%, 30%, 70%, 20% and 80% of the subtitle's
	    duration, in that order. The first one that can be read and whose
	    ``calculate_sharpness`` value exceeds 0.3 is returned; None is returned
	    when no candidate qualifies.
	    """
	    begin = subtitle.start.total_seconds()
	    span = subtitle.end.total_seconds() - begin

	    for fraction in (0.5, 0.3, 0.7, 0.2, 0.8):
	        target_index = int((begin + (span * fraction)) * fps)
	        cap.set(cv2.CAP_PROP_POS_FRAMES, target_index)
	        ok, candidate = cap.read()

	        # Skip unreadable positions and keep trying the remaining ones
	        if not ok or candidate is None:
	            continue
	        if calculate_sharpness(candidate) > 0.3:
	            return candidate

	    return None