# backend/keyframes/keyframes_engaging.py
"""
Select the most engaging frames for comic generation
Focuses on visual quality and storytelling; emotion labels are used internally and never shown to the user
"""
import os
import cv2
import srt
import json
from typing import List
import numpy as np
from backend.enhanced_emotion_matcher import EnhancedEmotionMatcher
from backend.eye_state_detector import EyeStateDetector
from backend.emotion_aware_comic import FacialExpressionAnalyzer
def generate_keyframes_engaging(video_path: str, story_subs: List, max_frames: int = 48):
"""
Select the most engaging frames for comic generation
Criteria:
1. Facial expression matches dialogue mood
2. Eyes are open (no blinking)
3. Good composition (face visible, not blurry)
4. Dramatic/interesting moments
"""
print(f"🎬 Selecting most engaging frames for comic generation...")
print(f"📊 Processing {len(story_subs)} story moments")
# Initialize analyzers (used internally, not shown to user)
emotion_matcher = EnhancedEmotionMatcher()
face_analyzer = FacialExpressionAnalyzer()
eye_detector = EyeStateDetector()
# Ensure output directory exists
final_dir = "frames/final"
os.makedirs(final_dir, exist_ok=True)
# Clear existing frames
for f in os.listdir(final_dir):
if f.endswith('.png'):
os.remove(os.path.join(final_dir, f))
# Open video
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
print(f"❌ Failed to open video: {video_path}")
return False
fps = cap.get(cv2.CAP_PROP_FPS)
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
print(f"📹 Analyzing video: {fps:.1f} fps, {total_frames} frames")
print(f"🔍 Finding best frames for each story moment...")
# Track frame filename -> original timestamp
frame_metadata = {}
# Process each subtitle
selected_count = 0
for idx, sub in enumerate(story_subs[:max_frames]):
        # Analyze the dialogue's emotion internally; labels are never shown to the user
        text_emotions = emotion_matcher.analyze_text_emotion(sub.content)
        # Dominant emotion, skipping the 'intensity' entry by zeroing its sort key
        target_mood = max(text_emotions.items(),
                          key=lambda x: x[1] if x[0] != 'intensity' else 0)[0]
# Progress indicator (simple, not technical)
        if idx % 5 == 0:
            batch_end = min(idx + 5, len(story_subs), max_frames)
            print(f"   Processing moments {idx+1}-{batch_end}...")
# Find the most engaging frame for this moment
best_frame = find_most_engaging_frame(
cap, sub, fps,
face_analyzer, eye_detector,
target_mood, text_emotions
)
if best_frame is not None:
# Save the selected frame with consistent naming
filename = f"frame_{selected_count:03d}.png"
output_path = os.path.join(final_dir, filename)
# Apply any visual enhancements for comic style
enhanced_frame = enhance_for_comic(best_frame['image'])
cv2.imwrite(output_path, enhanced_frame)
            # Store the original timestamp (midpoint of the subtitle window)
            original_timestamp = (sub.start.total_seconds() + sub.end.total_seconds()) / 2
frame_metadata[filename] = original_timestamp
selected_count += 1
else:
# Fallback: get a decent frame from the middle
fallback_frame = get_decent_frame(cap, sub, fps)
if fallback_frame is not None:
filename = f"frame_{selected_count:03d}.png"
output_path = os.path.join(final_dir, filename)
enhanced_frame = enhance_for_comic(fallback_frame)
cv2.imwrite(output_path, enhanced_frame)
                # Store fallback timestamp (same midpoint rule)
                original_timestamp = (sub.start.total_seconds() + sub.end.total_seconds()) / 2
frame_metadata[filename] = original_timestamp
selected_count += 1
cap.release()
# Save metadata for regeneration (critical for video-based regenerate)
with open("frames/frame_metadata.json", "w") as f:
json.dump(frame_metadata, f, indent=2)
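    # The JSON maps each saved frame to the timestamp (in seconds) it came
    # from, e.g. {"frame_000.png": 12.5, "frame_001.png": 17.0} (values here
    # are illustrative); the regeneration step reads this to re-seek the video.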
print(f"\n✅ Selected {selected_count} engaging frames for comic")
print(f"📁 Frames saved to: {final_dir}")
print(f"💾 Frame metadata saved to: frames/frame_metadata.json")
return selected_count > 0
def find_most_engaging_frame(cap, subtitle, fps, face_analyzer, eye_detector,
target_mood, text_emotions):
"""
Find the most visually engaging frame for this subtitle
Scoring based on:
- Expression matching dialogue (internal, not shown)
- Eye quality (open, alert)
- Visual composition
- Sharpness/clarity
"""
    # Time window to search
    start_time = subtitle.start.total_seconds()
    end_time = subtitle.end.total_seconds()
# Extend search window slightly for better options
search_start = max(0, start_time - 0.5)
search_end = end_time + 0.5
start_frame = int(search_start * fps)
end_frame = int(search_end * fps)
    # Sample up to 15 evenly spaced frames across the window
    window = end_frame - start_frame
    num_samples = max(1, min(15, window))
    frame_step = max(1, window // num_samples)
best_frame = None
best_score = -1
for frame_num in range(start_frame, end_frame, frame_step):
cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
ret, frame = cap.read()
if not ret or frame is None:
continue
# Calculate engagement score
score = calculate_engagement_score(
frame, face_analyzer, eye_detector,
target_mood, text_emotions
)
if score > best_score:
best_score = score
best_frame = {
'image': frame.copy(),
'score': score,
'frame_num': frame_num
}
return best_frame
def calculate_engagement_score(frame, face_analyzer, eye_detector,
target_mood, text_emotions):
"""
Calculate how engaging/suitable this frame is for the comic
High scores for:
- Good facial expressions
- Open eyes
- Clear image
- Good composition
"""
score = 0.0
    # The analyzers take file paths, so write the frame to a temporary image
    temp_path = "temp_frame_analysis.png"
    cv2.imwrite(temp_path, frame)
try:
# 1. Eye quality (most important for comics)
eye_state = eye_detector.check_eyes_state(temp_path)
if eye_state['state'] == 'open':
score += 3.0
elif eye_state['state'] == 'partially_open':
score += 1.5
elif eye_state['state'] == 'unknown':
score += 1.0 # No face, might be okay
        else:  # closed or half_closed
            score += 0.0  # no reward; closed eyes lose the full 3-point bonus
# 2. Expression quality (internal matching)
face_emotions = face_analyzer.analyze_expression(temp_path)
# Check if expression matches mood
if target_mood in face_emotions and face_emotions[target_mood] > 0.3:
score += 2.0 * face_emotions[target_mood]
        # General expressiveness (any strong emotion is interesting);
        # default=0.0 guards against an empty emotion dict (no face found)
        max_emotion = max(face_emotions.values(), default=0.0)
if max_emotion > 0.5:
score += 1.0
# 3. Image quality
sharpness = calculate_sharpness(frame)
score += sharpness * 0.5
# 4. Composition (face detection confidence)
if eye_state.get('confidence', 0) > 0.7:
score += 0.5
finally:
# Clean up
if os.path.exists(temp_path):
os.remove(temp_path)
return score
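# For reference, assuming emotion scores lie in [0, 1], the maximum score is
# about 7.0: open eyes (3.0) + mood match (up to 2.0) + expressiveness (1.0)
# + sharpness (up to 0.5) + face-detection confidence (0.5).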
def calculate_sharpness(frame):
"""Calculate image sharpness using Laplacian variance"""
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
laplacian = cv2.Laplacian(gray, cv2.CV_64F)
variance = laplacian.var()
    # Normalize to 0-1; 500 is a heuristic variance for a "fully sharp" frame
    normalized = min(variance / 500.0, 1.0)
return normalized
def enhance_for_comic(frame):
    """Apply subtle contrast enhancement (CLAHE on the lightness channel)"""
    lab = cv2.cvtColor(frame, cv2.COLOR_BGR2LAB)
    l, a, b = cv2.split(lab)
    # Contrast Limited Adaptive Histogram Equalization boosts local contrast
    # without shifting colors, since only the L channel is modified
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    l = clahe.apply(l)
    enhanced = cv2.merge([l, a, b])
    return cv2.cvtColor(enhanced, cv2.COLOR_LAB2BGR)
def get_decent_frame(cap, subtitle, fps):
"""Get a decent fallback frame"""
positions = [0.5, 0.3, 0.7, 0.2, 0.8]
duration = subtitle.end.total_seconds() - subtitle.start.total_seconds()
for pos in positions:
time_offset = subtitle.start.total_seconds() + (duration * pos)
frame_num = int(time_offset * fps)
cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
ret, frame = cap.read()
if ret and frame is not None:
if calculate_sharpness(frame) > 0.3:
return frame
return None
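if __name__ == "__main__":
    # Minimal usage sketch, not part of the pipeline: the file paths below are
    # placeholders. Subtitles are parsed with the `srt` library into the
    # Subtitle objects this module expects (`.start`, `.end`, `.content`).
    with open("subtitles.srt", "r", encoding="utf-8") as f:
        story_subs = list(srt.parse(f.read()))
    ok = generate_keyframes_engaging("video.mp4", story_subs, max_frames=48)
    print("Comic frames ready" if ok else "No frames could be selected")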