|
|
""" |
|
|
Select the most engaging frames for comic generation |
|
|
Focuses on visual quality and storytelling, not showing emotion labels |
|
|
""" |
|
|
|
|
|
import os |
|
|
import cv2 |
|
|
import srt |
|
|
import json |
|
|
from typing import List, Dict, Tuple |
|
|
import numpy as np |
|
|
from backend.enhanced_emotion_matcher import EnhancedEmotionMatcher |
|
|
from backend.eye_state_detector import EyeStateDetector |
|
|
from backend.emotion_aware_comic import FacialExpressionAnalyzer |
|
|
|
|
|
def generate_keyframes_engaging(video_path: str, story_subs: List, max_frames: int = 48) -> bool:
    """
    Select the most engaging frames for comic generation.

    For each of the first ``max_frames`` entries of ``story_subs`` the best
    frame found by ``find_most_engaging_frame`` (or, failing that, the frame
    from ``get_decent_frame``) is contrast-enhanced and written to
    ``frames/final/frame_NNN.png``. A mapping of file name to the subtitle's
    midpoint timestamp (seconds) is written to ``frames/frame_metadata.json``.

    Criteria used when ranking frames:
    1. Facial expression matches dialogue mood
    2. Eyes are open (no blinking)
    3. Good composition (face visible, not blurry)
    4. Dramatic/interesting moments

    Args:
        video_path: Path passed to ``cv2.VideoCapture``.
        story_subs: Subtitle-like objects exposing ``content`` and
            ``start``/``end`` values with ``total_seconds()``
            (presumably ``srt.Subtitle`` -- confirm against the caller).
        max_frames: Upper bound on how many story moments are processed.

    Returns:
        ``False`` if the video cannot be opened; otherwise ``True`` when at
        least one frame was saved.
    """
    print(f"🎬 Selecting most engaging frames for comic generation...")
    print(f"📊 Processing {len(story_subs)} story moments")

    emotion_matcher = EnhancedEmotionMatcher()
    face_analyzer = FacialExpressionAnalyzer()
    eye_detector = EyeStateDetector()

    final_dir = "frames/final"
    os.makedirs(final_dir, exist_ok=True)

    # Clear PNGs left over from a previous run so numbering starts clean.
    for stale_name in os.listdir(final_dir):
        if stale_name.endswith('.png'):
            os.remove(os.path.join(final_dir, stale_name))

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"❌ Failed to open video: {video_path}")
        return False

    frame_metadata = {}
    selected_count = 0
    moments = story_subs[:max_frames]

    # try/finally guarantees the capture handle is released even when one of
    # the analyzers or an image write raises part-way through the loop.
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        print(f"📹 Analyzing video: {fps:.1f} fps, {total_frames} frames")
        print(f"🔍 Finding best frames for each story moment...")

        for idx, sub in enumerate(moments):
            text_emotions = emotion_matcher.analyze_text_emotion(sub.content)

            # 'intensity' is a magnitude, not a mood, so it is excluded from
            # the candidates; default=None avoids a ValueError when the
            # matcher reports no moods at all.
            mood_scores = {
                name: value
                for name, value in text_emotions.items()
                if name != 'intensity'
            }
            target_mood = max(mood_scores, key=mood_scores.get, default=None)

            if idx % 5 == 0:
                # Bound the range by the moments actually processed, which
                # may be fewer than len(story_subs) because of max_frames.
                print(f" Processing moments {idx+1}-{min(idx+5, len(moments))}...")

            best_frame = find_most_engaging_frame(
                cap, sub, fps,
                face_analyzer, eye_detector,
                target_mood, text_emotions
            )

            if best_frame is not None:
                image = best_frame['image']
            else:
                # No scored frame was available; fall back to a plain
                # sharpness-based pick.
                image = get_decent_frame(cap, sub, fps)

            if image is None:
                continue

            filename = f"frame_{selected_count:03d}.png"
            output_path = os.path.join(final_dir, filename)

            # cv2.imwrite signals failure through its return value; do not
            # count or record a frame that never reached the disk.
            if not cv2.imwrite(output_path, enhance_for_comic(image)):
                print(f"⚠️ Failed to write frame: {output_path}")
                continue

            # Metadata stores the midpoint of the subtitle interval, not the
            # timestamp of the frame that was actually chosen.
            start_seconds = sub.start.total_seconds()
            original_timestamp = start_seconds + (sub.end.total_seconds() - start_seconds) / 2
            frame_metadata[filename] = original_timestamp

            selected_count += 1
    finally:
        cap.release()

    with open("frames/frame_metadata.json", "w", encoding="utf-8") as f:
        json.dump(frame_metadata, f, indent=2)

    print(f"\n✅ Selected {selected_count} engaging frames for comic")
    print(f"📁 Frames saved to: {final_dir}")
    print(f"💾 Frame metadata saved to: frames/frame_metadata.json")

    return selected_count > 0
|
|
|
|
|
|
|
|
def find_most_engaging_frame(cap, subtitle, fps, face_analyzer, eye_detector,
                             target_mood, text_emotions):
    """
    Find the most visually engaging frame for this subtitle.

    The subtitle interval is widened by half a second on each side, up to 15
    evenly spaced frames are read from that window, and each one is ranked
    with ``calculate_engagement_score``.

    Scoring based on:
    - Expression matching dialogue (internal, not shown)
    - Eye quality (open, alert)
    - Visual composition
    - Sharpness/clarity

    Args:
        cap: Opened ``cv2.VideoCapture``; its read position is moved.
        subtitle: Object whose ``start``/``end`` provide ``total_seconds()``.
        fps: Frames per second used to convert seconds to frame indices.
        face_analyzer: Forwarded to ``calculate_engagement_score``.
        eye_detector: Forwarded to ``calculate_engagement_score``.
        target_mood: Forwarded to ``calculate_engagement_score``.
        text_emotions: Forwarded to ``calculate_engagement_score``.

    Returns:
        ``{'image': ndarray copy, 'score': float, 'frame_num': int}`` for the
        highest-scoring readable frame, or ``None`` when the window is empty
        or no frame could be read.
    """
    # Look slightly outside the subtitle so reactions just before/after the
    # line are candidates too.
    search_start = max(0, subtitle.start.total_seconds() - 0.5)
    search_end = subtitle.end.total_seconds() + 0.5

    start_frame = int(search_start * fps)
    end_frame = int(search_end * fps)

    span = end_frame - start_frame
    if span <= 0:
        # Nothing to sample (e.g. fps reported as 0).
        return None

    # Scoring is expensive, so never evaluate more than 15 frames. Positions
    # are computed per sample rather than with a floor-divided step, which
    # could otherwise yield almost twice the intended number of samples.
    num_samples = min(15, span)

    best_frame = None
    # Starts below zero so the first readable frame is always accepted
    # (scores are presumed non-negative).
    best_score = -1

    for sample_idx in range(num_samples):
        frame_num = start_frame + (sample_idx * span) // num_samples

        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()

        if not ret or frame is None:
            continue

        score = calculate_engagement_score(
            frame, face_analyzer, eye_detector,
            target_mood, text_emotions
        )

        if score > best_score:
            best_score = score
            best_frame = {
                # Copy so the stored image does not alias a buffer that a
                # later read might reuse.
                'image': frame.copy(),
                'score': score,
                'frame_num': frame_num
            }

    return best_frame
|
|
|
|
|
|
|
|
def calculate_engagement_score(frame, face_analyzer, eye_detector,
                               target_mood, text_emotions):
    """
    Calculate how engaging/suitable this frame is for the comic.

    The frame is written to a temporary PNG because both analyzers are called
    with a file path. The score is the sum of:
    - eye state: 3.0 open, 1.5 partially open, 1.0 unknown, 0 otherwise
    - 2.0 * strength of ``target_mood`` on the face when that exceeds 0.3
    - 1.0 when any facial emotion exceeds 0.5
    - 0.5 * normalized sharpness
    - 0.5 when the eye detector's confidence exceeds 0.7

    Args:
        frame: Image array as returned by ``cv2.VideoCapture.read``.
        face_analyzer: Object providing ``analyze_expression(path)``; the
            result is used as a mapping of emotion name to strength.
        eye_detector: Object providing ``check_eyes_state(path)``; the result
            is used as a mapping with ``'state'`` and ``'confidence'``.
        target_mood: Emotion name expected from the dialogue.
        text_emotions: Currently unused; kept for interface compatibility.

    Returns:
        Non-negative float; higher means a better comic panel candidate.
    """
    import tempfile

    eye_state_points = {
        'open': 3.0,
        'partially_open': 1.5,
        'unknown': 1.0,
    }

    score = 0.0

    # A unique temp file avoids clobbering when several runs share a working
    # directory. mkstemp returns an open descriptor, which is closed right
    # away so OpenCV can write to the path.
    fd, temp_path = tempfile.mkstemp(prefix="frame_analysis_", suffix=".png")
    os.close(fd)

    try:
        cv2.imwrite(temp_path, frame)

        eye_state = eye_detector.check_eyes_state(temp_path)
        # Any state not listed (e.g. closed eyes) contributes nothing.
        score += eye_state_points.get(eye_state.get('state'), 0.0)

        face_emotions = face_analyzer.analyze_expression(temp_path)

        # Reward faces that visibly show the mood of the dialogue.
        if target_mood in face_emotions and face_emotions[target_mood] > 0.3:
            score += 2.0 * face_emotions[target_mood]

        # Reward any strong expression; default covers an empty result
        # (presumably returned when no face is found -- confirm).
        if max(face_emotions.values(), default=0.0) > 0.5:
            score += 1.0

        score += calculate_sharpness(frame) * 0.5

        if eye_state.get('confidence', 0) > 0.7:
            score += 0.5
    finally:
        # Always remove the scratch image, even if an analyzer raised.
        if os.path.exists(temp_path):
            os.remove(temp_path)

    return score
|
|
|
|
|
|
|
|
def calculate_sharpness(frame):
    """
    Estimate image sharpness from the variance of its Laplacian.

    The frame is converted with ``cv2.COLOR_BGR2GRAY``, the Laplacian is taken
    in 64-bit float, and its variance is divided by 500 and capped at 1.0.

    Returns:
        A value in ``[0, 1]``; larger means sharper.
    """
    grayscale = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    edge_variance = cv2.Laplacian(grayscale, cv2.CV_64F).var()

    # A variance of 500 or more is treated as fully sharp.
    return min(edge_variance / 500.0, 1.0)
|
|
|
|
|
|
|
|
def enhance_for_comic(frame):
    """
    Boost local contrast of a frame to make it more comic-like.

    CLAHE (clip limit 2.0, 8x8 tiles) is applied to the lightness channel in
    LAB space; the two colour channels are passed through unchanged.

    Returns:
        The enhanced image converted back with ``cv2.COLOR_LAB2BGR``.
    """
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(frame, cv2.COLOR_BGR2LAB))

    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    boosted_lab = cv2.merge([equalizer.apply(lightness), chan_a, chan_b])

    return cv2.cvtColor(boosted_lab, cv2.COLOR_LAB2BGR)
|
|
|
|
|
|
|
|
def get_decent_frame(cap, subtitle, fps):
    """
    Get a decent fallback frame for a subtitle.

    Frames at 50%, 30%, 70%, 20% and 80% of the subtitle's duration are tried
    in that order; the first readable one whose ``calculate_sharpness`` value
    exceeds 0.3 is returned.

    Returns:
        The frame array, or ``None`` if no probed position qualifies.
    """
    begin = subtitle.start.total_seconds()
    span = subtitle.end.total_seconds() - begin

    # Middle first, then progressively further out on alternating sides.
    for fraction in (0.5, 0.3, 0.7, 0.2, 0.8):
        target_index = int((begin + (span * fraction)) * fps)
        cap.set(cv2.CAP_PROP_POS_FRAMES, target_index)

        ok, candidate = cap.read()
        if not ok or candidate is None:
            continue

        if calculate_sharpness(candidate) > 0.3:
            return candidate

    return None
|
|
|