"""
Select the most engaging frames for comic generation
Focuses on visual quality and storytelling, not showing emotion labels
"""
import os
import cv2
import srt
import json # π ADD THIS LINE
from typing import List, Dict, Tuple
import numpy as np
from backend.enhanced_emotion_matcher import EnhancedEmotionMatcher
from backend.eye_state_detector import EyeStateDetector
from backend.emotion_aware_comic import FacialExpressionAnalyzer
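
# `story_subs` is expected to hold srt.Subtitle-like objects exposing
# `.content` (str) and `.start` / `.end` (datetime.timedelta). A minimal
# stand-in for testing (hypothetical helper, not part of the pipeline):
#
#     from dataclasses import dataclass
#     from datetime import timedelta
#
#     @dataclass
#     class StubSub:
#         content: str
#         start: timedelta
#         end: timedelta
#
#     subs = [StubSub("Hello!", timedelta(seconds=1), timedelta(seconds=3))]
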
def generate_keyframes_engaging(video_path: str, story_subs: List, max_frames: int = 48) -> bool:
    """
    Select the most engaging frames for comic generation.

    Criteria:
    1. Facial expression matches the dialogue mood
    2. Eyes are open (no mid-blink frames)
    3. Good composition (face visible, not blurry)
    4. Dramatic / interesting moments
    """
    print("Selecting most engaging frames for comic generation...")
    print(f"Processing {len(story_subs)} story moments")

    # Initialize analyzers (used internally, never shown to the user)
    emotion_matcher = EnhancedEmotionMatcher()
    face_analyzer = FacialExpressionAnalyzer()
    eye_detector = EyeStateDetector()

    # Ensure the output directory exists, then clear any stale frames
    final_dir = "frames/final"
    os.makedirs(final_dir, exist_ok=True)
    for f in os.listdir(final_dir):
        if f.endswith('.png'):
            os.remove(os.path.join(final_dir, f))

    # Open the video
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Failed to open video: {video_path}")
        return False

    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print(f"Analyzing video: {fps:.1f} fps, {total_frames} frames")
    print("Finding the best frame for each story moment...")

    # Track frame filename -> original timestamp (seconds)
    frame_metadata = {}

    # Process each subtitle
    selected_count = 0
    for idx, sub in enumerate(story_subs[:max_frames]):
        # Emotion analysis is internal only; it is never shown to the user
        text_emotions = emotion_matcher.analyze_text_emotion(sub.content)
        target_mood = max(text_emotions.items(),
                          key=lambda x: x[1] if x[0] != 'intensity' else 0)[0]

        # Simple, non-technical progress indicator
        if idx % 5 == 0:
            print(f"  Processing moments {idx + 1}-{min(idx + 5, len(story_subs))}...")

        # Find the most engaging frame for this moment
        best_frame = find_most_engaging_frame(
            cap, sub, fps,
            face_analyzer, eye_detector,
            target_mood, text_emotions
        )

        if best_frame is not None:
            # Save the selected frame with consistent naming
            filename = f"frame_{selected_count:03d}.png"
            output_path = os.path.join(final_dir, filename)

            # Apply visual enhancements for comic style
            enhanced_frame = enhance_for_comic(best_frame['image'])
            cv2.imwrite(output_path, enhanced_frame)

            # Store the original timestamp (midpoint of the subtitle)
            original_timestamp = sub.start.total_seconds() + (sub.end.total_seconds() - sub.start.total_seconds()) / 2
            frame_metadata[filename] = original_timestamp
            selected_count += 1
        else:
            # Fallback: grab a decent frame from the middle of the window
            fallback_frame = get_decent_frame(cap, sub, fps)
            if fallback_frame is not None:
                filename = f"frame_{selected_count:03d}.png"
                output_path = os.path.join(final_dir, filename)
                enhanced_frame = enhance_for_comic(fallback_frame)
                cv2.imwrite(output_path, enhanced_frame)

                # Store the fallback timestamp (same midpoint convention)
                original_timestamp = sub.start.total_seconds() + (sub.end.total_seconds() - sub.start.total_seconds()) / 2
                frame_metadata[filename] = original_timestamp
                selected_count += 1

    cap.release()

    # Save metadata for regeneration (critical for video-based regenerate)
    with open("frames/frame_metadata.json", "w") as f:
        json.dump(frame_metadata, f, indent=2)

    print(f"\nSelected {selected_count} engaging frames for comic")
    print(f"Frames saved to: {final_dir}")
    print("Frame metadata saved to: frames/frame_metadata.json")
    return selected_count > 0
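
# frames/frame_metadata.json (written above) maps each saved frame filename
# to its source timestamp: the subtitle midpoint, in seconds. Illustrative
# contents (values are made up):
#
#     {
#       "frame_000.png": 12.48,
#       "frame_001.png": 17.02
#     }
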
def find_most_engaging_frame(cap, subtitle, fps, face_analyzer, eye_detector,
                             target_mood, text_emotions):
    """
    Find the most visually engaging frame for this subtitle.

    Scoring is based on:
    - Expression matching the dialogue (internal, not shown)
    - Eye quality (open, alert)
    - Visual composition
    - Sharpness/clarity
    """
    # Time window to search
    start_time = subtitle.start.total_seconds()
    end_time = subtitle.end.total_seconds()

    # Extend the search window slightly for better options
    search_start = max(0, start_time - 0.5)
    search_end = end_time + 0.5
    start_frame = int(search_start * fps)
    end_frame = int(search_end * fps)

    # Sample up to 15 evenly spaced frames from the window
    num_samples = min(15, end_frame - start_frame)
    if num_samples <= 0:
        num_samples = 5
    frame_step = max(1, (end_frame - start_frame) // num_samples)

    best_frame = None
    best_score = -1
    for frame_num in range(start_frame, end_frame, frame_step):
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()
        if not ret or frame is None:
            continue

        # Calculate the engagement score for this candidate
        score = calculate_engagement_score(
            frame, face_analyzer, eye_detector,
            target_mood, text_emotions
        )
        if score > best_score:
            best_score = score
            best_frame = {
                'image': frame.copy(),
                'score': score,
                'frame_num': frame_num
            }
    return best_frame
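
# Note: `cap.set(cv2.CAP_PROP_POS_FRAMES, ...)` forces a seek for every
# sampled frame, which can be slow on long videos. A sequential variant is
# sketched below (hypothetical helper, not used above; assumes the same
# scoring loop would consume the yielded frames):
#
#     def iter_window_frames(cap, start_frame, end_frame, step):
#         cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)  # one seek only
#         for n in range(start_frame, end_frame):
#             ret, frame = cap.read()
#             if not ret:
#                 break
#             if (n - start_frame) % step == 0:
#                 yield n, frame
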
def calculate_engagement_score(frame, face_analyzer, eye_detector,
                               target_mood, text_emotions):
    """
    Calculate how engaging/suitable this frame is for the comic.

    High scores for:
    - Good facial expressions
    - Open eyes
    - Clear image
    - Good composition
    """
    score = 0.0

    # The analyzers take file paths, so write a temporary PNG for analysis
    temp_path = "temp_frame_analysis.png"
    cv2.imwrite(temp_path, frame)
    try:
        # 1. Eye quality (most important for comics)
        eye_state = eye_detector.check_eyes_state(temp_path)
        if eye_state['state'] == 'open':
            score += 3.0
        elif eye_state['state'] == 'partially_open':
            score += 1.5
        elif eye_state['state'] == 'unknown':
            score += 1.0  # No face detected; the frame might still be usable
        else:
            # closed or half_closed: no points, a strong penalty relative
            # to the +3.0 awarded for open eyes
            score += 0.0

        # 2. Expression quality (internal matching, never shown to the user)
        face_emotions = face_analyzer.analyze_expression(temp_path)

        # Reward expressions that match the dialogue mood
        if target_mood in face_emotions and face_emotions[target_mood] > 0.3:
            score += 2.0 * face_emotions[target_mood]

        # General expressiveness: any strong emotion is interesting
        max_emotion = max(face_emotions.values())
        if max_emotion > 0.5:
            score += 1.0

        # 3. Image quality
        sharpness = calculate_sharpness(frame)
        score += sharpness * 0.5

        # 4. Composition (face detection confidence)
        if eye_state.get('confidence', 0) > 0.7:
            score += 0.5
    finally:
        # Always clean up the temporary file
        if os.path.exists(temp_path):
            os.remove(temp_path)
    return score
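
# Worked example of the scoring above (illustrative numbers): a frame with
# open eyes (+3.0), a mood-matching expression at 0.6 (+2.0 * 0.6 = +1.2),
# some emotion above 0.5 (+1.0), sharpness 0.6 (+0.3), and detector
# confidence above 0.7 (+0.5) scores 6.0 in total.
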
def calculate_sharpness(frame):
    """Calculate image sharpness using Laplacian variance."""
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    laplacian = cv2.Laplacian(gray, cv2.CV_64F)
    variance = laplacian.var()
    # Normalize to the 0-1 range (500 is this module's own heuristic cut-off)
    normalized = min(variance / 500.0, 1.0)
    return normalized
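
# Rough calibration of the heuristic above (illustrative, not measured):
# Laplacian variance 500+ maps to sharpness 1.0, variance 250 to 0.5, and
# variance 100 to 0.2; get_decent_frame below accepts candidates only
# above 0.3.
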
def enhance_for_comic(frame):
    """Apply subtle enhancements to make the frame more comic-like."""
    # Boost local contrast on the lightness channel only (CLAHE on L of LAB)
    # so colors are preserved while edges and shading pop slightly.
    lab = cv2.cvtColor(frame, cv2.COLOR_BGR2LAB)
    l, a, b = cv2.split(lab)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    l = clahe.apply(l)
    enhanced = cv2.merge([l, a, b])
    enhanced = cv2.cvtColor(enhanced, cv2.COLOR_LAB2BGR)
    return enhanced

def get_decent_frame(cap, subtitle, fps):
    """Get a decent fallback frame when no engaging frame was found."""
    # Try the midpoint first, then progressively earlier/later positions
    positions = [0.5, 0.3, 0.7, 0.2, 0.8]
    duration = subtitle.end.total_seconds() - subtitle.start.total_seconds()
    for pos in positions:
        time_offset = subtitle.start.total_seconds() + (duration * pos)
        frame_num = int(time_offset * fps)
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()
        if ret and frame is not None:
            # Accept the first reasonably sharp candidate
            if calculate_sharpness(frame) > 0.3:
                return frame
    return None
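
# Minimal smoke-test entry point. The default paths and the use of
# `srt.parse` here are illustrative assumptions; the real pipeline
# presumably builds `story_subs` elsewhere.
if __name__ == "__main__":
    import sys
    import srt

    video = sys.argv[1] if len(sys.argv) > 1 else "input.mp4"
    subs_path = sys.argv[2] if len(sys.argv) > 2 else "story.srt"
    with open(subs_path, "r", encoding="utf-8") as fh:
        subs = list(srt.parse(fh.read()))
    ok = generate_keyframes_engaging(video, subs)
    sys.exit(0 if ok else 1)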