"""
Emotion-based keyframe selection - analyzes emotions FIRST, then selects matching frames
"""
import os

import cv2
import srt
from typing import Dict, List

from backend.emotion_aware_comic import FacialExpressionAnalyzer
from backend.enhanced_emotion_matcher import EnhancedEmotionMatcher
from backend.eye_state_detector import EyeStateDetector
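
# NOTE: The backend.* modules above are project-internal analyzers; this file
# assumes they expose analyze_text_emotion(), analyze_expression(), and
# check_eyes_state() with the call shapes used below.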


def generate_keyframes_emotion_based(video_path: str, story_subs: List, max_frames: int = 48):
    """
    Generate keyframes by matching facial expressions to dialogue emotions.

    Analyzes dialogue emotions FIRST, then finds the best matching frames.
    """
    print("Emotion-based frame selection (analyzing emotions BEFORE frame selection)")
    print(f"Analyzing {len(story_subs)} dialogues for emotions...")

    # Initialize analyzers
    emotion_matcher = EnhancedEmotionMatcher()
    face_analyzer = FacialExpressionAnalyzer()
    eye_detector = EyeStateDetector()

    # Step 1: Analyze all dialogue emotions first
    dialogue_emotions = []
    for i, sub in enumerate(story_subs[:max_frames]):
        text_emotions = emotion_matcher.analyze_text_emotion(sub.content)
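        # Pick the strongest emotion label; skip the 'intensity' key, which is
        # a magnitude rather than an emotion category.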
        dominant_emotion = max(text_emotions.items(),
                               key=lambda x: x[1] if x[0] != 'intensity' else 0)[0]
        dialogue_emotions.append({
            'subtitle': sub,
            'text': sub.content,
            'emotions': text_emotions,
            'dominant': dominant_emotion,
            'start_time': sub.start.total_seconds(),
            'end_time': sub.end.total_seconds()
        })
        print(f"  Dialogue {i+1}: '{sub.content[:40]}...' -> {dominant_emotion}")
print(f"\n㪠Scanning video for matching facial expressions...")
# Ensure output directory exists
final_dir = "frames/final"
os.makedirs(final_dir, exist_ok=True)
# Clear existing frames
for f in os.listdir(final_dir):
if f.endswith('.png'):
os.remove(os.path.join(final_dir, f))
# Open video
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
print(f"β Failed to open video: {video_path}")
return False
fps = cap.get(cv2.CAP_PROP_FPS)
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
print(f"πΉ Video: {fps} fps, {total_frames} total frames")

    # Step 2: For each dialogue, find the best matching frame
    selected_frames = []
    for idx, dialogue_data in enumerate(dialogue_emotions):
        print(f"\nFinding best frame for dialogue {idx+1}: {dialogue_data['dominant']} emotion")

        best_frame = find_best_emotional_frame(
            cap, dialogue_data, fps,
            face_analyzer, eye_detector,
            scan_window=2.0  # Scan 2 seconds around dialogue
        )

        if best_frame is not None:
            # Save the selected frame
            output_path = os.path.join(final_dir, f"frame{idx:03d}.png")
            cv2.imwrite(output_path, best_frame['image'])
            selected_frames.append({
                'path': output_path,
                'dialogue': dialogue_data,
                'face_emotion': best_frame['face_emotion'],
                'match_score': best_frame['match_score'],
                'eye_state': best_frame['eye_state']
            })
            print(f"  Selected frame with {best_frame['face_emotion']} face "
                  f"(match: {best_frame['match_score']:.0%}, eyes: {best_frame['eye_state']})")
        else:
            print("  No good emotional match found, using default frame")
            # Fallback: just get the middle frame of the dialogue
            fallback_frame = get_fallback_frame(cap, dialogue_data, fps)
            if fallback_frame is not None:
                output_path = os.path.join(final_dir, f"frame{idx:03d}.png")
                cv2.imwrite(output_path, fallback_frame)
                selected_frames.append({
                    'path': output_path,
                    'dialogue': dialogue_data,
                    'face_emotion': 'unknown',
                    'match_score': 0.0,
                    'eye_state': 'unknown'
                })

    cap.release()

    # Summary
    print("\nEmotion-Based Selection Summary:")
    print(f"Selected {len(selected_frames)} frames based on emotion matching")
    if selected_frames:
        good_matches = sum(1 for f in selected_frames if f['match_score'] > 0.7)
        print(f"Good emotion matches: {good_matches}/{len(selected_frames)}")

        # Count emotions
        emotion_counts = {}
        for frame in selected_frames:
            emotion = frame['face_emotion']
            emotion_counts[emotion] = emotion_counts.get(emotion, 0) + 1

        print("\nSelected facial expressions:")
        for emotion, count in sorted(emotion_counts.items(), key=lambda x: x[1], reverse=True):
            print(f"  {emotion}: {count} frames")

    return len(selected_frames) > 0


def find_best_emotional_frame(cap, dialogue_data, fps, face_analyzer, eye_detector, scan_window=2.0):
    """
    Find the best frame that matches the dialogue emotion.

    Scans frames around the dialogue timing to find a matching facial expression.
    """
    target_emotion = dialogue_data['dominant']
    text_emotions = dialogue_data['emotions']

    # Calculate scan range
    center_time = (dialogue_data['start_time'] + dialogue_data['end_time']) / 2
    start_time = max(0, center_time - scan_window)
    end_time = center_time + scan_window
    start_frame = int(start_time * fps)
    end_frame = int(end_time * fps)

    # Sample frames (don't check every single frame)
    num_samples = min(20, end_frame - start_frame)  # Check up to 20 frames
    if num_samples <= 0:
        num_samples = 5
    frame_step = max(1, (end_frame - start_frame) // num_samples)
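    # e.g. a 2.0 s window on each side spans ~4 s; at 25 fps that is ~100 frames,
    # sampled every 5th frame for 20 expression checks (illustrative numbers).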

    best_match = None
    best_score = -1

    for frame_num in range(start_frame, end_frame, frame_step):
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()
        if not ret or frame is None:
            continue

        # Save temp frame for analysis
        temp_path = f"temp_emotion_check_{frame_num}.png"
        cv2.imwrite(temp_path, frame)

        try:
            # Check eye state first
            eye_state = eye_detector.check_eyes_state(temp_path)

            # Skip if eyes are closed or half-closed
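            # (`continue` still runs the `finally` clause, so the temp file is removed)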
            if eye_state['state'] in ['closed', 'half_closed']:
                continue

            # Analyze facial expression
            face_emotions = face_analyzer.analyze_expression(temp_path)
            face_dominant = max(face_emotions.items(),
                                key=lambda x: x[1] if x[0] != 'intensity' else 0)[0]

            # Calculate match score
            score = calculate_emotion_match_score(text_emotions, face_emotions, target_emotion)

            # Bonus for good eye state
            if eye_state['state'] == 'open':
                score *= 1.2
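            # (the bonus can push the raw score above 1.0; it is clamped when stored)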

            # Update best match
            if score > best_score:
                best_score = score
                best_match = {
                    'image': frame.copy(),
                    'face_emotion': face_dominant,
                    'face_emotions': face_emotions,
                    'match_score': min(score, 1.0),
                    'eye_state': eye_state['state'],
                    'frame_num': frame_num
                }
        finally:
            # Clean up temp file
            if os.path.exists(temp_path):
                os.remove(temp_path)

    return best_match


def calculate_emotion_match_score(text_emotions: Dict, face_emotions: Dict, target_emotion: str) -> float:
    """Calculate how well the face matches the text emotion."""
    score = 0.0

    # Direct match bonus
    if target_emotion in face_emotions and face_emotions[target_emotion] > 0.3:
        score += face_emotions[target_emotion] * 2.0

    # Check if the face has the target emotion as dominant
    face_dominant = max(face_emotions.items(),
                        key=lambda x: x[1] if x[0] != 'intensity' else 0)[0]
    if face_dominant == target_emotion:
        score += 0.5

    # Compare all emotions
    for emotion in ['happy', 'sad', 'angry', 'surprised', 'scared', 'neutral']:
        text_val = text_emotions.get(emotion, 0)
        face_val = face_emotions.get(emotion, 0)
        if text_val > 0.3 and face_val > 0.3:
            # Both have this emotion
            score += min(text_val, face_val) * 0.5
        elif text_val > 0.5 and face_val < 0.2:
            # Text has the emotion but the face doesn't - penalty
            score -= 0.2

    # Intensity matching
    text_intensity = text_emotions.get('intensity', 0.5)
    face_intensity = face_emotions.get('intensity', 0.5)
    intensity_diff = abs(text_intensity - face_intensity)
    score += (1 - intensity_diff) * 0.3

    return max(0, score)
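

# Worked example of the scoring above, with illustrative values (not from the source):
#   text_emotions  = {'happy': 0.8, 'intensity': 0.7}
#   face_emotions  = {'happy': 0.6, 'neutral': 0.2, 'intensity': 0.6}
#   target_emotion = 'happy'
#   direct match:   0.6 > 0.3               -> +0.6 * 2.0 = 1.20
#   dominant match: 'happy' == 'happy'      -> +0.50      -> 1.70
#   shared emotion: min(0.8, 0.6) * 0.5     -> +0.30      -> 2.00
#   intensity:      (1 - |0.7 - 0.6|) * 0.3 -> +0.27      -> 2.27
# Raw scores can exceed 1.0; find_best_emotional_frame() clamps them on storage.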


def get_fallback_frame(cap, dialogue_data, fps):
    """Get a fallback frame from the middle of the dialogue."""
    middle_time = (dialogue_data['start_time'] + dialogue_data['end_time']) / 2
    frame_num = int(middle_time * fps)
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
    ret, frame = cap.read()
    return frame if ret else None
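

# Minimal usage sketch. The "story.mp4" / "story.srt" paths are hypothetical;
# this module only assumes subtitle objects exposing .content, .start, and .end,
# which the `srt` library (imported above) produces via srt.parse().
if __name__ == "__main__":
    with open("story.srt", encoding="utf-8") as fh:
        subs = list(srt.parse(fh.read()))
    ok = generate_keyframes_emotion_based("story.mp4", subs, max_frames=48)
    print("Done." if ok else "No frames were selected.")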