"""
Emotion-based keyframe selection: analyzes dialogue emotions FIRST, then
selects the video frames whose facial expressions best match them.
"""

import os

import cv2
import srt
from typing import Dict, List

from backend.enhanced_emotion_matcher import EnhancedEmotionMatcher
from backend.eye_state_detector import EyeStateDetector
from backend.emotion_aware_comic import FacialExpressionAnalyzer
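
# Note (inferred from usage below, not guaranteed by the backend helpers):
#   - analyze_text_emotion(text) / analyze_expression(image_path) are assumed
#     to return dicts of emotion scores in [0, 1] plus an 'intensity' key,
#     e.g. {'happy': 0.8, ..., 'intensity': 0.7}
#   - check_eyes_state(image_path) is assumed to return
#     {'state': 'open' | 'half_closed' | 'closed', ...}
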
def generate_keyframes_emotion_based(video_path: str, story_subs: List, max_frames: int = 48) -> bool:
    """
    Generate keyframes by matching facial expressions to dialogue emotions.

    Analyzes each dialogue line's emotion FIRST, then scans the video for the
    frame whose facial expression best matches it. Returns True if at least
    one frame was selected.
    """
    print("🎭 Emotion-Based Frame Selection (analyzing emotions BEFORE frame selection)")
    print(f"📝 Analyzing {len(story_subs)} dialogues for emotions...")

    emotion_matcher = EnhancedEmotionMatcher()
    face_analyzer = FacialExpressionAnalyzer()
    eye_detector = EyeStateDetector()

    # Step 1: analyze the text emotion of each dialogue line.
    dialogue_emotions = []
    for i, sub in enumerate(story_subs[:max_frames]):
        text_emotions = emotion_matcher.analyze_text_emotion(sub.content)
        # Pick the strongest emotion, skipping the separate 'intensity' key.
        dominant_emotion = max(text_emotions.items(),
                               key=lambda x: x[1] if x[0] != 'intensity' else 0)[0]

        dialogue_emotions.append({
            'subtitle': sub,
            'text': sub.content,
            'emotions': text_emotions,
            'dominant': dominant_emotion,
            'start_time': sub.start.total_seconds(),
            'end_time': sub.end.total_seconds(),
        })

        print(f"  📖 Dialogue {i + 1}: '{sub.content[:40]}...' → {dominant_emotion}")

print(f"\n🎬 Scanning video for matching facial expressions...") |
|
|
|
|
|
|
|
|
final_dir = "frames/final" |
|
|
os.makedirs(final_dir, exist_ok=True) |
|
|
|
|
|
|
|
|
for f in os.listdir(final_dir): |
|
|
if f.endswith('.png'): |
|
|
os.remove(os.path.join(final_dir, f)) |
|
|
|
|
|
|
|
|
cap = cv2.VideoCapture(video_path) |
|
|
if not cap.isOpened(): |
|
|
print(f"❌ Failed to open video: {video_path}") |
|
|
return False |
|
|
|
|
|
fps = cap.get(cv2.CAP_PROP_FPS) |
|
|
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) |
|
|
|
|
|
print(f"📹 Video: {fps} fps, {total_frames} total frames") |
|
|
|
|
|
|
|
|
selected_frames = [] |
|
|
|
|
|
    for idx, dialogue_data in enumerate(dialogue_emotions):
        print(f"\n🔍 Finding best frame for dialogue {idx + 1}: {dialogue_data['dominant']} emotion")

        best_frame = find_best_emotional_frame(
            cap, dialogue_data, fps,
            face_analyzer, eye_detector,
            scan_window=2.0
        )

        if best_frame is not None:
            output_path = os.path.join(final_dir, f"frame{idx:03d}.png")
            cv2.imwrite(output_path, best_frame['image'])

            selected_frames.append({
                'path': output_path,
                'dialogue': dialogue_data,
                'face_emotion': best_frame['face_emotion'],
                'match_score': best_frame['match_score'],
                'eye_state': best_frame['eye_state'],
            })

            print(f"  ✅ Selected frame with {best_frame['face_emotion']} face "
                  f"(match: {best_frame['match_score']:.0%}, eyes: {best_frame['eye_state']})")
        else:
            print("  ⚠️ No good emotional match found, using default frame")

            # Fall back to the frame at the midpoint of the dialogue.
            fallback_frame = get_fallback_frame(cap, dialogue_data, fps)
            if fallback_frame is not None:
                output_path = os.path.join(final_dir, f"frame{idx:03d}.png")
                cv2.imwrite(output_path, fallback_frame)
                selected_frames.append({
                    'path': output_path,
                    'dialogue': dialogue_data,
                    'face_emotion': 'unknown',
                    'match_score': 0.0,
                    'eye_state': 'unknown',
                })

    cap.release()

    # Step 4: summarize the selection results.
    print("\n📊 Emotion-Based Selection Summary:")
    print(f"✅ Selected {len(selected_frames)} frames based on emotion matching")

    if selected_frames:
        good_matches = sum(1 for f in selected_frames if f['match_score'] > 0.7)
        print(f"😊 Good emotion matches: {good_matches}/{len(selected_frames)}")

        emotion_counts = {}
        for frame in selected_frames:
            emotion = frame['face_emotion']
            emotion_counts[emotion] = emotion_counts.get(emotion, 0) + 1

        print("\n🎭 Selected facial expressions:")
        for emotion, count in sorted(emotion_counts.items(), key=lambda x: x[1], reverse=True):
            print(f"  {emotion}: {count} frames")

    return len(selected_frames) > 0


def find_best_emotional_frame(cap, dialogue_data, fps, face_analyzer, eye_detector, scan_window=2.0):
    """
    Find the frame that best matches the dialogue emotion.

    Scans a window of ±scan_window seconds around the dialogue's midpoint and
    scores each sampled frame's facial expression against the text emotion.
    Returns a dict describing the best frame, or None if nothing usable was found.
    """
    target_emotion = dialogue_data['dominant']
    text_emotions = dialogue_data['emotions']

    # Scan a symmetric window around the middle of the dialogue.
    center_time = (dialogue_data['start_time'] + dialogue_data['end_time']) / 2
    start_time = max(0, center_time - scan_window)
    end_time = center_time + scan_window

    start_frame = int(start_time * fps)
    end_frame = int(end_time * fps)

    # Sample at most 20 frames from the window to keep the scan fast.
    num_samples = min(20, end_frame - start_frame)
    if num_samples <= 0:
        num_samples = 5

    frame_step = max(1, (end_frame - start_frame) // num_samples)
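    # Example (illustrative numbers): at 25 fps with scan_window=2.0, the
    # 4-second window spans ~100 frames, so num_samples=20 gives
    # frame_step=5, i.e. every 5th frame is checked.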

    best_match = None
    best_score = -1

    for frame_num in range(start_frame, end_frame, frame_step):
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()

        if not ret or frame is None:
            continue

        # The analyzers work on image files, so write the frame to a
        # temporary file for the duration of the checks.
        temp_path = f"temp_emotion_check_{frame_num}.png"
        cv2.imwrite(temp_path, frame)

        try:
            # Skip frames where the subject's eyes are closed or half closed.
            eye_state = eye_detector.check_eyes_state(temp_path)
            if eye_state['state'] in ['closed', 'half_closed']:
                continue

            # Score the facial expression against the text emotion.
            face_emotions = face_analyzer.analyze_expression(temp_path)
            face_dominant = max(face_emotions.items(),
                                key=lambda x: x[1] if x[0] != 'intensity' else 0)[0]

            score = calculate_emotion_match_score(text_emotions, face_emotions, target_emotion)

            # Prefer fully open eyes with a 20% score bonus.
            if eye_state['state'] == 'open':
                score *= 1.2

            if score > best_score:
                best_score = score
                best_match = {
                    'image': frame.copy(),
                    'face_emotion': face_dominant,
                    'face_emotions': face_emotions,
                    'match_score': min(score, 1.0),
                    'eye_state': eye_state['state'],
                    'frame_num': frame_num,
                }
        finally:
            # Clean up the temp file even when a frame is skipped.
            if os.path.exists(temp_path):
                os.remove(temp_path)

    return best_match


def calculate_emotion_match_score(text_emotions: Dict, face_emotions: Dict, target_emotion: str) -> float:
    """Calculate how well the facial expression matches the text emotion."""
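    # Worked example (illustrative numbers, not real detector output): for
    # target_emotion='happy', text_emotions={'happy': 0.8, 'intensity': 0.7},
    # face_emotions={'happy': 0.6, 'sad': 0.1, 'intensity': 0.5}:
    #   direct match:   0.6 * 2.0               = 1.20  (face 'happy' > 0.3)
    #   dominant bonus: +0.50                           (face dominant == target)
    #   overlap:        min(0.8, 0.6) * 0.5     = 0.30
    #   intensity:      (1 - |0.7 - 0.5|) * 0.3 = 0.24
    #   total ≈ 2.24 (the caller clamps the reported score to 1.0)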
    score = 0.0

    # Strong direct match on the target emotion.
    if target_emotion in face_emotions and face_emotions[target_emotion] > 0.3:
        score += face_emotions[target_emotion] * 2.0

    # Bonus when the face's dominant emotion equals the target.
    face_dominant = max(face_emotions.items(),
                        key=lambda x: x[1] if x[0] != 'intensity' else 0)[0]
    if face_dominant == target_emotion:
        score += 0.5

    # Reward overlap (and penalize mismatch) across the basic emotions.
    for emotion in ['happy', 'sad', 'angry', 'surprised', 'scared', 'neutral']:
        text_val = text_emotions.get(emotion, 0)
        face_val = face_emotions.get(emotion, 0)

        if text_val > 0.3 and face_val > 0.3:
            # Both the text and the face show this emotion.
            score += min(text_val, face_val) * 0.5
        elif text_val > 0.5 and face_val < 0.2:
            # The text strongly implies an emotion the face lacks.
            score -= 0.2

    # Prefer frames whose expression intensity matches the text intensity.
    text_intensity = text_emotions.get('intensity', 0.5)
    face_intensity = face_emotions.get('intensity', 0.5)
    intensity_diff = abs(text_intensity - face_intensity)
    score += (1 - intensity_diff) * 0.3

    return max(0, score)


def get_fallback_frame(cap, dialogue_data, fps):
    """Return the frame at the midpoint of the dialogue, or None on failure."""
    middle_time = (dialogue_data['start_time'] + dialogue_data['end_time']) / 2
    frame_num = int(middle_time * fps)

    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
    ret, frame = cap.read()

    return frame if ret else None
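

# A minimal usage sketch (hypothetical file paths; assumes subtitles in an
# .srt file, parsed with the `srt` library imported above).
if __name__ == "__main__":
    with open("subtitles/story.srt", encoding="utf-8") as f:
        story_subs = list(srt.parse(f.read()))

    ok = generate_keyframes_emotion_based("videos/story.mp4", story_subs, max_frames=48)
    print("✅ Keyframes written to frames/final" if ok else "❌ No frames selected")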