"""
Emotion-based keyframe selection - analyzes emotions FIRST, then selects matching frames
"""

import os
import cv2
import srt
from typing import List, Dict
from backend.enhanced_emotion_matcher import EnhancedEmotionMatcher
from backend.eye_state_detector import EyeStateDetector
from backend.emotion_aware_comic import FacialExpressionAnalyzer

def generate_keyframes_emotion_based(video_path: str, story_subs: List, max_frames: int = 48) -> bool:
    """
    Generate keyframes by matching facial expressions to dialogue emotions.

    Analyzes the emotion of each dialogue line FIRST, then scans the video
    around that line's timing for the best-matching facial expression.
    Selected frames are written to frames/final/; returns True if at least
    one frame was saved.
    """
    
    print(f"🎭 Emotion-Based Frame Selection (Analyzing emotions BEFORE frame selection)")
    print(f"πŸ“ Analyzing {len(story_subs)} dialogues for emotions...")
    
    # Initialize analyzers
    emotion_matcher = EnhancedEmotionMatcher()
    face_analyzer = FacialExpressionAnalyzer()
    eye_detector = EyeStateDetector()
    
    # Step 1: Analyze all dialogue emotions first
    dialogue_emotions = []
    for i, sub in enumerate(story_subs[:max_frames]):
        text_emotions = emotion_matcher.analyze_text_emotion(sub.content)
        # Pick the strongest emotion, excluding the auxiliary 'intensity' key
        # (mapping it to 0 could still let it win when all emotions score 0)
        dominant_emotion = max(
            (item for item in text_emotions.items() if item[0] != 'intensity'),
            key=lambda x: x[1],
        )[0]
        
        dialogue_emotions.append({
            'subtitle': sub,
            'text': sub.content,
            'emotions': text_emotions,
            'dominant': dominant_emotion,
            'start_time': sub.start.total_seconds(),
            'end_time': sub.end.total_seconds()
        })
        
        print(f"  πŸ“– Dialogue {i+1}: '{sub.content[:40]}...' β†’ {dominant_emotion}")
    
    print(f"\n🎬 Scanning video for matching facial expressions...")
    
    # Ensure output directory exists
    final_dir = "frames/final"
    os.makedirs(final_dir, exist_ok=True)
    
    # Clear existing frames
    for f in os.listdir(final_dir):
        if f.endswith('.png'):
            os.remove(os.path.join(final_dir, f))
    
    # Open video
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"❌ Failed to open video: {video_path}")
        return False
    
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    
    print(f"πŸ“Ή Video: {fps} fps, {total_frames} total frames")
    
    # Step 2: For each dialogue, find the best matching frame
    selected_frames = []
    
    for idx, dialogue_data in enumerate(dialogue_emotions):
        print(f"\nπŸ” Finding best frame for dialogue {idx+1}: {dialogue_data['dominant']} emotion")
        
        best_frame = find_best_emotional_frame(
            cap, dialogue_data, fps, 
            face_analyzer, eye_detector,
            scan_window=2.0  # Scan ±2 seconds around the dialogue midpoint
        )
        
        if best_frame is not None:
            # Save the selected frame
            output_path = os.path.join(final_dir, f"frame{idx:03d}.png")
            cv2.imwrite(output_path, best_frame['image'])
            
            selected_frames.append({
                'path': output_path,
                'dialogue': dialogue_data,
                'face_emotion': best_frame['face_emotion'],
                'match_score': best_frame['match_score'],
                'eye_state': best_frame['eye_state']
            })
            
            print(f"  βœ… Selected frame with {best_frame['face_emotion']} face " +
                  f"(match: {best_frame['match_score']:.0%}, eyes: {best_frame['eye_state']})")
        else:
            print(f"  ⚠️ No good emotional match found, using default frame")
            # Fallback: just get middle frame
            fallback_frame = get_fallback_frame(cap, dialogue_data, fps)
            if fallback_frame is not None:
                output_path = os.path.join(final_dir, f"frame{idx:03d}.png")
                cv2.imwrite(output_path, fallback_frame)
                selected_frames.append({
                    'path': output_path,
                    'dialogue': dialogue_data,
                    'face_emotion': 'unknown',
                    'match_score': 0.0,
                    'eye_state': 'unknown'
                })
    
    cap.release()
    
    # Summary
    print(f"\nπŸ“Š Emotion-Based Selection Summary:")
    print(f"βœ… Selected {len(selected_frames)} frames based on emotion matching")
    
    if selected_frames:
        good_matches = sum(1 for f in selected_frames if f['match_score'] > 0.7)
        print(f"😊 Good emotion matches: {good_matches}/{len(selected_frames)}")
        
        # Count emotions
        emotion_counts = {}
        for frame in selected_frames:
            emotion = frame['face_emotion']
            emotion_counts[emotion] = emotion_counts.get(emotion, 0) + 1
        
        print("\n🎭 Selected facial expressions:")
        for emotion, count in sorted(emotion_counts.items(), key=lambda x: x[1], reverse=True):
            print(f"  {emotion}: {count} frames")
    
    return len(selected_frames) > 0


def find_best_emotional_frame(cap, dialogue_data, fps, face_analyzer, eye_detector, scan_window=2.0):
    """
    Find the best frame that matches the dialogue emotion
    
    Scans frames around the dialogue timing to find matching facial expression
    """
    
    target_emotion = dialogue_data['dominant']
    text_emotions = dialogue_data['emotions']
    
    # Calculate scan range
    center_time = (dialogue_data['start_time'] + dialogue_data['end_time']) / 2
    start_time = max(0, center_time - scan_window)
    end_time = center_time + scan_window
    
    start_frame = int(start_time * fps)
    end_frame = int(end_time * fps)
    
    # Sample up to 20 evenly spaced frames rather than checking every one
    span = end_frame - start_frame
    num_samples = max(1, min(20, span))
    frame_step = max(1, span // num_samples)
    
    best_match = None
    best_score = -1
    
    for frame_num in range(start_frame, end_frame, frame_step):
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()
        
        if not ret or frame is None:
            continue
        
        # Save temp frame for analysis
        temp_path = f"temp_emotion_check_{frame_num}.png"
        cv2.imwrite(temp_path, frame)
        
        try:
            # Check eye state first
            eye_state = eye_detector.check_eyes_state(temp_path)
            
            # Skip if eyes are closed or half-closed
            if eye_state['state'] in ['closed', 'half_closed']:
                continue
            
            # Analyze facial expression
            face_emotions = face_analyzer.analyze_expression(temp_path)
            face_dominant = max(
                (item for item in face_emotions.items() if item[0] != 'intensity'),
                key=lambda x: x[1],
            )[0]
            
            # Calculate match score
            score = calculate_emotion_match_score(text_emotions, face_emotions, target_emotion)
            
            # Bonus for good eye state
            if eye_state['state'] == 'open':
                score *= 1.2
            
            # Update best match
            if score > best_score:
                best_score = score
                best_match = {
                    'image': frame.copy(),
                    'face_emotion': face_dominant,
                    'face_emotions': face_emotions,
                    'match_score': min(score, 1.0),
                    'eye_state': eye_state['state'],
                    'frame_num': frame_num
                }
        
        finally:
            # Clean up temp file
            if os.path.exists(temp_path):
                os.remove(temp_path)
    
    return best_match


def calculate_emotion_match_score(text_emotions: Dict, face_emotions: Dict, target_emotion: str) -> float:
    """Calculate how well the face matches the text emotion"""
    
    score = 0.0
    
    # Direct match bonus
    if target_emotion in face_emotions and face_emotions[target_emotion] > 0.3:
        score += face_emotions[target_emotion] * 2.0
    
    # Check if face has the target emotion as dominant
    face_dominant = max(
        (item for item in face_emotions.items() if item[0] != 'intensity'),
        key=lambda x: x[1],
    )[0]
    if face_dominant == target_emotion:
        score += 0.5
    
    # Compare all emotions
    for emotion in ['happy', 'sad', 'angry', 'surprised', 'scared', 'neutral']:
        text_val = text_emotions.get(emotion, 0)
        face_val = face_emotions.get(emotion, 0)
        
        if text_val > 0.3 and face_val > 0.3:
            # Both have this emotion
            score += min(text_val, face_val) * 0.5
        elif text_val > 0.5 and face_val < 0.2:
            # Text has emotion but face doesn't - penalty
            score -= 0.2
    
    # Intensity matching
    text_intensity = text_emotions.get('intensity', 0.5)
    face_intensity = face_emotions.get('intensity', 0.5)
    intensity_diff = abs(text_intensity - face_intensity)
    score += (1 - intensity_diff) * 0.3
    
    return max(0, score)
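
# Worked example of the scoring above, with illustrative (made-up) numbers
# rather than real analyzer output:
#
#   target_emotion = 'happy'
#   text_emotions  = {'happy': 0.8, 'neutral': 0.1, 'intensity': 0.7}
#   face_emotions  = {'happy': 0.6, 'neutral': 0.2, 'intensity': 0.5}
#
#   direct match bonus:  0.6 * 2.0                 = 1.20
#   dominant match:                                + 0.50
#   shared 'happy':      min(0.8, 0.6) * 0.5       = 0.30
#   intensity term:      (1 - |0.7 - 0.5|) * 0.3   = 0.24
#   total                                          = 2.24
#
# The caller (find_best_emotional_frame) then applies the open-eyes bonus
# and caps the stored match_score at 1.0 via min(score, 1.0).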


def get_fallback_frame(cap, dialogue_data, fps):
    """Get a fallback frame from the middle of the dialogue"""
    
    middle_time = (dialogue_data['start_time'] + dialogue_data['end_time']) / 2
    frame_num = int(middle_time * fps)
    
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
    ret, frame = cap.read()
    
    return frame if ret else None
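

if __name__ == "__main__":
    # Minimal usage sketch. The file names below are hypothetical
    # placeholders; pass your own video and SRT subtitle paths on the
    # command line. Assumes subtitles parse (via the `srt` package) into
    # objects exposing .content, .start, and .end, as the code above expects.
    import sys

    video_file = sys.argv[1] if len(sys.argv) > 1 else "episode.mp4"
    subs_file = sys.argv[2] if len(sys.argv) > 2 else "episode.srt"

    with open(subs_file, encoding="utf-8") as fh:
        subtitles = list(srt.parse(fh.read()))

    ok = generate_keyframes_emotion_based(video_file, subtitles, max_frames=48)
    print("Frames written to frames/final/" if ok else "No frames selected")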