"""
Select the most engaging frames for comic generation
Focuses on visual quality and storytelling, not showing emotion labels
"""
import os
import cv2
import srt
import json # π ADD THIS LINE
from typing import List, Dict, Tuple
import numpy as np
from backend.enhanced_emotion_matcher import EnhancedEmotionMatcher
from backend.eye_state_detector import EyeStateDetector
from backend.emotion_aware_comic import FacialExpressionAnalyzer
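
# `story_subs` is expected to hold srt.Subtitle-like objects exposing
# `.content` (str) and `.start` / `.end` (datetime.timedelta). A minimal
# stand-in for testing (hypothetical helper, not part of the pipeline):
#
#     from dataclasses import dataclass
#     from datetime import timedelta
#
#     @dataclass
#     class StubSub:
#         content: str
#         start: timedelta
#         end: timedelta
#
#     subs = [StubSub("Hello!", timedelta(seconds=1), timedelta(seconds=3))]
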
def generate_keyframes_engaging(video_path: str, story_subs: List, max_frames: int = 48) -> bool:
    """
    Select the most engaging frames for comic generation.

    Criteria:
    1. Facial expression matches the dialogue mood
    2. Eyes are open (no mid-blink frames)
    3. Good composition (face visible, not blurry)
    4. Dramatic / interesting moments
    """
    print("Selecting most engaging frames for comic generation...")
    print(f"Processing {len(story_subs)} story moments")

    # Initialize analyzers (used internally, never shown to the user)
    emotion_matcher = EnhancedEmotionMatcher()
    face_analyzer = FacialExpressionAnalyzer()
    eye_detector = EyeStateDetector()

    # Ensure the output directory exists, then clear any stale frames
    final_dir = "frames/final"
    os.makedirs(final_dir, exist_ok=True)
    for f in os.listdir(final_dir):
        if f.endswith('.png'):
            os.remove(os.path.join(final_dir, f))

    # Open the video
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Failed to open video: {video_path}")
        return False

    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print(f"Analyzing video: {fps:.1f} fps, {total_frames} frames")
    print("Finding the best frame for each story moment...")

    # Track frame filename -> original timestamp (seconds)
    frame_metadata = {}

    # Process each subtitle
    selected_count = 0
    for idx, sub in enumerate(story_subs[:max_frames]):
        # Emotion analysis is internal only; it is never shown to the user
        text_emotions = emotion_matcher.analyze_text_emotion(sub.content)
        target_mood = max(text_emotions.items(),
                          key=lambda x: x[1] if x[0] != 'intensity' else 0)[0]

        # Simple, non-technical progress indicator
        if idx % 5 == 0:
            print(f"  Processing moments {idx + 1}-{min(idx + 5, len(story_subs))}...")

        # Find the most engaging frame for this moment
        best_frame = find_most_engaging_frame(
            cap, sub, fps,
            face_analyzer, eye_detector,
            target_mood, text_emotions
        )

        if best_frame is not None:
            # Save the selected frame with consistent naming
            filename = f"frame_{selected_count:03d}.png"
            output_path = os.path.join(final_dir, filename)

            # Apply visual enhancements for comic style
            enhanced_frame = enhance_for_comic(best_frame['image'])
            cv2.imwrite(output_path, enhanced_frame)

            # Store the original timestamp (midpoint of the subtitle)
            original_timestamp = sub.start.total_seconds() + (sub.end.total_seconds() - sub.start.total_seconds()) / 2
            frame_metadata[filename] = original_timestamp
            selected_count += 1
        else:
            # Fallback: grab a decent frame from the middle of the window
            fallback_frame = get_decent_frame(cap, sub, fps)
            if fallback_frame is not None:
                filename = f"frame_{selected_count:03d}.png"
                output_path = os.path.join(final_dir, filename)
                enhanced_frame = enhance_for_comic(fallback_frame)
                cv2.imwrite(output_path, enhanced_frame)

                # Store the fallback timestamp (same midpoint convention)
                original_timestamp = sub.start.total_seconds() + (sub.end.total_seconds() - sub.start.total_seconds()) / 2
                frame_metadata[filename] = original_timestamp
                selected_count += 1

    cap.release()

    # Save metadata for regeneration (critical for video-based regenerate)
    with open("frames/frame_metadata.json", "w") as f:
        json.dump(frame_metadata, f, indent=2)

    print(f"\nSelected {selected_count} engaging frames for comic")
    print(f"Frames saved to: {final_dir}")
    print("Frame metadata saved to: frames/frame_metadata.json")
    return selected_count > 0
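
# frames/frame_metadata.json (written above) maps each saved frame filename
# to its source timestamp: the subtitle midpoint, in seconds. Illustrative
# contents (values are made up):
#
#     {
#       "frame_000.png": 12.48,
#       "frame_001.png": 17.02
#     }
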
def find_most_engaging_frame(cap, subtitle, fps, face_analyzer, eye_detector,
                             target_mood, text_emotions):
    """
    Find the most visually engaging frame for this subtitle.

    Scoring is based on:
    - Expression matching the dialogue (internal, not shown)
    - Eye quality (open, alert)
    - Visual composition
    - Sharpness/clarity
    """
    # Time window to search
    start_time = subtitle.start.total_seconds()
    end_time = subtitle.end.total_seconds()

    # Extend the search window slightly for better options
    search_start = max(0, start_time - 0.5)
    search_end = end_time + 0.5
    start_frame = int(search_start * fps)
    end_frame = int(search_end * fps)

    # Sample up to 15 evenly spaced frames from the window
    num_samples = min(15, end_frame - start_frame)
    if num_samples <= 0:
        num_samples = 5
    frame_step = max(1, (end_frame - start_frame) // num_samples)

    best_frame = None
    best_score = -1
    for frame_num in range(start_frame, end_frame, frame_step):
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()
        if not ret or frame is None:
            continue

        # Calculate the engagement score for this candidate
        score = calculate_engagement_score(
            frame, face_analyzer, eye_detector,
            target_mood, text_emotions
        )
        if score > best_score:
            best_score = score
            best_frame = {
                'image': frame.copy(),
                'score': score,
                'frame_num': frame_num
            }
    return best_frame
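
# Note: `cap.set(cv2.CAP_PROP_POS_FRAMES, ...)` forces a seek for every
# sampled frame, which can be slow on long videos. A sequential variant is
# sketched below (hypothetical helper, not used above; assumes the same
# scoring loop would consume the yielded frames):
#
#     def iter_window_frames(cap, start_frame, end_frame, step):
#         cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)  # one seek only
#         for n in range(start_frame, end_frame):
#             ret, frame = cap.read()
#             if not ret:
#                 break
#             if (n - start_frame) % step == 0:
#                 yield n, frame
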
def calculate_engagement_score(frame, face_analyzer, eye_detector,
                               target_mood, text_emotions):
    """
    Calculate how engaging/suitable this frame is for the comic.

    High scores for:
    - Good facial expressions
    - Open eyes
    - Clear image
    - Good composition
    """
    score = 0.0

    # The analyzers take file paths, so write a temporary PNG for analysis
    temp_path = "temp_frame_analysis.png"
    cv2.imwrite(temp_path, frame)
    try:
        # 1. Eye quality (most important for comics)
        eye_state = eye_detector.check_eyes_state(temp_path)
        if eye_state['state'] == 'open':
            score += 3.0
        elif eye_state['state'] == 'partially_open':
            score += 1.5
        elif eye_state['state'] == 'unknown':
            score += 1.0  # No face detected; the frame might still be usable
        else:
            # closed or half_closed: no points, a strong penalty relative
            # to the +3.0 awarded for open eyes
            score += 0.0

        # 2. Expression quality (internal matching, never shown to the user)
        face_emotions = face_analyzer.analyze_expression(temp_path)

        # Reward expressions that match the dialogue mood
        if target_mood in face_emotions and face_emotions[target_mood] > 0.3:
            score += 2.0 * face_emotions[target_mood]

        # General expressiveness: any strong emotion is interesting
        max_emotion = max(face_emotions.values())
        if max_emotion > 0.5:
            score += 1.0

        # 3. Image quality
        sharpness = calculate_sharpness(frame)
        score += sharpness * 0.5

        # 4. Composition (face detection confidence)
        if eye_state.get('confidence', 0) > 0.7:
            score += 0.5
    finally:
        # Always clean up the temporary file
        if os.path.exists(temp_path):
            os.remove(temp_path)
    return score
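
# Worked example of the scoring above (illustrative numbers): a frame with
# open eyes (+3.0), a mood-matching expression at 0.6 (+2.0 * 0.6 = +1.2),
# some emotion above 0.5 (+1.0), sharpness 0.6 (+0.3), and detector
# confidence above 0.7 (+0.5) scores 6.0 in total.
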
def calculate_sharpness(frame):
    """Calculate image sharpness using Laplacian variance."""
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    laplacian = cv2.Laplacian(gray, cv2.CV_64F)
    variance = laplacian.var()
    # Normalize to the 0-1 range (500 is this module's own heuristic cut-off)
    normalized = min(variance / 500.0, 1.0)
    return normalized
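
# Rough calibration of the heuristic above (illustrative, not measured):
# Laplacian variance 500+ maps to sharpness 1.0, variance 250 to 0.5, and
# variance 100 to 0.2; get_decent_frame below accepts candidates only
# above 0.3.
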
def enhance_for_comic(frame):
    """Apply subtle enhancements to make the frame more comic-like."""
    # Boost local contrast on the lightness channel only (CLAHE on L of LAB)
    # so colors are preserved while edges and shading pop slightly.
    lab = cv2.cvtColor(frame, cv2.COLOR_BGR2LAB)
    l, a, b = cv2.split(lab)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    l = clahe.apply(l)
    enhanced = cv2.merge([l, a, b])
    enhanced = cv2.cvtColor(enhanced, cv2.COLOR_LAB2BGR)
    return enhanced

def get_decent_frame(cap, subtitle, fps):
    """Get a decent fallback frame when no engaging frame was found."""
    # Try the midpoint first, then progressively earlier/later positions
    positions = [0.5, 0.3, 0.7, 0.2, 0.8]
    duration = subtitle.end.total_seconds() - subtitle.start.total_seconds()
    for pos in positions:
        time_offset = subtitle.start.total_seconds() + (duration * pos)
        frame_num = int(time_offset * fps)
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()
        if ret and frame is not None:
            # Accept the first reasonably sharp candidate
            if calculate_sharpness(frame) > 0.3:
                return frame
    return None
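
# Minimal smoke-test entry point. The default paths and the use of
# `srt.parse` here are illustrative assumptions; the real pipeline
# presumably builds `story_subs` elsewhere.
if __name__ == "__main__":
    import sys
    import srt

    video = sys.argv[1] if len(sys.argv) > 1 else "input.mp4"
    subs_path = sys.argv[2] if len(sys.argv) > 2 else "story.srt"
    with open(subs_path, "r", encoding="utf-8") as fh:
        subs = list(srt.parse(fh.read()))
    ok = generate_keyframes_engaging(video, subs)
    sys.exit(0 if ok else 1)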