cacauavatar

Paused

App Files Files Community

Spanicin committed on Feb 28

Commit

6dfeac9

verified ·

1 Parent(s): 527c539

Update app.py

Browse files

Files changed (1) hide show

app.py +159 -159

app.py CHANGED Viewed

@@ -1,160 +1,160 @@
-import argparse
-import tempfile
-import os
-from flask import Flask, request, jsonify
-from omegaconf import OmegaConf
-import torch
-from diffusers import AutoencoderKL, DDIMScheduler
-from latentsync.models.unet import UNet3DConditionModel
-from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
-from diffusers.utils.import_utils import is_xformers_available
-from accelerate.utils import set_seed
-from latentsync.whisper.audio2feature import Audio2Feature
-from openai import OpenAI
-from elevenlabs import set_api_key, generate, play, clone, Voice, VoiceSettings
-# Initialize the Flask app
-app = Flask(__name__)
-TEMP_DIR = None
-def run_inference(video_path, audio_path, video_out_path,
-                  inference_ckpt_path, unet_config_path="configs/unet/second_stage.yaml",
-                  inference_steps=20, guidance_scale=1.0, seed=1247):
-    # Load configuration
-    config = OmegaConf.load(unet_config_path)
-    # Determine proper dtype based on GPU capabilities
-    is_fp16_supported = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] > 7
-    dtype = torch.float16 if is_fp16_supported else torch.float32
-    # Setup scheduler
-    scheduler = DDIMScheduler.from_pretrained("configs")
-    # Choose whisper model based on config settings
-    if config.model.cross_attention_dim == 768:
-        whisper_model_path = "checkpoints/whisper/small.pt"
-    elif config.model.cross_attention_dim == 384:
-        whisper_model_path = "checkpoints/whisper/tiny.pt"
-    else:
-        raise NotImplementedError("cross_attention_dim must be 768 or 384")
-    # Initialize the audio encoder
-    audio_encoder = Audio2Feature(model_path=whisper_model_path,
-                                  device="cuda", num_frames=config.data.num_frames)
-    # Load VAE
-    vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=dtype)
-    vae.config.scaling_factor = 0.18215
-    vae.config.shift_factor = 0
-    # Load UNet model from the checkpoint
-    unet, _ = UNet3DConditionModel.from_pretrained(
-        OmegaConf.to_container(config.model),
-        inference_ckpt_path,  # load checkpoint
-        device="cpu",
-    )
-    unet = unet.to(dtype=dtype)
-    # Optionally enable memory-efficient attention if available
-    if is_xformers_available():
-        unet.enable_xformers_memory_efficient_attention()
-    # Initialize the pipeline and move to GPU
-    pipeline = LipsyncPipeline(
-        vae=vae,
-        audio_encoder=audio_encoder,
-        unet=unet,
-        scheduler=scheduler,
-    ).to("cuda")
-    # Set seed
-    if seed != -1:
-        set_seed(seed)
-    else:
-        torch.seed()
-    # Run the pipeline
-    pipeline(
-        video_path=video_path,
-        audio_path=audio_path,
-        video_out_path=video_out_path,
-        video_mask_path=video_out_path.replace(".mp4", "_mask.mp4"),
-        num_frames=config.data.num_frames,
-        num_inference_steps=inference_steps,
-        guidance_scale=guidance_scale,
-        weight_dtype=dtype,
-        width=config.data.resolution,
-        height=config.data.resolution,
-    )
-def create_temp_dir():
-    return tempfile.TemporaryDirectory()
-def generate_audio(voice_cloning, text_prompt):
-    if voice_cloning == 'yes':
-        set_api_key('92e149985ea2732b4359c74346c3daee')
-        voice = Voice(voice_id="VJpttplXHolgV2leGe5V",name="Marc",settings=VoiceSettings(
-                        stability=0.71, similarity_boost=0.9, style=0.0, use_speaker_boost=True),)
-        audio = generate(text = text_prompt, voice = voice, model = "eleven_multilingual_v2",stream=True, latency=4)
-        with tempfile.NamedTemporaryFile(suffix=".mp3", prefix="cloned_audio_",dir=TEMP_DIR.name, delete=False) as temp_file:
-            for chunk in audio:
-                temp_file.write(chunk)
-            driven_audio_path = temp_file.name
-            print('driven_audio_path',driven_audio_path)
-    return driven_audio_path
-@app.route('/run', methods=['POST'])
-def generate_video():
-    global TEMP_DIR
-    TEMP_DIR = create_temp_dir()
-    if 'video' not in request.files:
-        return jsonify({'error': 'Video file is required.'}), 400
-    video_file = request.files['video']
-    text_prompt = request.form['text_prompt']
-    print('Input text prompt: ',text_prompt)
-    text_prompt = text_prompt.strip()
-    if not text_prompt:
-        return jsonify({'error': 'Input text prompt cannot be blank'}), 400
-    voice_cloning = 'yes'
-    temp_audio_path = generate_audio(voice_cloning, text_prompt)
-    with tempfile.NamedTemporaryFile(suffix=".mp4", prefix="input_",dir=TEMP_DIR.name, delete=False) as temp_file:
-        temp_video_path = temp_file.name
-        video_file.save(temp_video_path)
-        print('temp_video_path',temp_video_path)
-    output_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
-    # You can pass additional parameters via form data if needed (e.g., checkpoint path)
-    inference_ckpt_path = request.form.get('inference_ckpt_path', 'checkpoints/latentsync_unet.pt')
-    unet_config_path = request.form.get('unet_config_path', 'configs/unet/second_stage.yaml')
-    try:
-        run_inference(
-            video_path=temp_video_path.name,
-            audio_path=temp_audio_path.name,
-            video_out_path=output_video,
-            inference_ckpt_path=inference_ckpt_path,
-            unet_config_path=unet_config_path,
-            inference_steps=int(request.form.get('inference_steps', 20)),
-            guidance_scale=float(request.form.get('guidance_scale', 1.0)),
-            seed=int(request.form.get('seed', 1247))
-        )
-        # Return the output video path or further process the file for download
-        return jsonify({'output_video': output_video}), 200
-    except Exception as e:
-        return jsonify({'error': str(e)}), 500
-@app.route("/health", methods=["GET"])
-def health_status():
-    response = {"online": "true"}
-    return jsonify(response)
-if __name__ == '__main__':
     app.run(debug=True)

+import argparse
+import tempfile
+import os
+from flask import Flask, request, jsonify
+from omegaconf import OmegaConf
+import torch
+from diffusers import AutoencoderKL, DDIMScheduler
+from latentsync.models.unet import UNet3DConditionModel
+from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
+from diffusers.utils.import_utils import is_xformers_available
+from accelerate.utils import set_seed
+from latentsync.whisper.audio2feature import Audio2Feature
+from openai import OpenAI
+from elevenlabs import set_api_key, generate, play, clone, Voice, VoiceSettings
+# Flask application object; the @app.route handlers in this file register on it.
+app = Flask(__name__)
+TEMP_DIR = None  # rebound by generate_video() to a tempfile.TemporaryDirectory; generate_audio() reads TEMP_DIR.name
+def run_inference(video_path, audio_path, video_out_path,
+                  inference_ckpt_path, unet_config_path="configs/unet/second_stage.yaml",
+                  inference_steps=20, guidance_scale=1.0, seed=1247):
+    # Load configuration
+    config = OmegaConf.load(unet_config_path)
+    # Determine proper dtype based on GPU capabilities
+    is_fp16_supported = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] > 7
+    dtype = torch.float16 if is_fp16_supported else torch.float32
+    # Setup scheduler
+    scheduler = DDIMScheduler.from_pretrained("configs")
+    # Choose whisper model based on config settings
+    if config.model.cross_attention_dim == 768:
+        whisper_model_path = "checkpoints/whisper/small.pt"
+    elif config.model.cross_attention_dim == 384:
+        whisper_model_path = "checkpoints/whisper/tiny.pt"
+    else:
+        raise NotImplementedError("cross_attention_dim must be 768 or 384")
+    # Initialize the audio encoder
+    audio_encoder = Audio2Feature(model_path=whisper_model_path,
+                                  device="cuda", num_frames=config.data.num_frames)
+    # Load VAE
+    vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=dtype)
+    vae.config.scaling_factor = 0.18215
+    vae.config.shift_factor = 0
+    # Load UNet model from the checkpoint
+    unet, _ = UNet3DConditionModel.from_pretrained(
+        OmegaConf.to_container(config.model),
+        inference_ckpt_path,  # load checkpoint
+        device="cpu",
+    )
+    unet = unet.to(dtype=dtype)
+    # Optionally enable memory-efficient attention if available
+    if is_xformers_available():
+        unet.enable_xformers_memory_efficient_attention()
+    # Initialize the pipeline and move to GPU
+    pipeline = LipsyncPipeline(
+        vae=vae,
+        audio_encoder=audio_encoder,
+        unet=unet,
+        scheduler=scheduler,
+    ).to("cuda")
+    # Set seed
+    if seed != -1:
+        set_seed(seed)
+    else:
+        torch.seed()
+    # Run the pipeline
+    pipeline(
+        video_path=video_path,
+        audio_path=audio_path,
+        video_out_path=video_out_path,
+        video_mask_path=video_out_path.replace(".mp4", "_mask.mp4"),
+        num_frames=config.data.num_frames,
+        num_inference_steps=inference_steps,
+        guidance_scale=guidance_scale,
+        weight_dtype=dtype,
+        width=config.data.resolution,
+        height=config.data.resolution,
+    )
+def create_temp_dir():
+    return tempfile.TemporaryDirectory()
+def generate_audio(voice_cloning, text_prompt):
+    if voice_cloning == 'yes':
+        set_api_key('92e149985ea2732b4359c74346c3daee')
+        voice = Voice(voice_id="VJpttplXHolgV2leGe5V",name="Marc",settings=VoiceSettings(
+                        stability=0.71, similarity_boost=0.9, style=0.0, use_speaker_boost=True),)
+        audio = generate(text = text_prompt, voice = voice, model = "eleven_multilingual_v2",stream=True, latency=4)
+        with tempfile.NamedTemporaryFile(suffix=".mp3", prefix="cloned_audio_",dir=TEMP_DIR.name, delete=False) as temp_file:
+            for chunk in audio:
+                temp_file.write(chunk)
+            driven_audio_path = temp_file.name
+            print('driven_audio_path',driven_audio_path)
+    return driven_audio_path
+@app.route('/run', methods=['POST'])
+def generate_video():
+    global TEMP_DIR
+    TEMP_DIR = create_temp_dir()
+    if 'video' not in request.files:
+        return jsonify({'error': 'Video file is required.'}), 400
+    video_file = request.files['video']
+    text_prompt = request.form['text_prompt']
+    print('Input text prompt: ',text_prompt)
+    text_prompt = text_prompt.strip()
+    if not text_prompt:
+        return jsonify({'error': 'Input text prompt cannot be blank'}), 400
+    voice_cloning = 'yes'
+    temp_audio_path = generate_audio(voice_cloning, text_prompt)
+    with tempfile.NamedTemporaryFile(suffix=".mp4", prefix="input_",dir=TEMP_DIR.name, delete=False) as temp_file:
+        temp_video_path = temp_file.name
+        video_file.save(temp_video_path)
+        print('temp_video_path',temp_video_path)
+    output_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
+    # You can pass additional parameters via form data if needed (e.g., checkpoint path)
+    inference_ckpt_path = request.form.get('inference_ckpt_path', 'checkpoints/latentsync_unet.pt')
+    unet_config_path = request.form.get('unet_config_path', 'configs/unet/second_stage.yaml')
+    try:
+        run_inference(
+            video_path=temp_video_path,
+            audio_path=temp_audio_path,
+            video_out_path=output_video,
+            inference_ckpt_path=inference_ckpt_path,
+            unet_config_path=unet_config_path,
+            inference_steps=int(request.form.get('inference_steps', 20)),
+            guidance_scale=float(request.form.get('guidance_scale', 1.0)),
+            seed=int(request.form.get('seed', 1247))
+        )
+        # Return the output video path or further process the file for download
+        return jsonify({'output_video': output_video}), 200
+    except Exception as e:
+        return jsonify({'error': str(e)}), 500
+@app.route("/health", methods=["GET"])
+def health_status():
+    response = {"online": "true"}
+    return jsonify(response)
+if __name__ == '__main__':
     app.run(debug=True)  # NOTE(review): debug=True enables Flask's debug mode; confirm this entry point is never used for a public deployment