File size: 2,830 Bytes
9ea6dac
f119108
 
 
3578727
f119108
9ea6dac
7115f26
af1f75b
f119108
 
 
 
 
5042ce1
f119108
3578727
7115f26
f119108
 
45de38b
3578727
 
45de38b
 
f119108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358fb0c
7115f26
f119108
9ea6dac
f119108
9ea6dac
7115f26
 
f119108
 
 
 
 
 
7115f26
f119108
 
 
9ea6dac
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import gradio as gr
import torch
import torchaudio
import tempfile
import os
import json

def generate_audio(prompt, duration=10):
    """Generate audio from a text prompt using Stable Audio Open 1.0.

    Args:
        prompt: Text description of the desired sound.
        duration: Clip length in seconds (default 10; the UI caps it at 47,
            the model's maximum window — TODO confirm against model card).

    Returns:
        Path to a temporary WAV file containing the generated audio, or an
        "Error: ..." string on failure (preserved from the original
        interface — the Gradio output component receives this string).
    """
    try:
        # Import required modules (kept lazy so the app can start without them)
        from stable_audio_tools.inference.generation import generate_diffusion_cond
        from stable_audio_tools.models.utils import load_ckpt_state_dict
        from stable_audio_tools.models.factory import create_model_from_config
        from huggingface_hub import hf_hub_download

        # Build the model once and reuse it across calls -- the original
        # re-downloaded the checkpoint and re-created the model per request.
        if not hasattr(generate_audio, "_cache"):
            token = os.getenv("HF_TOKEN")

            model_config_path = hf_hub_download(
                repo_id="stabilityai/stable-audio-open-1.0",
                filename="model_config.json",
                token=token
            )
            model_ckpt_path = hf_hub_download(
                repo_id="stabilityai/stable-audio-open-1.0",
                filename="model.safetensors",
                token=token
            )

            with open(model_config_path) as f:
                model_config = json.load(f)

            model = create_model_from_config(model_config)
            model.load_state_dict(load_ckpt_state_dict(model_ckpt_path))

            device = "cuda" if torch.cuda.is_available() else "cpu"
            model = model.to(device)
            model.eval()

            generate_audio._cache = (model, model_config, device)

        model, model_config, device = generate_audio._cache

        # Take the rate/size from the model's own config instead of
        # hard-coding 44100; a KeyError here falls through to the error path.
        sample_rate = model_config["sample_rate"]
        sample_size = model_config["sample_size"]

        conditioning = [{
            "prompt": prompt,
            "seconds_start": 0,
            "seconds_total": duration
        }]

        # Inference only -- no autograd graph needed.
        with torch.no_grad():
            output = generate_diffusion_cond(
                model,
                steps=100,
                cfg_scale=7,
                conditioning=conditioning,
                sample_rate=sample_rate,
                sample_size=sample_size,
                sigma_min=0.3,
                sigma_max=500,
                sampler_type="dpmpp-3m-sde",
                device=device
            )

        # First batch item; the original did a needless tensor -> numpy ->
        # tensor round trip before saving.
        audio = output[0].to(torch.float32).cpu()

        # Peak-normalize to [-1, 1] so the WAV encoder does not clip.
        peak = audio.abs().max()
        if peak > 0:
            audio = audio / peak

        # Close the handle before torchaudio writes to the same path; the
        # original left one open file descriptor per call.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        temp_file.close()
        torchaudio.save(temp_file.name, audio, sample_rate)

        return temp_file.name

    except Exception as e:
        # Preserved contract: callers expect a string, not a raised exception.
        return f"Error: {str(e)}"

# Wire up the Gradio UI: prompt + duration slider in, playable audio out.
prompt_input = gr.Textbox(
    label="🎡 Audio Prompt",
    placeholder="heavy boots thudding on wet sand",
    value="heavy boots thudding on wet sand",
)
duration_input = gr.Slider(5, 47, 10, step=1, label="⏱️ Duration (seconds)")
audio_output = gr.Audio(label="πŸ”Š Generated Audio")  # This will play audio!

demo = gr.Interface(
    fn=generate_audio,
    inputs=[prompt_input, duration_input],
    outputs=audio_output,
    title="🎡 Stable Audio Generator - WORKING!",
    description="Generate real audio from text descriptions",
)

demo.launch()