File size: 2,830 Bytes
9ea6dac
f119108
 
 
3578727
f119108
9ea6dac
7115f26
af1f75b
f119108
 
 
 
 
5042ce1
f119108
3578727
7115f26
f119108
 
45de38b
3578727
 
45de38b
 
f119108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358fb0c
7115f26
f119108
9ea6dac
f119108
9ea6dac
7115f26
 
f119108
 
 
 
 
 
7115f26
f119108
 
 
9ea6dac
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import gradio as gr
import torch
import torchaudio
import tempfile
import os
import json

def generate_audio(prompt, duration=10):
    """Generate audio from a text prompt using Stable Audio Open 1.0.

    Args:
        prompt: Text description of the desired sound.
        duration: Clip length in seconds (default 10; the UI caps it at 47,
            the model's maximum window — TODO confirm against model card).

    Returns:
        Path to a temporary WAV file containing the generated audio, or an
        "Error: ..." string on failure (preserved from the original
        interface — the Gradio output component receives this string).
    """
    try:
        # Import required modules (kept lazy so the app can start without them)
        from stable_audio_tools.inference.generation import generate_diffusion_cond
        from stable_audio_tools.models.utils import load_ckpt_state_dict
        from stable_audio_tools.models.factory import create_model_from_config
        from huggingface_hub import hf_hub_download

        # Build the model once and reuse it across calls -- the original
        # re-downloaded the checkpoint and re-created the model per request.
        if not hasattr(generate_audio, "_cache"):
            token = os.getenv("HF_TOKEN")

            model_config_path = hf_hub_download(
                repo_id="stabilityai/stable-audio-open-1.0",
                filename="model_config.json",
                token=token
            )
            model_ckpt_path = hf_hub_download(
                repo_id="stabilityai/stable-audio-open-1.0",
                filename="model.safetensors",
                token=token
            )

            with open(model_config_path) as f:
                model_config = json.load(f)

            model = create_model_from_config(model_config)
            model.load_state_dict(load_ckpt_state_dict(model_ckpt_path))

            device = "cuda" if torch.cuda.is_available() else "cpu"
            model = model.to(device)
            model.eval()

            generate_audio._cache = (model, model_config, device)

        model, model_config, device = generate_audio._cache

        # Take the rate/size from the model's own config instead of
        # hard-coding 44100; a KeyError here falls through to the error path.
        sample_rate = model_config["sample_rate"]
        sample_size = model_config["sample_size"]

        conditioning = [{
            "prompt": prompt,
            "seconds_start": 0,
            "seconds_total": duration
        }]

        # Inference only -- no autograd graph needed.
        with torch.no_grad():
            output = generate_diffusion_cond(
                model,
                steps=100,
                cfg_scale=7,
                conditioning=conditioning,
                sample_rate=sample_rate,
                sample_size=sample_size,
                sigma_min=0.3,
                sigma_max=500,
                sampler_type="dpmpp-3m-sde",
                device=device
            )

        # First batch item; the original did a needless tensor -> numpy ->
        # tensor round trip before saving.
        audio = output[0].to(torch.float32).cpu()

        # Peak-normalize to [-1, 1] so the WAV encoder does not clip.
        peak = audio.abs().max()
        if peak > 0:
            audio = audio / peak

        # Close the handle before torchaudio writes to the same path; the
        # original left one open file descriptor per call.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        temp_file.close()
        torchaudio.save(temp_file.name, audio, sample_rate)

        return temp_file.name

    except Exception as e:
        # Preserved contract: callers expect a string, not a raised exception.
        return f"Error: {str(e)}"

# Wire up the Gradio UI: prompt + duration slider in, playable audio out.
prompt_input = gr.Textbox(
    label="🎡 Audio Prompt",
    placeholder="heavy boots thudding on wet sand",
    value="heavy boots thudding on wet sand",
)
duration_input = gr.Slider(5, 47, 10, step=1, label="⏱️ Duration (seconds)")
audio_output = gr.Audio(label="πŸ”Š Generated Audio")  # This will play audio!

demo = gr.Interface(
    fn=generate_audio,
    inputs=[prompt_input, duration_input],
    outputs=audio_output,
    title="🎡 Stable Audio Generator - WORKING!",
    description="Generate real audio from text descriptions",
)

demo.launch()