Spaces:
Running
Running
885
#7
by
hualing0222
- opened
- README.md +1 -1
- app.py +8 -28
- requirements.txt +1 -1
README.md
CHANGED
|
@@ -4,7 +4,7 @@ emoji: π
|
|
| 4 |
colorFrom: green
|
| 5 |
colorTo: pink
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 5.
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
short_description: Generates audio environment from an image
|
|
|
|
| 4 |
colorFrom: green
|
| 5 |
colorTo: pink
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 5.0.1
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
short_description: Generates audio environment from an image
|
app.py
CHANGED
|
@@ -26,7 +26,6 @@ def extract_audio(video_in):
|
|
| 26 |
return 'audio.wav'
|
| 27 |
|
| 28 |
def get_caption_from_kosmos(image_in):
|
| 29 |
-
gr.Info("Generating image caption with Kosmos2...")
|
| 30 |
kosmos2_client = Client("fffiloni/Kosmos-2-API", hf_token=hf_token)
|
| 31 |
kosmos2_result = kosmos2_client.predict(
|
| 32 |
image_input=handle_file(image_in),
|
|
@@ -87,7 +86,6 @@ def get_magnet(prompt):
|
|
| 87 |
raise gr.Error("MAGNet space API is not ready, please try again in few minutes ")
|
| 88 |
|
| 89 |
def get_audioldm(prompt):
|
| 90 |
-
gr.Info("Now calling AudioLDM2 for SFX ...")
|
| 91 |
try:
|
| 92 |
client = Client("fffiloni/audioldm2-text2audio-text2music-API", hf_token=hf_token)
|
| 93 |
seed = random.randint(0, MAX_SEED)
|
|
@@ -107,21 +105,18 @@ def get_audioldm(prompt):
|
|
| 107 |
raise gr.Error("AudioLDM space API is not ready, please try again in few minutes ")
|
| 108 |
|
| 109 |
def get_audiogen(prompt):
|
| 110 |
-
gr.Info("Now calling AudioGen for SFX ...")
|
| 111 |
try:
|
| 112 |
-
client = Client("fffiloni/
|
| 113 |
result = client.predict(
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
)
|
| 118 |
-
print(result)
|
| 119 |
return result
|
| 120 |
except:
|
| 121 |
raise gr.Error("AudioGen space API is not ready, please try again in few minutes ")
|
| 122 |
|
| 123 |
def get_tango(prompt):
|
| 124 |
-
gr.Info("Now calling AudioGen for SFX ...")
|
| 125 |
try:
|
| 126 |
client = Client("fffiloni/tango", hf_token=hf_token)
|
| 127 |
result = client.predict(
|
|
@@ -155,7 +150,6 @@ def get_tango2(prompt):
|
|
| 155 |
|
| 156 |
|
| 157 |
def get_stable_audio_open(prompt):
|
| 158 |
-
gr.Info("Now calling Stable-Audio for SFX ...")
|
| 159 |
try:
|
| 160 |
client = Client("fffiloni/Stable-Audio-Open-A10", hf_token=hf_token)
|
| 161 |
result = client.predict(
|
|
@@ -190,20 +184,6 @@ def get_ezaudio(prompt):
|
|
| 190 |
raise gr.Error("EzAudio space API is not ready, please try again in few minutes ")
|
| 191 |
|
| 192 |
def infer(image_in, chosen_model):
|
| 193 |
-
"""
|
| 194 |
-
Generate an audio clip (sound effect) from an input image using the selected generative model.
|
| 195 |
-
|
| 196 |
-
This function first generates a caption from the provided image using a vision-language model.
|
| 197 |
-
The caption is then used as a text prompt for various audio generation models.
|
| 198 |
-
|
| 199 |
-
Args:
|
| 200 |
-
image_in (str): File path to the input image. The image will be processed to generate a descriptive caption.
|
| 201 |
-
chosen_model (str): The name of the audio generation model to use. Supported options include: "AudioLDM-2", "Tango", "Stable Audio Open".
|
| 202 |
-
|
| 203 |
-
Returns:
|
| 204 |
-
str | dict: The path or result object of the generated audio clip, depending on the model used.
|
| 205 |
-
|
| 206 |
-
"""
|
| 207 |
caption = get_caption_from_kosmos(image_in)
|
| 208 |
if chosen_model == "MAGNet" :
|
| 209 |
magnet_result = get_magnet(caption)
|
|
@@ -251,11 +231,11 @@ with gr.Blocks(css=css) as demo:
|
|
| 251 |
chosen_model = gr.Dropdown(label="Choose a model", choices=[
|
| 252 |
#"MAGNet",
|
| 253 |
"AudioLDM-2",
|
| 254 |
-
"AudioGen",
|
| 255 |
"Tango",
|
| 256 |
-
|
| 257 |
"Stable Audio Open",
|
| 258 |
-
|
| 259 |
], value="AudioLDM-2")
|
| 260 |
submit_btn = gr.Button("Submit")
|
| 261 |
with gr.Column():
|
|
@@ -272,4 +252,4 @@ with gr.Blocks(css=css) as demo:
|
|
| 272 |
outputs=[audio_o],
|
| 273 |
)
|
| 274 |
|
| 275 |
-
demo.queue(max_size=10).launch(debug=True, show_error=True
|
|
|
|
| 26 |
return 'audio.wav'
|
| 27 |
|
| 28 |
def get_caption_from_kosmos(image_in):
|
|
|
|
| 29 |
kosmos2_client = Client("fffiloni/Kosmos-2-API", hf_token=hf_token)
|
| 30 |
kosmos2_result = kosmos2_client.predict(
|
| 31 |
image_input=handle_file(image_in),
|
|
|
|
| 86 |
raise gr.Error("MAGNet space API is not ready, please try again in few minutes ")
|
| 87 |
|
| 88 |
def get_audioldm(prompt):
|
|
|
|
| 89 |
try:
|
| 90 |
client = Client("fffiloni/audioldm2-text2audio-text2music-API", hf_token=hf_token)
|
| 91 |
seed = random.randint(0, MAX_SEED)
|
|
|
|
| 105 |
raise gr.Error("AudioLDM space API is not ready, please try again in few minutes ")
|
| 106 |
|
| 107 |
def get_audiogen(prompt):
|
|
|
|
| 108 |
try:
|
| 109 |
+
client = Client("https://fffiloni-audiogen.hf.space/")
|
| 110 |
result = client.predict(
|
| 111 |
+
prompt,
|
| 112 |
+
10,
|
| 113 |
+
api_name="/infer"
|
| 114 |
)
|
|
|
|
| 115 |
return result
|
| 116 |
except:
|
| 117 |
raise gr.Error("AudioGen space API is not ready, please try again in few minutes ")
|
| 118 |
|
| 119 |
def get_tango(prompt):
|
|
|
|
| 120 |
try:
|
| 121 |
client = Client("fffiloni/tango", hf_token=hf_token)
|
| 122 |
result = client.predict(
|
|
|
|
| 150 |
|
| 151 |
|
| 152 |
def get_stable_audio_open(prompt):
|
|
|
|
| 153 |
try:
|
| 154 |
client = Client("fffiloni/Stable-Audio-Open-A10", hf_token=hf_token)
|
| 155 |
result = client.predict(
|
|
|
|
| 184 |
raise gr.Error("EzAudio space API is not ready, please try again in few minutes ")
|
| 185 |
|
| 186 |
def infer(image_in, chosen_model):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
caption = get_caption_from_kosmos(image_in)
|
| 188 |
if chosen_model == "MAGNet" :
|
| 189 |
magnet_result = get_magnet(caption)
|
|
|
|
| 231 |
chosen_model = gr.Dropdown(label="Choose a model", choices=[
|
| 232 |
#"MAGNet",
|
| 233 |
"AudioLDM-2",
|
| 234 |
+
#"AudioGen",
|
| 235 |
"Tango",
|
| 236 |
+
"Tango 2",
|
| 237 |
"Stable Audio Open",
|
| 238 |
+
"EzAudio"
|
| 239 |
], value="AudioLDM-2")
|
| 240 |
submit_btn = gr.Button("Submit")
|
| 241 |
with gr.Column():
|
|
|
|
| 252 |
outputs=[audio_o],
|
| 253 |
)
|
| 254 |
|
| 255 |
+
demo.queue(max_size=10).launch(debug=True, show_error=True)
|
requirements.txt
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
moviepy
|
|
|
|
| 1 |
+
moviepy
|