Fix errors.
- app.py +4 -16
- constants.py +15 -0
- inference_cli.py +8 -1
- requirements.txt +6 -3
- utils/video.py +2 -2
app.py
CHANGED
```diff
@@ -26,17 +26,6 @@ cli.init_model(DEFAULT_MODEL)
 print("Loading speaker model for TTS...")
 cli.init_speaker_model()
 
-print("Loading G2P model for TTS...")
-from montreal_forced_aligner.g2p.generator import PyniniConsoleGenerator
-if not cli.g2p:
-    cli.g2p = PyniniConsoleGenerator(
-        g2p_model_path=cli.model.g2p_model_path,
-        strict_graphemes=False,
-        num_pronunciations=1,
-        include_bracketed=False
-    )
-    cli.g2p.setup()
-
 print("Loading SVS processor for singing voice synthesis...")
 cli.init_svs_processor()
 
@@ -233,7 +222,9 @@ with gr.Blocks(
     title="UniFlow-Audio Inference Demo", theme=gr.themes.Soft()
 ) as demo:
     gr.Markdown("# UniFlow-Audio Inference Demo")
-    gr.Markdown(
+    gr.Markdown(
+        "Multi-task Audio Generation System based on [UniFlow-Audio](https://arxiv.org/abs/2509.24391)"
+    )
 
     with gr.Tabs():
         # Tab 1: Text to Audio
@@ -395,10 +386,6 @@
                 "Hello this is a special sentence with zyloph",
                 "./data/egs/tts_speaker_ref.wav", 5.0, 25
             ],
-            [
-                "The quick brown fox jumps over the lazy dog",
-                "./data/egs/tts_speaker_ref.wav", 5.0, 25
-            ],
         ],
         inputs=[
             tts_transcript, tts_ref_audio, tts_guidance, tts_steps
@@ -646,6 +633,7 @@
     - **Model Name**: Choose from `UniFlow-Audio-large`, `UniFlow-Audio-medium`, or `UniFlow-Audio-small`
     - **Guidance Scale**: Controls the guidance strength of the input condition on the output
    - **Sampling Steps**: Number of flow matching sampling steps
+    - For TTS, due to the restriction of HuggingFace Space, the g2p phonemizer used here is inconsistent with the one used during training, so there may be problems. Please refer to [INFERENCE_CLI.md](https://github.com/wsntxxn/UniFlow-Audio/blob/master/docs/INFERENCE_CLI.md) for CLI usage guidance.
 
     💡 Tip: Models will be automatically downloaded on first run, please be patient
     """
```
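The removed startup block loaded Montreal Forced Aligner's Pynini G2P model, which is difficult to install on a Hugging Face Space; after this change the demo appears to rely on the `g2p_en` fallback inside `inference_cli.py` (next file), which is what the new doc note about phonemizer inconsistency refers to. A minimal sketch of that fallback path, with a hypothetical `phone2id` table standing in for the real `model.tts_phone2id` mapping:

```python
# Sketch of the g2p_en fallback path (the MFA Pynini G2P is removed above).
# The phone table below is illustrative only; the real mapping is
# model.tts_phone2id.
from g2p_en import G2p

g2p = G2p()
phonemes = g2p("Hello this is a special sentence with zyloph")
phonemes = [ph for ph in phonemes if ph != " "]  # drop word-boundary spaces

phone2id = {"HH": 12, "AH0": 3, "spn": 0}  # hypothetical subset
# Unknown phones fall back to the spoken-noise token "spn", as in the CLI.
phone_indices = [phone2id.get(p, phone2id.get("spn", 0)) for p in phonemes]
```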
constants.py
ADDED
```diff
@@ -0,0 +1,15 @@
+TIME_ALIGNED_TASKS = [
+    "text_to_speech",
+    "singing_voice_synthesis",
+    "speech_enhancement",
+    "audio_super_resolution",
+    "video_to_audio",
+]
+NON_TIME_ALIGNED_TASKS = [
+    "text_to_audio",
+    "text_to_music",
+]
+SAME_LENGTH_TASKS = [
+    "speech_enhancement",
+    "audio_super_resolution",
+]
```
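The new module groups tasks by how the output length relates to the input condition. A hedged sketch of how these lists might be consumed (the function name and return values are illustrative, not part of the repo); note that `SAME_LENGTH_TASKS` is a subset of `TIME_ALIGNED_TASKS`, so it must be checked first:

```python
# Hypothetical consumer of the task groups defined in constants.py.
from constants import (
    TIME_ALIGNED_TASKS,
    NON_TIME_ALIGNED_TASKS,
    SAME_LENGTH_TASKS,
)

def output_length_policy(task: str) -> str:
    if task in SAME_LENGTH_TASKS:        # enhancement, super-resolution
        return "same_as_input"
    if task in TIME_ALIGNED_TASKS:       # e.g. TTS, SVS, video-to-audio
        return "derived_from_condition"
    if task in NON_TIME_ALIGNED_TASKS:   # text-to-audio, text-to-music
        return "unconstrained"
    raise ValueError(f"unknown task: {task}")
```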
inference_cli.py
CHANGED
```diff
@@ -2,6 +2,7 @@
 
 from typing import Any, Callable
 import json
+import os
 
 import fire
 import torch
@@ -149,10 +150,16 @@ class InferenceCLI:
         self.init_speaker_model()
 
         if not self.g2p:
-
+            if not os.path.exists(
+                os.path.expanduser(
+                    "~/nltk_data/taggers/averaged_perceptron_tagger_eng"
+                )
+            ):
+                nltk.download("averaged_perceptron_tagger_eng")
             self.g2p = G2p()
 
         phonemes = self.g2p(transcript)
+        phonemes = [ph for ph in phonemes if ph != " "]
         phone_indices = [
             self.model.tts_phone2id.get(
                 p, self.model.tts_phone2id.get("spn", 0)
```
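`g2p_en` needs NLTK's English POS tagger for heteronym disambiguation; recent NLTK releases ship it as the `averaged_perceptron_tagger_eng` resource, which a fresh Space container will not have, so the added guard downloads it on demand (this assumes `nltk` is already imported elsewhere in the file, which the hunk does not show). The second hunk also drops the `" "` word-separator tokens that `G2p` emits, since they presumably have no entry in `tts_phone2id` and would otherwise all map to the `spn` fallback. A standalone version of the same guard:

```python
# Standalone version of the on-demand tagger download added above.
import os
import nltk

TAGGER_DIR = os.path.expanduser(
    "~/nltk_data/taggers/averaged_perceptron_tagger_eng"
)
if not os.path.exists(TAGGER_DIR):
    nltk.download("averaged_perceptron_tagger_eng")
```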
requirements.txt
CHANGED
```diff
@@ -1,4 +1,6 @@
-torch
+torch<=2.8.0
+torchaudio<=2.8.0
+torchvision<=0.23.0
 torchdata
 diffusers
 hydra-core
@@ -10,7 +12,6 @@ einops
 transformers
 alias_free_torch
 h5py
-torchaudio
 soundfile
 tensorboard
 swanlab
@@ -19,4 +20,6 @@ sentencepiece
 librosa
 pypinyin
 g2p_en
-git+https://github.com/wenet-e2e/wespeaker.git
+git+https://github.com/wenet-e2e/wespeaker.git
+moviepy
+av
```
utils/video.py
CHANGED
```diff
@@ -6,8 +6,8 @@ import tempfile
 import numpy as np
 import soundfile as sf
 from moviepy import VideoFileClip, AudioFileClip
-from moviepy.audio.AudioClip import AudioArrayClip
-from moviepy.audio.fx import AudioLoop
+# from moviepy.audio.AudioClip import AudioArrayClip
+# from moviepy.audio.fx import AudioLoop
 import torch
 import torchvision
 
```
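The commented-out submodule imports (`AudioArrayClip`, `AudioLoop`) live at paths that have shifted between moviepy releases; the surviving top-level `VideoFileClip`/`AudioFileClip` imports are enough to mux generated audio onto a video. A minimal sketch under moviepy 2.x, with placeholder file paths:

```python
# Minimal moviepy 2.x muxing sketch; paths are placeholders, and write_videofile
# relies on the newly added moviepy/av dependencies in requirements.txt.
from moviepy import VideoFileClip, AudioFileClip

video = VideoFileClip("input.mp4")        # silent source video
audio = AudioFileClip("generated.wav")    # synthesized soundtrack
video.with_audio(audio).write_videofile("output.mp4")
```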