Veena committed on
Commit 30a893c · 1 Parent(s): 878b136

Update Maya1 Gradio app with preset characters

Files changed (2)
  1. app.py +248 -0
  2. requirements.txt +10 -0
app.py ADDED
@@ -0,0 +1,248 @@
+ import gradio as gr
+ import asyncio
+ import sys
+ sys.path.insert(0, '.')
+
+ # Mock the `spaces` module so the app also runs outside Hugging Face Spaces
+ try:
+     import spaces
+ except ImportError:
+     class SpacesMock:
+         @staticmethod
+         def GPU(func):
+             return func
+     spaces = SpacesMock()
+
+ from maya1.model_loader import Maya1Model
+ from maya1.pipeline import Maya1Pipeline
+ from maya1.prompt_builder import Maya1PromptBuilder
+ from maya1.snac_decoder import SNACDecoder
+ from maya1.constants import AUDIO_SAMPLE_RATE
+
+ # Preset characters (2 realistic + 2 creative)
+ PRESET_CHARACTERS = {
+     "Realistic: Sarcastic Male (American)": {
+         "description": "Realistic male voice in the 30s age with a american accent. Low pitch, nasally timbre, conversational pacing, sarcastic tone delivery at low intensity, commercial domain, product_demo_voice role, formal delivery",
+         "example_text": "<sarcastic> He really stood up there and said we need to <chuckle> save the world. <sigh> What a joke."
+     },
+     "Realistic: Excited Female (Asian-American)": {
+         "description": "Realistic female voice in the 20s age with a asian_american accent. Normal pitch, smooth timbre, conversational pacing, neutral tone delivery at high intensity, viral_content domain, meme_voice role, formal delivery",
+         "example_text": "<excited> I am issuing a formal commendation for this particular item! It has exceeded all established metrics for excellence. <gasp> This is something I would actually spend my own money on. <laugh> Seriously!"
+     },
+     "Creative: Alpha Leader (Indian)": {
+         "description": "Creative, alpha character. Male voice in their 30s with a indian accent. Normal pitch, nasally timbre, very_fast pacing, energetic tone at medium intensity.",
+         "example_text": "<angry> I don't want to hear excuses, I only want to see solutions! <sigh> Get your teams together, brainstorm for thirty minutes, and come back to me with a plan. <excited> Now move!"
+     },
+     "Creative: Vampire (Middle Eastern)": {
+         "description": "Creative, vampire character. Male voice in their 40s with a middle_eastern accent. Low pitch, nasally timbre, very_slow pacing, excited tone at medium intensity.",
+         "example_text": "<whisper> Soon you will join me in this magnificent eternal darkness. <laugh> And we shall feast upon the world together, <excited> bound by this exquisite night forever. <mischievous>"
+     }
+ }
+
+ # Global pipeline objects (loaded lazily on first request)
+ model = None
+ prompt_builder = None
+ snac_decoder = None
+ pipeline = None
+
+ @spaces.GPU
+ async def load_models():
+     """Load the Maya1 vLLM model and pipeline (runs once)."""
+     global model, prompt_builder, snac_decoder, pipeline
+
+     if model is None:
+         print("Loading Maya1 model with vLLM...")
+         model = Maya1Model(
+             model_path="maya-research/maya1",
+             dtype="bfloat16",
+             max_model_len=8192,
+             gpu_memory_utilization=0.85,
+         )
+
+         print("Initializing prompt builder...")
+         prompt_builder = Maya1PromptBuilder(model.tokenizer, model)
+
+         print("Loading SNAC decoder...")
+         snac_decoder = SNACDecoder(
+             device="cuda",
+             enable_batching=False,
+         )
+
+         print("Initializing pipeline...")
+         pipeline = Maya1Pipeline(model, prompt_builder, snac_decoder)
+
+         print("Models loaded successfully!")
+
+ def preset_selected(preset_name):
+     """Update description and text when a preset is selected."""
+     if preset_name in PRESET_CHARACTERS:
+         char = PRESET_CHARACTERS[preset_name]
+         return char["description"], char["example_text"]
+     return "", ""
+
+ @spaces.GPU
+ def generate_speech(preset_name, description, text, temperature, max_tokens):
+     """Generate emotional speech from a voice description and text using vLLM."""
+     try:
+         # Load models if not already loaded
+         asyncio.run(load_models())
+
+         # If a preset is selected, use its description
+         if preset_name and preset_name in PRESET_CHARACTERS:
+             description = PRESET_CHARACTERS[preset_name]["description"]
+
+         # Validate inputs
+         if not description or not text:
+             return None, "Error: Please provide both description and text!"
+
+         print(f"Generating with temperature={temperature}, max_tokens={max_tokens}...")
+
+         # Generate raw 16-bit PCM audio using the vLLM pipeline
+         audio_bytes = asyncio.run(
+             pipeline.generate_speech(
+                 description=description,
+                 text=text,
+                 temperature=temperature,
+                 top_p=0.9,
+                 max_tokens=max_tokens,
+                 repetition_penalty=1.1,
+                 seed=None,
+             )
+         )
+
+         if audio_bytes is None:
+             return None, "Error: Audio generation failed. Try different text or increase max_tokens."
+
+         # Write the PCM bytes to a temporary WAV file so the
+         # gr.Audio(type="filepath") output can serve it
+         import wave
+         import tempfile
+         wav_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
+         with wave.open(wav_path, 'wb') as wav_file:
+             wav_file.setnchannels(1)
+             wav_file.setsampwidth(2)
+             wav_file.setframerate(AUDIO_SAMPLE_RATE)
+             wav_file.writeframes(audio_bytes)
+
+         # Duration of 16-bit mono PCM audio (2 bytes per sample)
+         duration = (len(audio_bytes) // 2) / AUDIO_SAMPLE_RATE
+
+         status_msg = f"Generated {duration:.2f}s of emotional speech!"
+
+         return wav_path, status_msg
+
+     except Exception as e:
+         import traceback
+         error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
+         print(error_msg)
+         return None, error_msg
+
+ # Create the Gradio interface
+ with gr.Blocks(title="Maya1 - Open Source Emotional TTS", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("""
+     # Maya1 - Open Source Emotional Text-to-Speech
+
+     **The best open source voice AI model with emotions!**
+
+     Generate realistic and expressive speech with natural language voice design.
+     Choose a preset character or create your own custom voice.
+
+     [Model](https://huggingface.co/maya-research/maya1) | [GitHub](https://github.com/MayaResearch/maya1-fastapi)
+     """)
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             gr.Markdown("### Character Selection")
+
+             preset_dropdown = gr.Dropdown(
+                 choices=list(PRESET_CHARACTERS.keys()),
+                 label="Preset Characters",
+                 value=list(PRESET_CHARACTERS.keys())[0],
+                 info="Quick pick from 4 preset characters"
+             )
+
+             gr.Markdown("### Voice Design")
+
+             description_input = gr.Textbox(
+                 label="Voice Description",
+                 placeholder="E.g., Male voice in their 30s with american accent. Normal pitch, warm timbre...",
+                 lines=3,
+                 value=PRESET_CHARACTERS[list(PRESET_CHARACTERS.keys())[0]]["description"]
+             )
+
+             text_input = gr.Textbox(
+                 label="Text to Speak",
+                 placeholder="Enter text with <emotion> tags like <laugh>, <sigh>, <excited>...",
+                 lines=4,
+                 value=PRESET_CHARACTERS[list(PRESET_CHARACTERS.keys())[0]]["example_text"]
+             )
+
+             with gr.Accordion("Advanced Settings", open=False):
+                 temperature_slider = gr.Slider(
+                     minimum=0.1,
+                     maximum=1.0,
+                     value=0.4,
+                     step=0.1,
+                     label="Temperature",
+                     info="Lower = more stable, Higher = more creative"
+                 )
+
+                 max_tokens_slider = gr.Slider(
+                     minimum=100,
+                     maximum=2048,
+                     value=500,
+                     step=50,
+                     label="Max Tokens",
+                     info="More tokens = longer audio"
+                 )
+
+             generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
+
+         with gr.Column(scale=1):
+             gr.Markdown("### Generated Audio")
+
+             audio_output = gr.Audio(
+                 label="Generated Speech",
+                 type="filepath",
+                 interactive=False
+             )
+
+             status_output = gr.Textbox(
+                 label="Status",
+                 lines=3,
+                 interactive=False
+             )
+
+             gr.Markdown("""
+             ### Supported Emotions
+
+             `<angry>` `<appalled>` `<chuckle>` `<cry>` `<curious>` `<disappointed>`
+             `<excited>` `<exhale>` `<gasp>` `<giggle>` `<gulp>` `<laugh>`
+             `<laugh_harder>` `<mischievous>` `<sarcastic>` `<scream>` `<sigh>`
+             `<sing>` `<snort>` `<whisper>`
+
+             ### Tips
+             - Use emotion tags naturally in your text
+             - Longer text needs more max_tokens
+             - Lower temperature for consistent results
+             - Presets are great starting points!
+             """)
+
+     # Event handlers
+     preset_dropdown.change(
+         fn=preset_selected,
+         inputs=[preset_dropdown],
+         outputs=[description_input, text_input]
+     )
+
+     generate_btn.click(
+         fn=generate_speech,
+         inputs=[preset_dropdown, description_input, text_input, temperature_slider, max_tokens_slider],
+         outputs=[audio_output, status_output]
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
+
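Note: the same pipeline can be driven without the Gradio UI. The minimal sketch below reuses only the classes and the `pipeline.generate_speech()` call shown in `app.py` above; the description and text strings and the output filename are illustrative, while the constructor arguments and sampling parameters mirror the app's defaults.

```python
# Sketch: run Maya1 generation directly, without Gradio.
# Reuses the setup and generate_speech() call from app.py above;
# the description/text strings and output path are illustrative only.
import asyncio
import wave

from maya1.model_loader import Maya1Model
from maya1.prompt_builder import Maya1PromptBuilder
from maya1.snac_decoder import SNACDecoder
from maya1.pipeline import Maya1Pipeline
from maya1.constants import AUDIO_SAMPLE_RATE


async def main():
    # Same construction order as app.py: model -> prompt builder -> SNAC decoder -> pipeline
    model = Maya1Model(model_path="maya-research/maya1", dtype="bfloat16",
                       max_model_len=8192, gpu_memory_utilization=0.85)
    prompt_builder = Maya1PromptBuilder(model.tokenizer, model)
    snac_decoder = SNACDecoder(device="cuda", enable_batching=False)
    pipeline = Maya1Pipeline(model, prompt_builder, snac_decoder)

    audio_bytes = await pipeline.generate_speech(
        description=(
            "Realistic male voice in the 30s age with a american accent. "
            "Low pitch, conversational pacing."
        ),
        text="<excited> It works! <laugh> Emotional speech from plain text.",
        temperature=0.4,        # app default: lower = more stable
        top_p=0.9,
        max_tokens=500,         # longer text needs more tokens
        repetition_penalty=1.1,
        seed=None,
    )

    # As in app.py, the output is 16-bit mono PCM at AUDIO_SAMPLE_RATE
    with wave.open("maya1_sample.wav", "wb") as f:
        f.setnchannels(1)
        f.setsampwidth(2)
        f.setframerate(AUDIO_SAMPLE_RATE)
        f.writeframes(audio_bytes)


if __name__ == "__main__":
    asyncio.run(main())
```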
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ torch>=2.5.0
+ transformers>=4.57.0
+ gradio>=5.0.0
+ vllm>=0.11.0
+ snac>=1.2.1
+ soundfile>=0.13.0
+ numpy>=2.1.0
+ accelerate>=1.10.0
+ xformers>=0.0.32
+
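A quick way to confirm that the stack pinned above is installed and that a CUDA device is visible (the snippet is only an illustrative check, not part of the repository):

```python
# Illustrative sanity check for the packages pinned in requirements.txt.
import torch
import transformers
import gradio
import vllm

print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("transformers:", transformers.__version__)
print("gradio:", gradio.__version__)
print("vllm:", vllm.__version__)
```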