Spaces:

smartdigitalnetworks
/

maya1

Paused

App Files Files

Veena committed on Nov 6

Commit

30a893c

1 Parent(s): 878b136

Update Maya1 Gradio app with preset characters

Browse files

Files changed (2) hide show

app.py +248 -0
requirements.txt +10 -0

app.py ADDED Viewed

	@@ -0,0 +1,248 @@

+import gradio as gr
+import asyncio
+import io
+import sys
+sys.path.insert(0, '.')
+# Mock spaces module for local testing
+try:
+    import spaces
+except ImportError:
+    class SpacesMock:
+        @staticmethod
+        def GPU(func):
+            return func
+    spaces = SpacesMock()
+from maya1.model_loader import Maya1Model
+from maya1.pipeline import Maya1Pipeline
+from maya1.prompt_builder import Maya1PromptBuilder
+from maya1.snac_decoder import SNACDecoder
+from maya1.constants import AUDIO_SAMPLE_RATE
+# Preset characters (2 realistic + 2 creative)
+# Maps the label shown in the preset dropdown to a dict with two keys:
+#   "description"  - voice description text that fills the description box
+#   "example_text" - sample text (with <emotion> tags) that fills the text box
+# Dict order matters: the first entry is used as the UI's initial selection.
+PRESET_CHARACTERS = {
+    "Realistic: Sarcastic Male (American)": {
+        "description": "Realistic male voice in the 30s age with a american accent. Low pitch, nasally timbre, conversational pacing, sarcastic tone delivery at low intensity, commercial domain, product_demo_voice role, formal delivery",
+        "example_text": "<sarcastic> He really stood up there and said we need to <chuckle> save the world. <sigh> What a joke."
+    },
+    "Realistic: Excited Female (Asian-American)": {
+        "description": "Realistic female voice in the 20s age with a asian_american accent. Normal pitch, smooth timbre, conversational pacing, neutral tone delivery at high intensity, viral_content domain, meme_voice role, formal delivery",
+        "example_text": "<excited> I am issuing a formal commendation for this particular item! It has exceeded all established metrics for excellence. <gasp> This is something I would actually spend my own money on. <laugh> Seriously!"
+    },
+    "Creative: Alpha Leader (Indian)": {
+        "description": "Creative, alpha character. Male voice in their 30s with a indian accent. Normal pitch, nasally timbre, very_fast pacing, energetic tone at medium intensity.",
+        "example_text": "<angry> I don't want to hear excuses, I only want to see solutions! <sigh> Get your teams together, brainstorm for thirty minutes, and come back to me with a plan. <excited> Now move!"
+    },
+    "Creative: Vampire (Middle Eastern)": {
+        "description": "Creative, vampire character. Male voice in their 40s with a middle_eastern accent. Low pitch, nasally timbre, very_slow pacing, excited tone at medium intensity.",
+        "example_text": "<whisper> Soon you will join me in this magnificent eternal darkness. <laugh> And we shall feast upon the world together, <excited> bound by this exquisite night forever. <mischievous>"
+    }
+}
+# Global pipeline variables
+model = None
+prompt_builder = None
+snac_decoder = None
+pipeline = None
+@spaces.GPU
+async def load_models():
+    """Load Maya1 vLLM model and pipeline (runs once)."""
+    global model, prompt_builder, snac_decoder, pipeline
+    if model is None:
+        print("Loading Maya1 model with vLLM...")
+        model = Maya1Model(
+            model_path="maya-research/maya1",
+            dtype="bfloat16",
+            max_model_len=8192,
+            gpu_memory_utilization=0.85,
+        )
+        print("Initializing prompt builder...")
+        prompt_builder = Maya1PromptBuilder(model.tokenizer, model)
+        print("Loading SNAC decoder...")
+        snac_decoder = SNACDecoder(
+            device="cuda",
+            enable_batching=False,
+        )
+        print("Initializing pipeline...")
+        pipeline = Maya1Pipeline(model, prompt_builder, snac_decoder)
+        print("Models loaded successfully!")
+def preset_selected(preset_name):
+    """Update description and text when preset is selected."""
+    if preset_name in PRESET_CHARACTERS:
+        char = PRESET_CHARACTERS[preset_name]
+        return char["description"], char["example_text"]
+    return "", ""
+@spaces.GPU
+def generate_speech(preset_name, description, text, temperature, max_tokens):
+    """Generate emotional speech from description and text using vLLM."""
+    try:
+        # Load models if not already loaded
+        asyncio.run(load_models())
+        # If using preset, override description
+        if preset_name and preset_name in PRESET_CHARACTERS:
+            description = PRESET_CHARACTERS[preset_name]["description"]
+        # Validate inputs
+        if not description or not text:
+            return None, "Error: Please provide both description and text!"
+        print(f"Generating with temperature={temperature}, max_tokens={max_tokens}...")
+        # Generate audio using vLLM pipeline
+        audio_bytes = asyncio.run(
+            pipeline.generate_speech(
+                description=description,
+                text=text,
+                temperature=temperature,
+                top_p=0.9,
+                max_tokens=max_tokens,
+                repetition_penalty=1.1,
+                seed=None,
+            )
+        )
+        if audio_bytes is None:
+            return None, "Error: Audio generation failed. Try different text or increase max_tokens."
+        # Convert bytes to WAV file
+        import wave
+        wav_buffer = io.BytesIO()
+        with wave.open(wav_buffer, 'wb') as wav_file:
+            wav_file.setnchannels(1)
+            wav_file.setsampwidth(2)
+            wav_file.setframerate(AUDIO_SAMPLE_RATE)
+            wav_file.writeframes(audio_bytes)
+        wav_buffer.seek(0)
+        # Calculate duration
+        duration = len(audio_bytes) // 2 / AUDIO_SAMPLE_RATE
+        frames = len(audio_bytes) // 2 // (AUDIO_SAMPLE_RATE // 6.86) // 7
+        status_msg = f"Generated {duration:.2f}s of emotional speech!"
+        return wav_buffer, status_msg
+    except Exception as e:
+        import traceback
+        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
+        print(error_msg)
+        return None, error_msg
+# Create Gradio interface
+with gr.Blocks(title="Maya1 - Open Source Emotional TTS", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # Maya1 - Open Source Emotional Text-to-Speech
+    **The best open source voice AI model with emotions!**
+    Generate realistic and expressive speech with natural language voice design.
+    Choose a preset character or create your own custom voice.
+    [Model](https://huggingface.co/maya-research/maya1) | [GitHub](https://github.com/MayaResearch/maya1-fastapi)
+    """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### Character Selection")
+            preset_dropdown = gr.Dropdown(
+                choices=list(PRESET_CHARACTERS.keys()),
+                label="Preset Characters",
+                value=list(PRESET_CHARACTERS.keys())[0],
+                info="Quick pick from 4 preset characters"
+            )
+            gr.Markdown("### Voice Design")
+            description_input = gr.Textbox(
+                label="Voice Description",
+                placeholder="E.g., Male voice in their 30s with american accent. Normal pitch, warm timbre...",
+                lines=3,
+                value=PRESET_CHARACTERS[list(PRESET_CHARACTERS.keys())[0]]["description"]
+            )
+            text_input = gr.Textbox(
+                label="Text to Speak",
+                placeholder="Enter text with <emotion> tags like <laugh>, <sigh>, <excited>...",
+                lines=4,
+                value=PRESET_CHARACTERS[list(PRESET_CHARACTERS.keys())[0]]["example_text"]
+            )
+            with gr.Accordion("Advanced Settings", open=False):
+                temperature_slider = gr.Slider(
+                    minimum=0.1,
+                    maximum=1.0,
+                    value=0.4,
+                    step=0.1,
+                    label="Temperature",
+                    info="Lower = more stable, Higher = more creative"
+                )
+                max_tokens_slider = gr.Slider(
+                    minimum=100,
+                    maximum=2048,
+                    value=500,
+                    step=50,
+                    label="Max Tokens",
+                    info="More tokens = longer audio"
+                )
+            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
+        with gr.Column(scale=1):
+            gr.Markdown("### Generated Audio")
+            audio_output = gr.Audio(
+                label="Generated Speech",
+                type="filepath",
+                interactive=False
+            )
+            status_output = gr.Textbox(
+                label="Status",
+                lines=3,
+                interactive=False
+            )
+            gr.Markdown("""
+            ### Supported Emotions
+            `<angry>` `<appalled>` `<chuckle>` `<cry>` `<curious>` `<disappointed>`
+            `<excited>` `<exhale>` `<gasp>` `<giggle>` `<gulp>` `<laugh>`
+            `<laugh_harder>` `<mischievous>` `<sarcastic>` `<scream>` `<sigh>`
+            `<sing>` `<snort>` `<whisper>`
+            ### Tips
+            - Use emotion tags naturally in your text
+            - Longer text needs more max_tokens
+            - Lower temperature for consistent results
+            - Presets are great starting points!
+            """)
+    # Event handlers
+    preset_dropdown.change(
+        fn=preset_selected,
+        inputs=[preset_dropdown],
+        outputs=[description_input, text_input]
+    )
+    generate_btn.click(
+        fn=generate_speech,
+        inputs=[preset_dropdown, description_input, text_input, temperature_slider, max_tokens_slider],
+        outputs=[audio_output, status_output]
+    )
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+torch>=2.5.0
+transformers>=4.57.0
+gradio>=5.0.0
+vllm>=0.11.0
+snac>=1.2.1
+soundfile>=0.13.0
+numpy>=2.1.0
+accelerate>=1.10.0
+xformers>=0.0.32