"""Gradio demo app for Maya1 emotional text-to-speech running on vLLM.

The module builds a ``gr.Blocks`` UI (exposed as ``demo``) in which a user
picks a preset character or writes a voice description, enters the text to
speak, and receives a generated WAV file plus a status message.
"""

import gradio as gr
import asyncio
import io
import sys
import tempfile
import traceback
import wave

sys.path.insert(0, '.')

# Mock spaces module for local testing
try:
    import spaces
except ImportError:
    class SpacesMock:
        """Stand-in used when the ``spaces`` package cannot be imported."""

        @staticmethod
        def GPU(func):
            """Return ``func`` unchanged (no-op replacement for the decorator)."""
            return func

    spaces = SpacesMock()

from maya1.model_loader import Maya1Model
from maya1.pipeline import Maya1Pipeline
from maya1.prompt_builder import Maya1PromptBuilder
from maya1.snac_decoder import SNACDecoder
from maya1.constants import AUDIO_SAMPLE_RATE

# Preset characters (2 realistic + 2 creative)
# Each entry maps a display name to the voice description and an example text.
PRESET_CHARACTERS = {
    "Realistic: Sarcastic Male (American)": {
        "description": "Realistic male voice in the 30s age with a american accent. Low pitch, nasally timbre, conversational pacing, sarcastic tone delivery at low intensity, commercial domain, product_demo_voice role, formal delivery",
        "example_text": " He really stood up there and said we need to save the world. What a joke.",
    },
    "Realistic: Excited Female (Asian-American)": {
        "description": "Realistic female voice in the 20s age with a asian_american accent. Normal pitch, smooth timbre, conversational pacing, neutral tone delivery at high intensity, viral_content domain, meme_voice role, formal delivery",
        "example_text": " I am issuing a formal commendation for this particular item! It has exceeded all established metrics for excellence. This is something I would actually spend my own money on. Seriously!",
    },
    "Creative: Alpha Leader (Indian)": {
        "description": "Creative, alpha character. Male voice in their 30s with a indian accent. Normal pitch, nasally timbre, very_fast pacing, energetic tone at medium intensity.",
        "example_text": " I don't want to hear excuses, I only want to see solutions! Get your teams together, brainstorm for thirty minutes, and come back to me with a plan. Now move!",
    },
    "Creative: Vampire (Middle Eastern)": {
        "description": "Creative, vampire character. Male voice in their 40s with a middle_eastern accent. Low pitch, nasally timbre, very_slow pacing, excited tone at medium intensity.",
        "example_text": " Soon you will join me in this magnificent eternal darkness. And we shall feast upon the world together, bound by this exquisite night forever. ",
    },
}

# Preset shown when the page first loads (first key of PRESET_CHARACTERS).
DEFAULT_PRESET = next(iter(PRESET_CHARACTERS))

# WAV container parameters used when wrapping the pipeline's raw audio bytes.
_WAV_CHANNELS = 1
_WAV_SAMPLE_WIDTH_BYTES = 2

# Global pipeline variables (populated lazily by load_models)
model = None
prompt_builder = None
snac_decoder = None
pipeline = None


@spaces.GPU
async def load_models():
    """Load Maya1 vLLM model and pipeline (runs once).

    Populates the module-level ``model``, ``prompt_builder``, ``snac_decoder``
    and ``pipeline`` globals. Subsequent calls are no-ops because the work is
    guarded by ``model is None``.
    """
    global model, prompt_builder, snac_decoder, pipeline

    if model is not None:
        return

    print("Loading Maya1 model with vLLM...")
    model = Maya1Model(
        model_path="maya-research/maya1",
        dtype="bfloat16",
        max_model_len=8192,
        gpu_memory_utilization=0.85,
    )

    print("Initializing prompt builder...")
    prompt_builder = Maya1PromptBuilder(model.tokenizer, model)

    print("Loading SNAC decoder...")
    snac_decoder = SNACDecoder(
        device="cuda",
        enable_batching=False,
    )

    print("Initializing pipeline...")
    pipeline = Maya1Pipeline(model, prompt_builder, snac_decoder)

    print("Models loaded successfully!")


def preset_selected(preset_name):
    """Update description and text when preset is selected.

    Args:
        preset_name: Key into ``PRESET_CHARACTERS`` chosen in the dropdown.

    Returns:
        A ``(description, example_text)`` tuple for the preset, or a pair of
        empty strings when ``preset_name`` is not a known preset.
    """
    character = PRESET_CHARACTERS.get(preset_name)
    if character is None:
        return "", ""
    return character["description"], character["example_text"]


def _is_blank(value):
    """Return True when ``value`` is None, empty, or whitespace-only text."""
    return not value or not str(value).strip()


def _write_wav_file(audio_bytes):
    """Write raw audio bytes into a temporary ``.wav`` file and return its path.

    The file is created with ``delete=False`` so it still exists when Gradio
    reads it after the handler returns.
    """
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        with wave.open(tmp, 'wb') as wav_file:
            wav_file.setnchannels(_WAV_CHANNELS)
            wav_file.setsampwidth(_WAV_SAMPLE_WIDTH_BYTES)
            wav_file.setframerate(AUDIO_SAMPLE_RATE)
            wav_file.writeframes(audio_bytes)
        return tmp.name


@spaces.GPU
def generate_speech(preset_name, description, text, temperature, max_tokens):
    """Generate emotional speech from description and text using vLLM.

    Args:
        preset_name: Currently selected preset; used only as a fallback source
            for the description when the description box is blank.
        description: Voice description text.
        text: Text to speak.
        temperature: Sampling temperature forwarded to the pipeline.
        max_tokens: Token budget forwarded to the pipeline.

    Returns:
        ``(wav_path, status_message)`` on success, or ``(None, error_message)``
        when validation or generation fails. Exceptions are caught and
        reported through the status message rather than raised.
    """
    try:
        # Load models if not already loaded
        # NOTE(review): loading and generation each run under their own
        # asyncio.run() event loop - confirm the engine tolerates that.
        asyncio.run(load_models())

        # Respect a description the user typed or edited. The dropdown always
        # holds a preset, so overriding unconditionally would make custom
        # voices impossible; the preset is only a fallback for a blank box.
        if _is_blank(description) and preset_name in PRESET_CHARACTERS:
            description = PRESET_CHARACTERS[preset_name]["description"]

        # Validate inputs
        if _is_blank(description) or _is_blank(text):
            return None, "Error: Please provide both description and text!"

        print(f"Generating with temperature={temperature}, max_tokens={max_tokens}...")

        # Generate audio using vLLM pipeline
        audio_bytes = asyncio.run(
            pipeline.generate_speech(
                description=description,
                text=text,
                temperature=temperature,
                top_p=0.9,
                max_tokens=max_tokens,
                repetition_penalty=1.1,
                seed=None,
            )
        )

        if audio_bytes is None or len(audio_bytes) == 0:
            return None, "Error: Audio generation failed. Try different text or increase max_tokens."

        # gr.Audio outputs accept a file path (not an in-memory BytesIO), so
        # the WAV is materialised on disk and its path is returned.
        wav_path = _write_wav_file(audio_bytes)

        # Calculate duration: sample count divided by the sample rate
        duration = len(audio_bytes) // _WAV_SAMPLE_WIDTH_BYTES / AUDIO_SAMPLE_RATE
        status_msg = f"Generated {duration:.2f}s of emotional speech!"

        return wav_path, status_msg

    except Exception as e:
        # Top-level UI boundary: surface the failure in the status box.
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return None, error_msg


# Create Gradio interface
with gr.Blocks(title="Maya1 - Open Source Emotional TTS", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # Maya1 - Open Source Emotional Text-to-Speech

    **The best open source voice AI model with emotions!**

    Generate realistic and expressive speech with natural language voice design.
    Choose a preset character or create your own custom voice.

    [Model](https://huggingface.co/maya-research/maya1) | [GitHub](https://github.com/MayaResearch/maya1-fastapi)
    """)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Character Selection")

            preset_dropdown = gr.Dropdown(
                choices=list(PRESET_CHARACTERS),
                label="Preset Characters",
                value=DEFAULT_PRESET,
                info=f"Quick pick from {len(PRESET_CHARACTERS)} preset characters",
            )

            gr.Markdown("### Voice Design")

            description_input = gr.Textbox(
                label="Voice Description",
                placeholder="E.g., Male voice in their 30s with american accent. Normal pitch, warm timbre...",
                lines=3,
                value=PRESET_CHARACTERS[DEFAULT_PRESET]["description"],
            )

            # NOTE(review): the tag names in this placeholder (and in the
            # "Supported Emotions" list below) appear to have been stripped
            # from the text - restore them from the model documentation.
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter text with tags like , , ...",
                lines=4,
                value=PRESET_CHARACTERS[DEFAULT_PRESET]["example_text"],
            )

            with gr.Accordion("Advanced Settings", open=False):
                temperature_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.4,
                    step=0.1,
                    label="Temperature",
                    info="Lower = more stable, Higher = more creative",
                )

                max_tokens_slider = gr.Slider(
                    minimum=100,
                    maximum=2048,
                    value=500,
                    step=50,
                    label="Max Tokens",
                    info="More tokens = longer audio",
                )

            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")

        with gr.Column(scale=1):
            gr.Markdown("### Generated Audio")

            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
            )

            status_output = gr.Textbox(
                label="Status",
                lines=3,
                interactive=False,
            )

            gr.Markdown("""
            ### Supported Emotions

            `` `` `` `` `` `` `` `` `` `` `` `` `` `` `` `` `` `` `` ``

            ### Tips
            - Use emotion tags naturally in your text
            - Longer text needs more max_tokens
            - Lower temperature for consistent results
            - Presets are great starting points!
            """)

    # Event handlers
    preset_dropdown.change(
        fn=preset_selected,
        inputs=[preset_dropdown],
        outputs=[description_input, text_input],
    )

    generate_btn.click(
        fn=generate_speech,
        inputs=[preset_dropdown, description_input, text_input, temperature_slider, max_tokens_slider],
        outputs=[audio_output, status_output],
    )


if __name__ == "__main__":
    demo.launch()