import gradio as gr from gradio_client import Client import os import csv import numpy as np import scipy.io.wavfile as wavfile import tempfile client = Client(os.environ['src']) css = """ .gradio-container input::placeholder, .gradio-container textarea::placeholder { color: #333333 !important; } code { background-color: #ffde9f; padding: 2px 4px; border-radius: 3px; } #settings-accordion summary { justify-content: center; } .examples-holder > .label { color: #b45309 !important; font-weight: 600; } """ def load_examples(csv_path): examples = [] if not os.path.exists(csv_path): print(f"Warning: Examples file not found at {csv_path}") return examples try: with open(csv_path, 'r', encoding='utf-8') as f: reader = csv.reader(f, delimiter='|') for row in reader: if len(row) >= 2: text = row[0].strip() audio_path = row[1].strip() # Handle temperature (third column) temperature = 0.7 # Default temperature if len(row) >= 3: try: temp_str = row[2].strip() if temp_str and temp_str.lower() != 'none': temperature = float(temp_str) # Clamp temperature to valid range temperature = max(0.0, min(1.3, temperature)) except (ValueError, TypeError): print(f"Warning: Invalid temperature value '{row[2]}', using default 0.7") temperature = 0.7 # Handle chained longform (fourth column) use_chained = False # Default to False if len(row) >= 4: chained_str = row[3].strip().lower() if chained_str in ['true', '1', 'yes', 'on']: use_chained = True elif chained_str in ['false', '0', 'no', 'off', 'none', '']: use_chained = False else: print(f"Warning: Invalid chained longform value '{row[3]}', using default False") use_chained = False if audio_path.lower() == "none": audio_path = None elif audio_path and not os.path.isabs(audio_path): base_dir = os.path.dirname(csv_path) audio_path = os.path.join(base_dir, audio_path) if not os.path.exists(audio_path): print(f"Warning: Audio file not found: {audio_path}") audio_path = None examples.append([text, audio_path, temperature, use_chained]) except 
Exception as e: print(f"Error loading examples: {e}") return examples def run_generation_pipeline_client( raw_text, audio_prompt, num_candidates, cfg_scale, top_k, temperature, use_chained_longform, seed # Add seed parameter ): try: # Handle audio prompt - save to temporary file if provided audio_prompt_for_api = None if audio_prompt is not None: import tempfile import scipy.io.wavfile as wavfile sample_rate, audio_data = audio_prompt # Save audio to temporary file with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file: # Ensure audio_data is numpy array if isinstance(audio_data, list): audio_data = np.array(audio_data) # Convert to int16 for WAV file if audio_data.dtype == np.float32 or audio_data.dtype == np.float64: audio_data = (audio_data * 32767).astype(np.int16) # Write WAV file wavfile.write(tmp_file.name, sample_rate, audio_data) # Prepare for API - use the file path with proper metadata audio_prompt_for_api = {"path": tmp_file.name, "meta": {"_type": "gradio.FileData"}} # Call the backend API with file path instead of raw audio data result = client.predict( raw_text, audio_prompt_for_api, # Now sending file path with metadata num_candidates, cfg_scale, top_k, temperature, use_chained_longform, seed, # Add seed to API call api_name="/run_generation_pipeline" ) # Clean up temporary file if created if audio_prompt_for_api is not None: import os try: os.unlink(audio_prompt_for_api["path"]) except: pass # Handle the unpacked result if len(result) == 3: # Successful case sample_rate, audio_data, status_message = result if audio_data is not None: if isinstance(audio_data, list): audio_data = np.array(audio_data) return (sample_rate, audio_data), status_message else: return None, status_message elif len(result) == 2: # Failed case return result[0], result[1] # (None, status_message) else: return None, "Status: Unexpected response format from server" except Exception as e: return None, f"Status: Connection error: {str(e)}" # Client wrapper for 
duration-aware generation - FIXED for audio handling def run_duration_generation_pipeline_client( raw_text, audio_prompt, num_candidates, cfg_scale, top_k, temperature, use_chained_longform, add_steps, use_duration_aware, chars_per_second, seed # Add seed parameter ): try: # Handle audio prompt - save to temporary file if provided audio_prompt_for_api = None if audio_prompt is not None: import tempfile import scipy.io.wavfile as wavfile sample_rate, audio_data = audio_prompt # Save audio to temporary file with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file: # Ensure audio_data is numpy array if isinstance(audio_data, list): audio_data = np.array(audio_data) # Convert to int16 for WAV file if audio_data.dtype == np.float32 or audio_data.dtype == np.float64: audio_data = (audio_data * 32767).astype(np.int16) # Write WAV file wavfile.write(tmp_file.name, sample_rate, audio_data) # Prepare for API - use the file path with proper metadata audio_prompt_for_api = {"path": tmp_file.name, "meta": {"_type": "gradio.FileData"}} # Call the backend API with file path instead of raw audio data result = client.predict( raw_text, audio_prompt_for_api, # Now sending file path with metadata num_candidates, cfg_scale, top_k, temperature, use_chained_longform, add_steps, use_duration_aware, chars_per_second, seed, # Add seed to API call api_name="/run_duration_generation_pipeline" ) # Clean up temporary file if created if audio_prompt_for_api is not None: import os try: os.unlink(audio_prompt_for_api["path"]) except: pass # Handle the unpacked result if len(result) == 3: # Successful case sample_rate, audio_data, status_message = result if audio_data is not None: if isinstance(audio_data, list): audio_data = np.array(audio_data) return (sample_rate, audio_data), status_message else: return None, status_message elif len(result) == 2: # Failed case return result[0], result[1] # (None, status_message) else: return None, "Status: Unexpected response format from 
server" except Exception as e: return None, f"Status: Connection error: {str(e)}" # Load examples examples_csv_path = "./samples.csv" # Adjust path as needed for client side example_list = load_examples(examples_csv_path) # Create Gradio interface with gr.Blocks(theme="Respair/Shiki@9.1.0", css=css) as demo: gr.Markdown('
Takane is a frontier Japanese-only speech synthesis network that was trained on tens of thousands of hours of high-quality data to autoregressively generate highly compressed audio codes. This network is powered by Kanadec, the world's only 44.1 kHz, 25-frames-per-second speech tokenizer, which utilizes semantic and acoustic distillation to generate audio tokens as fast as possible.
There are two checkpoints in this demo; one of them utilizes a custom version of RoPE (rotary position embeddings) to manipulate duration, which is seldom seen in autoregressive settings. Please treat it as a proof of concept, as its outputs are not very reliable — it is included to show that the approach can work to some extent and can be expanded upon. Both checkpoints have been fine-tuned on a subset of the dataset with only speaker tags. This allows us to generate high-quality samples without relying on audio prompts or dealing with random speaker attributes, but at the cost of tanking the model's zero-shot faithfulness.
Takane also comes with an Anti-Hallucination Algorithm (AHA) that generates a few candidates in parallel and automatically returns the best one, at the cost of a small overhead. If you need the fastest possible response time, feel free to enable Turbo mode: it disables AHA and tweaks the parameters internally to produce samples in as little as 2–3 seconds.
There's no plan to open-source this model just yet.
If you're not using an audio prompt or a speaker tag — or even if you are, but you find the later sentences drifting too far from the first — you may want to enable Chained mode, which sequentially conditions each output on the previous one to ensure speaker consistency.
Takane - Advanced Japanese Text-to-Speech System