#!/usr/bin/env python3
"""
Gradio application for real-time audio translation.
Simplified version compatible with Hugging Face Spaces (Zero GPU).
"""
import os
import tempfile
import time
import gradio as gr
import numpy as np
import torch
import librosa
import soundfile as sf
from typing import Optional, Tuple
# Import spaces for Zero GPU
try:
    import spaces
    SPACES_ZERO_GPU = True
    print("✅ Spaces Zero GPU available")
except ImportError:
    SPACES_ZERO_GPU = False
    print("⚠️ Spaces Zero GPU not available (local mode)")
    # Create a no-op stand-in decorator for local mode
    class spaces:
        @staticmethod
        def GPU(func):
            return func
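# Note: the real `spaces` package also allows a parameterized form such as
# `@spaces.GPU(duration=120)`. The no-op stand-in above only covers the bare
# `@spaces.GPU` usage in this file; a sketch of a parameter-aware fallback
# (an assumption, not part of the original code) would be:
#
#     class spaces:
#         @staticmethod
#         def GPU(func=None, **kwargs):
#             if func is None:          # called as @spaces.GPU(...)
#                 return lambda f: f
#             return func               # called as @spaces.GPU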
# Environment detection
IS_HUGGINGFACE_SPACES = os.getenv("SPACE_ID") is not None
IS_LOCAL = not IS_HUGGINGFACE_SPACES
# Environment-specific configuration
if IS_HUGGINGFACE_SPACES:
    print("🌐 Detected environment: Hugging Face Spaces")
else:
    print("🏠 Detected environment: Local")
# Import for Whisper STT
from transformers import pipeline
# Simplified engines, inlined below
class WhisperSTTEngine:
    """STT engine based on Whisper (simple and effective)."""
    def __init__(self, device="cpu"):
        self.device = device
        self.model = None
        print("🎙️ Initializing the Whisper STT engine...")
    def load_model(self):
        """Loads the Whisper model."""
        if self.model is not None:
            return True
        try:
            print("📥 Loading the Whisper model...")
            # Use a lightweight Whisper checkpoint with FR/EN support
            self.model = pipeline(
                "automatic-speech-recognition",
                model="openai/whisper-small",  # "small" for better multilingual support
                device=0 if self.device == "cuda" else -1
            )
            print("✅ Whisper model loaded successfully!")
            return True
        except Exception as e:
            print(f"❌ Error loading Whisper: {e}")
            return False
    def transcribe(self, audio_path: str, language: str = "fr") -> str:
        """Transcribes an audio file in the specified language."""
        if self.model is None:
            if not self.load_model():
                return ""
        try:
            print(f"🎙️ Transcribing: {audio_path} (language: {language})")
            # Load the audio with librosa (avoids the need for ffmpeg)
            audio_array, sample_rate = librosa.load(audio_path, sr=16000)  # Whisper expects 16 kHz
            # Limit to 30 seconds maximum (3000 mel features ≈ 30 s)
            max_duration = 30  # seconds
            max_samples = max_duration * sample_rate
            if len(audio_array) > max_samples:
                audio_array = audio_array[:max_samples]
                print(f"✂️ Audio truncated to {max_duration} seconds")
            # Map the language code (fr/en) to the full language name Whisper expects
            lang_map = {
                "fr": "french",
                "en": "english"
            }
            whisper_lang = lang_map.get(language, "french")
            # Pass the audio array directly to Whisper with the language forced
            result = self.model(
                audio_array,
                generate_kwargs={
                    "language": whisper_lang,
                    "task": "transcribe"  # Transcription (not translation)
                }
            )
            transcription = result["text"].strip()
            print(f"✅ Transcription ({language}): {transcription}")
            return transcription
        except Exception as e:
            print(f"❌ Transcription error: {e}")
            import traceback
            traceback.print_exc()
            return ""
class SimpleTTSEngine:
    """TTS engine based on gTTS (simple and reliable)."""
    def __init__(self):
        print("🔊 gTTS TTS engine initialized")
    def generate_speech(self, text: str, voice: Optional[str] = None) -> Optional[str]:
        """Generates speech with gTTS; returns the path to an MP3 file, or None on failure."""
        if not text.strip():
            return None
        try:
            from gtts import gTTS
            # Pick the language from the requested voice
            if voice and "fr" in voice.lower():
                lang = "fr"
            elif voice and "en" in voice.lower():
                lang = "en"
            else:
                # Fall back to guessing the language from the text (accented characters -> French)
                if any(char in text for char in "àâäéèêëïîôöùûüÿç"):
                    lang = "fr"
                else:
                    lang = "en"
            print(f"🎤 Generating audio with gTTS (language: {lang})")
            # Create a temporary file
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
            temp_path = temp_file.name
            temp_file.close()
            # Generate the audio with gTTS
            tts = gTTS(text=text, lang=lang, slow=False)
            tts.save(temp_path)
            print(f"✅ Audio generated: {temp_path}")
            return temp_path
        except Exception as e:
            print(f"❌ TTS error: {e}")
            import traceback
            traceback.print_exc()
            return None
    def get_voice_for_language(self, language: str) -> str:
        """Returns the appropriate gTTS language code."""
        voices = {
            "fr": "fr",  # French
            "en": "en"   # English
        }
        return voices.get(language, "fr")
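# Note: gTTS synthesizes speech by calling Google's online TTS endpoint, so it needs
# outbound network access and will fail offline. Hypothetical usage:
#
#     tts = SimpleTTSEngine()
#     mp3_path = tts.generate_speech("Bonjour tout le monde", voice="fr")  # temp .mp3 path or None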
class SimpleTranslationEngine:
    """Translation engine based on Helsinki-NLP models."""
    def __init__(self, device="cpu"):
        self.device = device
        self.models_ready = False
        self.translator_fr_en = None
        self.translator_en_fr = None
        print("🌍 Translation engine initialized")
    def load_models(self):
        """Loads the translation models."""
        if self.models_ready:
            return True
        try:
            print("📥 Loading the translation models...")
            # FR -> EN model
            self.translator_fr_en = pipeline(
                "translation",
                model="Helsinki-NLP/opus-mt-fr-en",
                device=0 if self.device == "cuda" else -1
            )
            # EN -> FR model
            self.translator_en_fr = pipeline(
                "translation",
                model="Helsinki-NLP/opus-mt-en-fr",
                device=0 if self.device == "cuda" else -1
            )
            self.models_ready = True
            print("✅ Translation models loaded successfully!")
            return True
        except Exception as e:
            print(f"❌ Error loading translation models: {e}")
            return False
    def translate(self, text: str, source_lang: str = "fr", target_lang: str = "en") -> str:
        """Translates the text with Helsinki-NLP."""
        if not text.strip():
            return ""
        try:
            # Load the models if needed
            if not self.load_models():
                return f"[Error: models not loaded] {text}"
            # Pick the translator for the requested direction
            if source_lang == "fr" and target_lang == "en":
                translator = self.translator_fr_en
            elif source_lang == "en" and target_lang == "fr":
                translator = self.translator_en_fr
            else:
                return f"[Direction {source_lang}->{target_lang} not supported] {text}"
            # Translation
            result = translator(text)
            translation = result[0]["translation_text"].strip()
            # Keep only the first line of the output
            if '\n' in translation:
                translation = translation.split('\n')[0]
            # Strip common translation prefixes
            prefixes_to_remove = [
                "Translation:", "Traduction:", "English:", "Français:",
                "French:", "Anglais:", "Here's the translation:",
                "Voici la traduction:", "The translation is:",
                "La traduction est:"
            ]
            for prefix in prefixes_to_remove:
                if translation.lower().startswith(prefix.lower()):
                    translation = translation[len(prefix):].strip()
                    break
            # Strip surrounding quotes
            if translation.startswith('"') and translation.endswith('"'):
                translation = translation[1:-1]
            elif translation.startswith("'") and translation.endswith("'"):
                translation = translation[1:-1]
            return translation if translation else text
        except Exception as e:
            print(f"❌ Translation error: {e}")
            import traceback
            traceback.print_exc()
            return f"[Error: {str(e)[:50]}]"
class AudioTranslationApp:
    """Gradio application for audio translation."""
    def __init__(self):
        """Initializes the application."""
        # Device configuration depending on the environment
        if IS_HUGGINGFACE_SPACES and SPACES_ZERO_GPU:
            # Use the GPU via Hugging Face Spaces Zero GPU
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            print("🚀 Zero GPU enabled - GPU available on demand")
        elif IS_HUGGINGFACE_SPACES:
            self.device = "cpu"  # Force CPU on Hugging Face Spaces without Zero GPU
        else:
            # Locally, use the GPU if available
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Simplified engines
        self.stt_engine = WhisperSTTEngine(device=self.device)
        self.tts_engine = SimpleTTSEngine()
        self.translation_engine = SimpleTranslationEngine(device=self.device)
        # Application state
        self.is_processing = False
        self.current_audio_path = None
        print("🚀 Audio translation application initialized")
        print(f"🔧 Device in use: {self.device}")
        if self.device == "cuda":
            print(f"🚀 GPU detected: {torch.cuda.get_device_name(0)}")
            print(f"💾 GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    def load_stt_model(self):
        """Loads the STT model if it is not already loaded."""
        # This previously loaded a kyutai/moshi checkpoint directly; the app now uses the
        # Whisper-based engine, so delegate to it instead.
        try:
            print("📥 Loading the STT model...")
            if self.stt_engine.load_model():
                print("✅ STT model loaded successfully!")
                return True
            return False
        except Exception as e:
            print(f"❌ Error loading STT model: {e}")
            return False
    def process_audio(self, audio_path: str, source_lang: str = "fr", target_lang: str = "en") -> Tuple[str, str, Optional[str]]:
        """
        Processes an audio file: transcription -> translation -> speech synthesis.
        """
        if self.is_processing:
            return "⚠️ Processing already in progress...", "", None
        self.is_processing = True
        try:
            print(f"🎵 Processing audio: {audio_path}")
            # 1. Transcription with Whisper (using the source language)
            print(f"🎙️ Transcribing (language: {source_lang})...")
            transcription = self.stt_engine.transcribe(audio_path, language=source_lang)
            if not transcription.strip():
                return "⚠️ No transcription detected", "", None
            print(f"📝 Transcribed text: {transcription}")
            # 2. Translation
            print("🌍 Translating...")
            translation = self.translation_engine.translate(transcription, source_lang, target_lang)
            if not translation.strip():
                return transcription, "⚠️ Translation error", None
            print(f"🌍 Translated text: {translation}")
            # 3. Speech synthesis (TTS)
            print("🔊 Generating audio...")
            output_audio_path = self._generate_speech(translation, target_lang)
            if output_audio_path:
                print("✅ Processing finished!")
            # Always return a 3-tuple, even if TTS failed (output_audio_path is then None)
            return transcription, translation, output_audio_path
        except Exception as e:
            print(f"❌ Processing error: {e}")
            import traceback
            traceback.print_exc()
            return f"❌ Error: {str(e)}", "", None
        finally:
            self.is_processing = False
    def _generate_speech(self, text: str, target_lang: str) -> Optional[str]:
        """Generates the speech audio for the translated text."""
        try:
            # Pick the voice for the target language
            voice = self.tts_engine.get_voice_for_language(target_lang)
            print(f"🔊 Generating TTS: '{text}' -> {target_lang} (voice: {voice})")
            # Generate the audio
            output_path = self.tts_engine.generate_speech(text, voice)
            return output_path
        except Exception as e:
            print(f"❌ TTS error: {e}")
            import traceback
            traceback.print_exc()
            return None
# Global application instance
app = AudioTranslationApp()
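# Hypothetical direct usage, bypassing the Gradio UI (assumes sample.wav exists locally):
#
#     transcription, translation, audio_path = app.process_audio("sample.wav", "fr", "en")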
@spaces.GPU
def process_audio_file(audio, source_lang, target_lang):
"""
Fonction Gradio pour traiter un fichier audio.
Utilise Zero GPU de Hugging Face Spaces.
Args:
audio: Tuple (sample_rate, audio_data) de Gradio
source_lang: Langue source
target_lang: Langue cible
Returns:
Tuple (transcription, traduction, audio_généré)
"""
if audio is None:
return "⚠️ Aucun audio fourni", "", None
try:
# Extraire les données audio
sample_rate, audio_data = audio
# Sauvegarder temporairement
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
temp_path = temp_file.name
sf.write(temp_path, audio_data, sample_rate)
# Traiter l'audio
transcription, translation, output_audio = app.process_audio(
temp_path, source_lang, target_lang
)
# Nettoyer le fichier temporaire
try:
os.unlink(temp_path)
except:
pass
return transcription, translation, output_audio
except Exception as e:
return f"❌ Erreur: {str(e)}", "", None
# Gradio interface
def create_interface():
    """Builds the Gradio interface."""
with gr.Blocks(
title="🎙️ Traduction Speech to Speech",
theme=gr.themes.Soft(),
css="""
.gradio-container {
max-width: 1000px !important;
margin: 0 auto;
}
.gradio-container > div {
display: flex !important;
flex-direction: column !important;
align-items: center !important;
}
.main-header {
text-align: center;
margin-bottom: 2rem;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 2rem;
border-radius: 15px;
box-shadow: 0 10px 30px rgba(0,0,0,0.1);
}
.main-header h1 {
margin: 0;
font-size: 2.5rem;
font-weight: 700;
}
.main-header p {
margin: 0.5rem 0 0 0;
font-size: 1.1rem;
opacity: 0.9;
}
.status-box {
background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
border: 2px solid #dee2e6;
border-radius: 12px;
padding: 1.5rem;
margin: 1rem 0;
box-shadow: 0 4px 15px rgba(0,0,0,0.05);
}
.status-box h4 {
margin: 0 0 0.5rem 0;
color: #495057;
font-weight: 600;
}
.audio-input {
border: 2px dashed #6c757d;
border-radius: 12px;
padding: 1rem;
background: #f8f9fa;
}
.process-button {
background: linear-gradient(135deg, #28a745 0%, #20c997 100%) !important;
border: none !important;
border-radius: 12px !important;
padding: 1rem 2rem !important;
font-size: 1.2rem !important;
font-weight: 600 !important;
box-shadow: 0 6px 20px rgba(40, 167, 69, 0.3) !important;
transition: all 0.3s ease !important;
}
.process-button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 25px rgba(40, 167, 69, 0.4) !important;
}
.result-textbox {
background: linear-gradient(135deg, #ffffff 0%, #f8f9fa 100%) !important;
border: 3px solid #e9ecef !important;
border-radius: 15px !important;
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif !important;
font-size: 1.1rem !important;
line-height: 1.8 !important;
padding: 1.5rem !important;
box-shadow: 0 8px 25px rgba(0,0,0,0.1) !important;
transition: all 0.3s ease !important;
}
.result-textbox:hover {
transform: translateY(-2px) !important;
box-shadow: 0 12px 35px rgba(0,0,0,0.15) !important;
}
.result-audio {
border: 3px solid #e9ecef !important;
border-radius: 15px !important;
background: linear-gradient(135deg, #ffffff 0%, #f8f9fa 100%) !important;
padding: 1.5rem !important;
box-shadow: 0 8px 25px rgba(0,0,0,0.1) !important;
transition: all 0.3s ease !important;
}
.result-audio:hover {
transform: translateY(-2px) !important;
box-shadow: 0 12px 35px rgba(0,0,0,0.15) !important;
}
.gr-group {
background: white;
border: 1px solid #e9ecef;
border-radius: 15px;
padding: 1.5rem;
margin: 1rem auto;
box-shadow: 0 4px 15px rgba(0,0,0,0.05);
width: 100%;
max-width: 800px;
text-align: center;
}
.gr-tabs {
border-radius: 15px;
overflow: hidden;
box-shadow: 0 8px 25px rgba(0,0,0,0.1);
background: white;
}
.gr-tab {
background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
border: 2px solid #dee2e6;
border-radius: 12px;
margin: 0.5rem;
padding: 1rem 1.5rem;
font-weight: 600;
font-size: 1.1rem;
transition: all 0.3s ease;
box-shadow: 0 4px 15px rgba(0,0,0,0.05);
}
.gr-tab:hover {
transform: translateY(-2px);
box-shadow: 0 8px 25px rgba(0,0,0,0.15);
}
.gr-tab.selected {
background: linear-gradient(135deg, #007bff 0%, #0056b3 100%);
color: white;
border-color: #007bff;
box-shadow: 0 8px 25px rgba(0,123,255,0.3);
}
.gr-tabs .gr-tab:first-child {
background: linear-gradient(135deg, #28a745 0%, #20c997 100%);
color: white;
font-weight: 700;
border-color: #28a745;
box-shadow: 0 8px 25px rgba(40,167,69,0.3);
}
.gr-row {
justify-content: center !important;
}
.gr-column {
display: flex !important;
flex-direction: column !important;
align-items: center !important;
}
.gr-button {
margin: 0 auto !important;
}
.gr-dropdown {
margin: 0 auto !important;
}
.gr-audio {
margin: 0 auto !important;
}
.gr-textbox {
margin: 0 auto !important;
}
.centered-layout {
display: flex !important;
flex-direction: column !important;
align-items: center !important;
width: 100% !important;
}
.results-section {
background: linear-gradient(135deg, #ffffff 0%, #f8f9fa 100%) !important;
border: 3px solid #e9ecef !important;
border-radius: 20px !important;
padding: 2rem !important;
box-shadow: 0 15px 40px rgba(0,0,0,0.1) !important;
margin: 2rem auto !important;
max-width: 900px !important;
}
.results-section h3 {
color: #495057 !important;
font-weight: 700 !important;
font-size: 1.5rem !important;
margin-bottom: 1.5rem !important;
text-align: center !important;
}
"""
) as interface:
        # Header
gr.HTML("""
<div class="main-header">
<h1>🎙️ Speech-to-Speech Translator</h1>
<p>Record audio, get transcription, translation and generated audio</p>
</div>
""")
        # Improved, centered vertical layout
with gr.Column(elem_classes=["centered-layout"]):
# Section 1: Audio Recording
with gr.Group():
gr.Markdown("### 🎵 Audio Recording")
gr.Markdown("⚠️ **Maximum 30 seconds** - Longer audio will be automatically truncated")
audio_input = gr.Audio(
label="Record your audio (max 30 seconds)",
type="numpy",
format="wav",
elem_classes=["audio-input"]
)
# Section 2: Language Configuration
with gr.Group():
gr.Markdown("### ⚙️ Language Configuration")
with gr.Row():
with gr.Column(scale=1):
source_lang = gr.Dropdown(
choices=["fr", "en"],
value="fr",
label="Source language",
info="Language of the recorded audio"
)
with gr.Column(scale=1):
target_lang = gr.Dropdown(
choices=["en", "fr"],
value="en",
label="Target language",
info="Translation language"
)
# Section 3: Process Button
with gr.Group():
process_btn = gr.Button(
"🚀 Process Audio",
variant="primary",
size="lg",
elem_classes=["process-button"]
)
# Section 4: Status
status = gr.HTML("""
<div class="status-box">
<h4>📊 Status</h4>
<p>✅ Ready to process audio</p>
</div>
""")
# Section 5: Results
with gr.Group(elem_classes=["results-section"]):
gr.Markdown("### 📝 Results")
with gr.Tabs():
with gr.Tab("🔊 Generated Audio"):
audio_output = gr.Audio(
label="Translation audio",
type="filepath",
elem_classes=["result-audio"]
)
with gr.Tab("🎙️ Transcription"):
transcription_output = gr.Textbox(
label="Transcribed text",
lines=4,
interactive=False,
elem_classes=["result-textbox"]
)
with gr.Tab("🌍 Translation"):
translation_output = gr.Textbox(
label="Translated text",
lines=4,
interactive=False,
elem_classes=["result-textbox"]
)
# Events
def update_status(message, color="green"):
return gr.HTML(f"""
<div class="status-box">
<h4>📊 Status</h4>
<p style="color: {color};">{message}</p>
</div>
""")
def process_audio_wrapper(audio, source_lang, target_lang):
if audio is None:
return (
"⚠️ No audio provided",
"",
None,
update_status("⚠️ No audio provided", "orange")
)
# Process audio
transcription, translation, output_audio = process_audio_file(
audio, source_lang, target_lang
)
# Update status based on result
if transcription and not transcription.startswith("❌"):
final_status = update_status("✅ Processing completed successfully!", "green")
else:
final_status = update_status("❌ Error during processing", "red")
return transcription, translation, output_audio, final_status
# Connect events
process_btn.click(
fn=process_audio_wrapper,
inputs=[audio_input, source_lang, target_lang],
outputs=[transcription_output, translation_output, audio_output, status],
show_progress=True
)
return interface
# Main entry point
if __name__ == "__main__":
# Create interface
demo = create_interface()
# Launch configuration based on environment
if IS_HUGGINGFACE_SPACES:
# Configuration for Hugging Face Spaces
print("🌐 Hugging Face Spaces configuration")
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=False,
show_error=True,
quiet=False,
debug=False
)
else:
# Configuration for local
import socket
def find_free_port(start_port=7860, max_port=7870):
"""Find a free port."""
for port in range(start_port, max_port):
try:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('127.0.0.1', port))
return port
except OSError:
continue
return start_port # Fallback
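        # Note: there is a small window between closing this probe socket and Gradio
        # binding the chosen port, so the port could in principle be taken in between;
        # for a local development launcher this is generally acceptable.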
free_port = find_free_port()
print(f"🌐 Local server started on http://127.0.0.1:{free_port}")
demo.launch(
server_name="127.0.0.1", # Localhost for local
server_port=free_port,
share=False,
show_error=True,
quiet=False,
inbrowser=True # Automatically open browser
)