SeamlessOnDevice

Sleeping

Tonic committed on Nov 20, 2023

Commit

78e56be

1 Parent(s): 3cb13e6

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -36,7 +36,7 @@ def save_audio(audio_input, output_dir="saved_audio"):
 def speech_to_text(audio_data, tgt_lang):
     file_path = save_audio(audio_data)
     audio_input, _ = torchaudio.load(file_path)
-    s2t_model = torch.jit.load("unity_on_device_s2t.ptl")
     with torch.no_grad():
         text = s2t_model(audio_input, tgt_lang=languages[tgt_lang])
@@ -48,21 +48,15 @@ def speech_to_text(audio_data, tgt_lang):
 def speech_to_speech_translation(audio_data, tgt_lang):
     file_path = save_audio(audio_data)
     audio_input, _ = torchaudio.load(file_path)
-    s2st_model = torch.jit.load("unity_on_device_s2t.ptl")
     with torch.no_grad():
-        model_output = s2st_model(audio_input, tgt_lang=languages[tgt_lang])
-    # Print the model's output for debugging
-    print("Speech to Speech Translation Model Output:", model_output)
-    # Check the structure of model_output and unpack accordingly
-    if len(model_output) == 3:
-        text, units, waveform = model_output
-    elif len(model_output) == 2:
-        text, waveform = model_output
-        units = None  # or some default value
-    else:
-        raise ValueError("Unexpected model output format")
     output_file = "/tmp/result.wav"
     torchaudio.save(output_file, waveform.unsqueeze(0), sample_rate=16000)

 def speech_to_text(audio_data, tgt_lang):
     file_path = save_audio(audio_data)
     audio_input, _ = torchaudio.load(file_path)
+    s2t_model = torch.jit.load("unity_on_device.ptl")
     with torch.no_grad():
         text = s2t_model(audio_input, tgt_lang=languages[tgt_lang])
 def speech_to_speech_translation(audio_data, tgt_lang):
     file_path = save_audio(audio_data)
     audio_input, _ = torchaudio.load(file_path)
+    s2st_model = torch.jit.load("unity_on_device.ptl")
     with torch.no_grad():
+        text, units, waveform = s2st_model(audio_input, tgt_lang=languages[tgt_lang])
+    # Print the model's output for debugging (optional)
+    print("Translated Text:", text)
+    print("Units:", units)
+    print("Waveform Shape:", waveform.shape)
     output_file = "/tmp/result.wav"
     torchaudio.save(output_file, waveform.unsqueeze(0), sample_rate=16000)