Spaces:
Runtime error
Runtime error
update app
Browse files
app.py
CHANGED
|
@@ -20,8 +20,6 @@ OWSM v3 has 889M parameters and is trained on 180k hours of paired speech data.
|
|
| 20 |
|
| 21 |
For more details, please check out our [paper](https://arxiv.org/abs/2309.13876) (Peng et al., ASRU 2023).
|
| 22 |
|
| 23 |
-
We also have a [Colab demo](https://colab.research.google.com/drive/1zKI3ZY_OtZd6YmVeED6Cxy1QwT1mqv9O?usp=sharing) where you can use a free GPU.
|
| 24 |
-
|
| 25 |
```
|
| 26 |
@article{peng2023owsm,
|
| 27 |
title={Reproducing Whisper-Style Training Using an Open-Source Toolkit and Publicly Available Data},
|
|
@@ -31,6 +29,9 @@ We also have a [Colab demo](https://colab.research.google.com/drive/1zKI3ZY_OtZd
|
|
| 31 |
}
|
| 32 |
```
|
| 33 |
|
|
|
|
|
|
|
|
|
|
| 34 |
Disclaimer: OWSM has not been thoroughly evaluated in all tasks. Due to limited training data, it may not perform well for certain language directions.
|
| 35 |
'''
|
| 36 |
|
|
@@ -114,7 +115,7 @@ def predict(audio_path, src_lang: str, task: str, beam_size, long_form: bool, te
|
|
| 114 |
# ASR or ST
|
| 115 |
if long_form: # speech will be padded in decode_long()
|
| 116 |
try:
|
| 117 |
-
speech2text.maxlenratio =
|
| 118 |
utts = speech2text.decode_long(
|
| 119 |
speech,
|
| 120 |
segment_sec=_dur,
|
|
@@ -124,7 +125,7 @@ def predict(audio_path, src_lang: str, task: str, beam_size, long_form: bool, te
|
|
| 124 |
start_time="<0.00>",
|
| 125 |
end_time_threshold="<29.50>",
|
| 126 |
)
|
| 127 |
-
|
| 128 |
text = []
|
| 129 |
for t1, t2, res in utts:
|
| 130 |
text.append(f"[{format_timestamp(seconds=t1)} --> {format_timestamp(seconds=t2)}] {res}")
|
|
@@ -132,9 +133,9 @@ def predict(audio_path, src_lang: str, task: str, beam_size, long_form: bool, te
|
|
| 132 |
|
| 133 |
return code2lang[lang_code], text
|
| 134 |
except:
|
| 135 |
-
print("An exception occurred in long-form decoding.
|
| 136 |
|
| 137 |
-
speech2text.maxlenratio = -min(
|
| 138 |
speech = librosa.util.fix_length(speech, size=(_sr * _dur))
|
| 139 |
text = speech2text(speech, text_prev)[0][3]
|
| 140 |
|
|
@@ -144,11 +145,11 @@ def predict(audio_path, src_lang: str, task: str, beam_size, long_form: bool, te
|
|
| 144 |
demo = gr.Interface(
|
| 145 |
predict,
|
| 146 |
inputs=[
|
| 147 |
-
gr.Audio(type="filepath", label="Speech
|
| 148 |
gr.Dropdown(choices=list(lang2code), value="English", label="Language", info="Language of input speech. Select 'Unknown' (1st option) to detect it automatically."),
|
| 149 |
gr.Dropdown(choices=list(task2code), value="Automatic Speech Recognition", label="Task", info="Task to perform on input speech."),
|
| 150 |
gr.Slider(minimum=1, maximum=5, step=1, value=5, label="Beam Size", info="Beam size used in beam search."),
|
| 151 |
-
gr.Checkbox(label="Long Form (Experimental)", info="
|
| 152 |
gr.Text(label="Text Prompt (Optional)", info="Generation will be conditioned on this prompt if provided"),
|
| 153 |
],
|
| 154 |
outputs=[
|
|
|
|
| 20 |
|
| 21 |
For more details, please check out our [paper](https://arxiv.org/abs/2309.13876) (Peng et al., ASRU 2023).
|
| 22 |
|
|
|
|
|
|
|
| 23 |
```
|
| 24 |
@article{peng2023owsm,
|
| 25 |
title={Reproducing Whisper-Style Training Using an Open-Source Toolkit and Publicly Available Data},
|
|
|
|
| 29 |
}
|
| 30 |
```
|
| 31 |
|
| 32 |
+
As a demo, the input speech should not exceed 2 minutes. We also limit the maximum number of tokens to be generated.
|
| 33 |
+
Please try our [Colab demo](https://colab.research.google.com/drive/1zKI3ZY_OtZd6YmVeED6Cxy1QwT1mqv9O?usp=sharing) if you want to explore more features.
|
| 34 |
+
|
| 35 |
Disclaimer: OWSM has not been thoroughly evaluated in all tasks. Due to limited training data, it may not perform well for certain language directions.
|
| 36 |
'''
|
| 37 |
|
|
|
|
| 115 |
# ASR or ST
|
| 116 |
if long_form: # speech will be padded in decode_long()
|
| 117 |
try:
|
| 118 |
+
speech2text.maxlenratio = -300
|
| 119 |
utts = speech2text.decode_long(
|
| 120 |
speech,
|
| 121 |
segment_sec=_dur,
|
|
|
|
| 125 |
start_time="<0.00>",
|
| 126 |
end_time_threshold="<29.50>",
|
| 127 |
)
|
| 128 |
+
|
| 129 |
text = []
|
| 130 |
for t1, t2, res in utts:
|
| 131 |
text.append(f"[{format_timestamp(seconds=t1)} --> {format_timestamp(seconds=t2)}] {res}")
|
|
|
|
| 133 |
|
| 134 |
return code2lang[lang_code], text
|
| 135 |
except:
|
| 136 |
+
print("An exception occurred in long-form decoding. Fall back to standard decoding (only first 30s)")
|
| 137 |
|
| 138 |
+
speech2text.maxlenratio = -min(300, int((len(speech) / rate) * 10)) # assuming 10 tokens per second
|
| 139 |
speech = librosa.util.fix_length(speech, size=(_sr * _dur))
|
| 140 |
text = speech2text(speech, text_prev)[0][3]
|
| 141 |
|
|
|
|
| 145 |
demo = gr.Interface(
|
| 146 |
predict,
|
| 147 |
inputs=[
|
| 148 |
+
gr.Audio(type="filepath", label="Input Speech (<120s)", max_length=120, sources=["microphone", "upload"], show_download_button=True, show_share_button=True,),
|
| 149 |
gr.Dropdown(choices=list(lang2code), value="English", label="Language", info="Language of input speech. Select 'Unknown' (1st option) to detect it automatically."),
|
| 150 |
gr.Dropdown(choices=list(task2code), value="Automatic Speech Recognition", label="Task", info="Task to perform on input speech."),
|
| 151 |
gr.Slider(minimum=1, maximum=5, step=1, value=5, label="Beam Size", info="Beam size used in beam search."),
|
| 152 |
+
gr.Checkbox(label="Long Form (Experimental)", info="Perform long-form decoding for audios that are longer than 30s. If an exception happens, it will fall back to standard decoding on the initial 30s."),
|
| 153 |
gr.Text(label="Text Prompt (Optional)", info="Generation will be conditioned on this prompt if provided"),
|
| 154 |
],
|
| 155 |
outputs=[
|