Archime committed on
Commit
8417fa3
·
1 Parent(s): 7b84154

add task_fake

Browse files
Files changed (5) hide show
  1. app.py +146 -175
  2. app/canary_speech_engine.py +2 -0
  3. app/ui_utils.py +4 -4
  4. app/utils.py +218 -1
  5. assets/custom_style.css +51 -0
app.py CHANGED
@@ -14,10 +14,12 @@ from gradio.utils import get_space
14
 
15
  from app.utils import (
16
  raise_function,
 
17
  generate_coturn_config,
18
  read_and_stream_audio,
19
  stop_streaming,
20
- # task
 
21
  )
22
  from app.session_utils import (
23
  on_load,
@@ -53,106 +55,15 @@ reset_all_active_session_hash_code()
53
  theme,css_style = get_custom_theme()
54
 
55
  # logger.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
56
- from app.canary_speech_engine import CanarySpeechEngine,CanaryConfig
57
- from app.silero_vad_engine import Silero_Vad_Engine
58
- from app.streaming_audio_processor import StreamingAudioProcessor,StreamingAudioProcessorConfig
 
 
 
59
 
60
 
61
- asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/canary-1b-v2")
62
- streaming_audio_processor_config = StreamingAudioProcessorConfig(
63
- read_size=4000,
64
- silence_threshold_chunks=1
65
- )
66
 
67
- @spaces.GPU
68
- def task(session_id: str,
69
- task_type, lang_source, lang_target,
70
- chunk_secs, left_context_secs, right_context_secs,
71
- streaming_policy, alignatt_thr, waitk_lagging,
72
- exclude_sink_frames, xatt_scores_layer, hallucinations_detector
73
- ):
74
- """Continuously read and delete .npz chunks while task is active."""
75
- yield f"initializing the CanarySpeechEngine and Silero_Vad_Engine\n\n"
76
- # initialize the CanarySpeechEngine and Silero_Vad_Engine
77
- conf = CanaryConfig.from_params(
78
- task_type, lang_source, lang_target,
79
- chunk_secs, left_context_secs, right_context_secs,
80
- streaming_policy, alignatt_thr, waitk_lagging,
81
- exclude_sink_frames, xatt_scores_layer, hallucinations_detector
82
- )
83
- canary_speech_engine = CanarySpeechEngine(asr_model,conf)
84
- silero_vad_engine = Silero_Vad_Engine()
85
- streamer = StreamingAudioProcessor(speech_engine=canary_speech_engine,vad_engine=silero_vad_engine,cfg=streaming_audio_processor_config)
86
- yield f"initialized the CanarySpeechEngine and Silero_Vad_Engine\n\n"
87
- yield f"Task started for session {session_id}\n\n"
88
- active_flag = get_active_task_flag_file(session_id)
89
- with open(active_flag, "w") as f:
90
- f.write("1")
91
- chunk_dir = get_folder_chunks(session_id)
92
- logging.info(f"[{session_id}] task started. {chunk_dir}")
93
-
94
-
95
- try:
96
- logging.info(f"[{session_id}] task loop started.")
97
- yield f"Task started for session {session_id}\n\n"
98
- while os.path.exists(active_flag):
99
- if not os.path.exists(chunk_dir):
100
- logging.warning(f"[{session_id}] No chunk directory found for task.")
101
- yield "No audio chunks yet... waiting for stream.\n"
102
- time.sleep(0.1)
103
- continue
104
- files = sorted(f for f in os.listdir(chunk_dir) if f.endswith(".npz"))
105
- if not files:
106
- time.sleep(0.1)
107
- continue
108
-
109
- for fname in files:
110
- fpath = os.path.join(chunk_dir, fname)
111
- try:
112
- npz = np.load(fpath)
113
- samples = npz["data"]
114
- rate = int(npz["rate"])
115
-
116
- text = f"Transcribed {fname}: {len(samples)} samples @ {rate}Hz"
117
- new_texts = streamer.process_chunk(samples)
118
- for text in new_texts:
119
- print(text, end='', flush=True)
120
- yield f"{text}"
121
- logging.debug(f"[{session_id}] {new_texts}")
122
- # yield f"{text}\n"
123
- os.remove(fpath)
124
- logging.debug(f"[{session_id}] Deleted processed chunk: {fname}")
125
- except Exception as e:
126
- logging.error(f"[{session_id}] Error processing {fname}: {e}")
127
- yield f"Error processing {fname}: {e}\n"
128
- continue
129
-
130
- time.sleep(0.1)
131
- # raise_function()
132
- final_text = streamer.finalize_stream()
133
- if final_text:
134
- print(final_text, end='', flush=True)
135
- yield f"\n{final_text}"
136
- # yield f"\n"
137
- logging.info(f"[{session_id}] task loop ended (flag removed).")
138
-
139
- except Exception as e:
140
- logging.error(f"[{session_id}] task error: {e}", exc_info=True)
141
- yield f"Unexpected error: {e}\n"
142
- finally:
143
- # active_flag = os.path.join(TMP_DIR, f"transcribe_active_{session_id}.txt")
144
- if os.path.exists(active_flag):
145
- os.remove(active_flag)
146
- logging.info(f"[{session_id}] task stopped.")
147
- try:
148
- if os.path.exists(chunk_dir) and not os.listdir(chunk_dir):
149
- os.rmdir(chunk_dir)
150
- logging.debug(f"[{session_id}] Cleaned up empty chunk dir.")
151
- except Exception as e:
152
- logging.error(f"[{session_id}] Cleanup error: {e}")
153
- yield "\nCleanup error: {e}"
154
- logging.info(f"[{session_id}] Exiting task loop.")
155
- yield "\nTask finished and cleaned up.\n"
156
 
157
 
158
  with gr.Blocks(theme=theme, css=css_style) as demo:
@@ -227,46 +138,60 @@ with gr.Blocks(theme=theme, css=css_style) as demo:
227
  modality="audio",
228
  rtc_configuration=generate_coturn_config(),
229
  visible=True,
230
- inputs=main_audio
231
  )
232
- start_stream_button = gr.Button("Start Streaming")
233
 
234
  webrtc_stream.stream(
235
  fn=read_and_stream_audio,
236
- inputs=[active_filepath, session_hash_code, stop_streaming_flags,gr.State(streaming_audio_processor_config.read_size)],
237
  outputs=[webrtc_stream],
238
  trigger=start_stream_button.click,
239
  concurrency_id="audio_stream",
240
  concurrency_limit=10,
241
  )
242
  status_message_stream = gr.Markdown("", elem_id="status-message-stream", visible=False)
243
- go_to_config = gr.Button("Go to Configuration", visible=False)
244
  go_to_config.click(lambda: gr.Walkthrough(selected=2), outputs=walkthrough)
245
 
246
  # === STEP 3 ===
247
  with gr.Step("Configuration", id=2):
248
- gr.Markdown("## Step 3: Configure the Task")
249
-
250
- task_type = gr.Radio(["Transcription", "Translation"], value="Transcription", label="Task Type")
251
- lang_source = gr.Dropdown(list(SUPPORTED_LANGS_MAP.keys()), value="French", label="Source Language")
252
- lang_target = gr.Dropdown(list(SUPPORTED_LANGS_MAP.keys()), value="English", label="Target Language", visible=False)
253
-
 
254
  with gr.Accordion("Advanced Configuration", open=False):
255
- chunk_secs = gr.Number(value=1.0, label="chunk_secs", precision=1)
256
- left_context_secs = gr.Number(value=20.0, label="left_context_secs", precision=1)
257
- right_context_secs = gr.Number(value=0.5, label="right_context_secs", precision=1)
258
- streaming_policy = gr.Dropdown(["waitk", "alignatt"], value="waitk", label="decoding.streaming_policy")
259
- alignatt_thr = gr.Number(value=8, label="alignatt_thr", precision=0)
260
- waitk_lagging = gr.Number(value=2, label="waitk_lagging", precision=0)
261
- exclude_sink_frames = gr.Number(value=8, label="exclude_sink_frames", precision=0)
262
- xatt_scores_layer = gr.Number(value=-2, label="xatt_scores_layer", precision=0)
263
- hallucinations_detector = gr.Checkbox(value=True, label="hallucinations_detector")
264
-
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  with gr.Row():
266
  auto_apply_presets = gr.Checkbox(value=True, label="Auto-apply presets for sample audios")
267
  reset_btn = gr.Button("Reset to defaults")
268
-
269
- summary_box = gr.Textbox(label="Configuration Summary", lines=10, interactive=False)
270
 
271
  # --- Events ---
272
  task_type.change(
@@ -323,37 +248,47 @@ with gr.Blocks(theme=theme, css=css_style) as demo:
323
 
324
  # === STEP 4 ===
325
  with gr.Step("Task", id=3) as task_step:
326
- gr.Markdown("## Step 4: Start the Task")
327
- with gr.Group():
328
- with gr.Column():
329
- status_slider = gr.Slider(
330
- 0, 100,
331
- value=0,
332
- label="Streaming Progress",
333
- interactive=False,
334
- visible=False
335
- )
336
-
337
- stop_stream_button = gr.Button("Stop Streaming", visible=False)
338
-
339
- transcription_output = gr.Textbox(
 
 
 
 
 
 
 
340
  label="Transcription / Translation Result",
341
- placeholder="Waiting for output...",
342
  lines=10,
343
  max_lines= 10,
344
  interactive=False,
345
  visible=True,
346
- autoscroll=True
347
- )
348
-
349
- start_task_button = gr.Button("Start Task", visible=True)
350
- stop_task_button = gr.Button("Stop Task", visible=False)
 
 
 
351
 
352
  stop_stream_button.click(
353
- fn=stop_streaming,
354
- inputs=[session_hash_code, stop_streaming_flags],
355
- outputs=[stop_streaming_flags],
356
- )
357
 
358
  def stop_task_fn(session_hash_code):
359
  transcribe_active = get_active_task_flag_file(session_hash_code)
@@ -365,47 +300,83 @@ with gr.Blocks(theme=theme, css=css_style) as demo:
365
  stop_task_button.click(
366
  fn=stop_task_fn,
367
  inputs=session_hash_code,
368
- outputs=transcription_output
369
- )
370
- # task(session_hash_code)
371
-
 
 
 
372
  def start_transcription(
373
  session_hash_code, stop_streaming_flags,
374
  task_type, lang_source, lang_target,
375
  chunk_secs, left_context_secs, right_context_secs,
376
  streaming_policy, alignatt_thr, waitk_lagging,
377
  exclude_sink_frames, xatt_scores_layer, hallucinations_detector
378
- ):
379
  """Stream transcription or translation results in real time."""
380
 
381
  accumulated = ""
382
  yield f"Starting {task_type.lower()}...\n\n",gr.update(visible=False),gr.update(visible=True)
383
 
 
384
  # Boucle sur le générateur de `task()`
385
- for msg in task(
386
- session_hash_code,
387
- task_type, lang_source, lang_target,
388
- chunk_secs, left_context_secs, right_context_secs,
389
- streaming_policy, alignatt_thr, waitk_lagging,
390
- exclude_sink_frames, xatt_scores_layer, hallucinations_detector
391
- ):
392
  accumulated += msg
393
  yield accumulated,gr.update(visible=False),gr.update(visible=True)
394
 
395
  yield accumulated + "\nDone.",gr.update(visible=True),gr.update(visible=False)
396
 
397
- start_task_button.click(
398
- fn=start_transcription,
399
- inputs=[
400
- session_hash_code, stop_streaming_flags,
401
  task_type, lang_source, lang_target,
402
  chunk_secs, left_context_secs, right_context_secs,
403
  streaming_policy, alignatt_thr, waitk_lagging,
404
  exclude_sink_frames, xatt_scores_layer, hallucinations_detector
405
- ],
406
- outputs=[transcription_output,start_task_button,stop_task_button]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
407
  )
408
 
 
 
 
 
 
 
 
 
 
 
 
 
409
  ui_components = [
410
  start_stream_button, stop_stream_button,
411
  go_to_config, audio_source_step, status_slider,walkthrough,status_message_stream
@@ -419,19 +390,19 @@ with gr.Blocks(theme=theme, css=css_style) as demo:
419
  concurrency_limit=10,
420
  )
421
 
422
- # def toggle_task_buttons():
423
- # return (
424
- # gr.update(visible=False),
425
- # gr.update(visible=True),
426
- # gr.update(visible=True)
427
- # )
428
-
429
- # start_task_button.click(
430
- # fn=toggle_task_buttons,
431
- # inputs=None,
432
- # outputs=[start_task_button, stop_task_button, stop_stream_button],
433
- # queue=False
434
- # )
435
 
436
 
437
  if __name__ == "__main__":
 
14
 
15
  from app.utils import (
16
  raise_function,
17
+ READ_SIZE,
18
  generate_coturn_config,
19
  read_and_stream_audio,
20
  stop_streaming,
21
+ task,
22
+ task_fake
23
  )
24
  from app.session_utils import (
25
  on_load,
 
55
  theme,css_style = get_custom_theme()
56
 
57
  # logger.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
58
+
59
+ from app.streaming_audio_processor import StreamingAudioProcessorConfig
60
+
61
+
62
+ # asr_model = None
63
+
64
 
65
 
 
 
 
 
 
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
 
69
  with gr.Blocks(theme=theme, css=css_style) as demo:
 
138
  modality="audio",
139
  rtc_configuration=generate_coturn_config(),
140
  visible=True,
141
+ inputs=main_audio,
142
  )
143
+ start_stream_button = gr.Button("▶️ Start Streaming", variant="primary")
144
 
145
  webrtc_stream.stream(
146
  fn=read_and_stream_audio,
147
+ inputs=[active_filepath, session_hash_code, stop_streaming_flags,gr.State(READ_SIZE)],
148
  outputs=[webrtc_stream],
149
  trigger=start_stream_button.click,
150
  concurrency_id="audio_stream",
151
  concurrency_limit=10,
152
  )
153
  status_message_stream = gr.Markdown("", elem_id="status-message-stream", visible=False)
154
+ go_to_config = gr.Button("Go to Configuration", visible=False, variant="secondary")
155
  go_to_config.click(lambda: gr.Walkthrough(selected=2), outputs=walkthrough)
156
 
157
  # === STEP 3 ===
158
  with gr.Step("Configuration", id=2):
159
+ gr.Markdown("### Step 3: Configure the Task")
160
+ with gr.Group():
161
+ with gr.Row():
162
+ task_type = gr.Radio(["Transcription", "Translation"], value="Transcription", label="Task Type")
163
+ with gr.Row():
164
+ lang_source = gr.Dropdown(list(SUPPORTED_LANGS_MAP.keys()), value="French", label="Source Language")
165
+ lang_target = gr.Dropdown(list(SUPPORTED_LANGS_MAP.keys()), value="English", label="Target Language", visible=False)
166
  with gr.Accordion("Advanced Configuration", open=False):
167
+ with gr.Group():
168
+ with gr.Row():
169
+ gr.Markdown("##### Chunks ")
170
+ with gr.Row():
171
+ left_context_secs = gr.Slider(value=20.0, label="left_context_secs",info="Streaming chunk duration in seconds (left context)", minimum=1.0, maximum=60.0, step=1.0, show_reset_button=False)
172
+ chunk_secs = gr.Slider(value=1.0, label="chunk_secs", info="Streaming chunk duration in seconds (chunk)", minimum=0.1, maximum=5.0, step=0.1, show_reset_button=False)
173
+ right_context_secs = gr.Slider(value=0.5, label="right_context_secs", info="Streaming chunk duration in seconds (right context)", minimum=0.1, maximum=10.0, step=0.1, show_reset_button=False)
174
+ gr.Markdown("---")
175
+ with gr.Group():
176
+ with gr.Row():
177
+ gr.Markdown("##### Decoding ")
178
+ with gr.Row():
179
+ streaming_policy = gr.Dropdown(["waitk", "alignatt"], value="waitk", label="streaming_policy", elem_classes="full-width",
180
+ info="“Wait-k: Higher accuracy, requires larger left context, higher latency” \n”AlignAtt: Lower latency, suitable for production, predicts multiple tokens per chunk”")
181
+
182
+ with gr.Row():
183
+ alignatt_thr = gr.Number(value=8, label="alignatt_thr", info="Cross-attention threshold for AlignAtt policy (default: 8), alignatt only", precision=0)
184
+ waitk_lagging = gr.Number(value=2, label="waitk_lagging", info="Number of chunks to wait in the beginning (default: 2), works for both policies", precision=0)
185
+ with gr.Row():
186
+ exclude_sink_frames = gr.Number(value=8, label="exclude_sink_frames", info="Number of frames to exclude from the xatt scores calculation (default: 8), alignatt only", precision=0)
187
+ xatt_scores_layer = gr.Number(value=-2, label="xatt_scores_layer", info="Layer to get cross-attention (xatt) scores from (default: -2), alignatt only", precision=0)
188
+ with gr.Row():
189
+ hallucinations_detector = gr.Checkbox(value=True, label="hallucinations_detector" , info="Detect hallucinations in the predicted tokens (default: True), works for both policies" )
190
  with gr.Row():
191
  auto_apply_presets = gr.Checkbox(value=True, label="Auto-apply presets for sample audios")
192
  reset_btn = gr.Button("Reset to defaults")
193
+ with gr.Accordion("Configuration Summary", open=False):
194
+ summary_box = gr.Textbox(lines=15, interactive=False,show_label=False)
195
 
196
  # --- Events ---
197
  task_type.change(
 
248
 
249
  # === STEP 4 ===
250
  with gr.Step("Task", id=3) as task_step:
251
+ with gr.Row():
252
+ gr.Markdown("## Step 4: Start the Task")
253
+ with gr.Row():
254
+ with gr.Column():
255
+ status_slider = gr.Slider(
256
+ 0, 100,
257
+ value=0,
258
+ label="Streaming Progress",
259
+ show_label=True,
260
+ interactive=False,
261
+ visible=False,
262
+ show_reset_button=False
263
+ )
264
+ stop_stream_button = gr.Button("⏹️ Stop Streaming", visible=False,variant="stop")
265
+ with gr.Row():
266
+ gr.Markdown("---")
267
+ with gr.Row():
268
+ gr.Markdown("##### Transcription / Translation Result")
269
+ with gr.Row():
270
+
271
+ task_output = gr.Textbox(
272
  label="Transcription / Translation Result",
273
+ show_label=False,
274
  lines=10,
275
  max_lines= 10,
276
  interactive=False,
277
  visible=True,
278
+ autoscroll=True,
279
+ elem_id="task-output-box"
280
+ )
281
+ with gr.Row():
282
+ status_message_task = gr.Markdown("", elem_id="status-message-task",elem_classes=["info"], visible=False)
283
+ with gr.Row():
284
+ start_task_button = gr.Button("▶️ Start Task", visible=True, variant="primary")
285
+ stop_task_button = gr.Button("⏹️ Stop Task", visible=False,variant="stop")
286
 
287
  stop_stream_button.click(
288
+ fn=stop_streaming,
289
+ inputs=[session_hash_code, stop_streaming_flags],
290
+ outputs=[stop_streaming_flags],
291
+ )
292
 
293
  def stop_task_fn(session_hash_code):
294
  transcribe_active = get_active_task_flag_file(session_hash_code)
 
300
  stop_task_button.click(
301
  fn=stop_task_fn,
302
  inputs=session_hash_code,
303
+ outputs=task_output
304
+ )
305
+ # task(session_hash_code)
306
+ config_task_ui = [session_hash_code,task_type, lang_source, lang_target,
307
+ chunk_secs, left_context_secs, right_context_secs,
308
+ streaming_policy, alignatt_thr, waitk_lagging,
309
+ exclude_sink_frames, xatt_scores_layer, hallucinations_detector]
310
  def start_transcription(
311
  session_hash_code, stop_streaming_flags,
312
  task_type, lang_source, lang_target,
313
  chunk_secs, left_context_secs, right_context_secs,
314
  streaming_policy, alignatt_thr, waitk_lagging,
315
  exclude_sink_frames, xatt_scores_layer, hallucinations_detector
316
+ ):
317
  """Stream transcription or translation results in real time."""
318
 
319
  accumulated = ""
320
  yield f"Starting {task_type.lower()}...\n\n",gr.update(visible=False),gr.update(visible=True)
321
 
322
+
323
  # Boucle sur le générateur de `task()`
324
+ for msg in task(session_hash_code,config_task_ui):
 
 
 
 
 
 
325
  accumulated += msg
326
  yield accumulated,gr.update(visible=False),gr.update(visible=True)
327
 
328
  yield accumulated + "\nDone.",gr.update(visible=True),gr.update(visible=False)
329
 
330
+
331
+ def start_task(
332
+ session_hash_code,
 
333
  task_type, lang_source, lang_target,
334
  chunk_secs, left_context_secs, right_context_secs,
335
  streaming_policy, alignatt_thr, waitk_lagging,
336
  exclude_sink_frames, xatt_scores_layer, hallucinations_detector
337
+ ):
338
+ """Stream transcription or translation results in real time."""
339
+ accumulated = ""
340
+ # Boucle sur le générateur de `task2()`
341
+ for result, status, current_chunk in task_fake(
342
+ session_hash_code,
343
+ task_type, lang_source, lang_target,
344
+ chunk_secs, left_context_secs, right_context_secs,
345
+ streaming_policy, alignatt_thr, waitk_lagging,
346
+ exclude_sink_frames, xatt_scores_layer, hallucinations_detector
347
+ ):
348
+ if status == "success":
349
+ yield accumulated + result, gr.update(visible=True,value=current_chunk , elem_classes=["info"]), gr.update(visible=False), gr.update(visible=True)
350
+ accumulated += result
351
+ elif status in ["error", "warning", "info", "done"]:
352
+ yield accumulated, gr.update(visible=True,value=result , elem_classes=[status]), gr.update(visible=True), gr.update(visible=False)
353
+
354
+
355
+ start_task_button.click(
356
+ fn=start_task,
357
+ inputs=[
358
+ session_hash_code,
359
+ task_type, lang_source, lang_target,
360
+ chunk_secs, left_context_secs, right_context_secs,
361
+ streaming_policy, alignatt_thr, waitk_lagging,
362
+ exclude_sink_frames, xatt_scores_layer, hallucinations_detector
363
+
364
+ ],
365
+ outputs=[task_output,status_message_task,start_task_button,stop_task_button]
366
  )
367
 
368
+ # start_task_button.click(
369
+ # fn=start_task,
370
+ # inputs=[
371
+ # session_hash_code, stop_streaming_flags,
372
+ # task_type, lang_source, lang_target,
373
+ # chunk_secs, left_context_secs, right_context_secs,
374
+ # streaming_policy, alignatt_thr, waitk_lagging,
375
+ # exclude_sink_frames, xatt_scores_layer, hallucinations_detector
376
+ # ],
377
+ # outputs=[task_output,status_message_task,start_task_button,stop_task_button]
378
+ # )
379
+
380
  ui_components = [
381
  start_stream_button, stop_stream_button,
382
  go_to_config, audio_source_step, status_slider,walkthrough,status_message_stream
 
390
  concurrency_limit=10,
391
  )
392
 
393
+ # def toggle_task_buttons():
394
+ # return (
395
+ # gr.update(visible=False),
396
+ # gr.update(visible=True),
397
+ # gr.update(visible=True)
398
+ # )
399
+
400
+ # start_task_button.click(
401
+ # fn=toggle_task_buttons,
402
+ # inputs=None,
403
+ # outputs=[start_task_button, stop_task_button, stop_stream_button],
404
+ # queue=False
405
+ # )
406
 
407
 
408
  if __name__ == "__main__":
app/canary_speech_engine.py CHANGED
@@ -125,6 +125,7 @@ class CanaryConfig:
125
  """Create a CanaryConfig instance from parameters"""
126
  # Convert task type to model task
127
  task = "asr" if task_type == "Transcription" else "ast"
 
128
 
129
  return cls(
130
  chunk_secs=chunk_secs,
@@ -158,6 +159,7 @@ class CanarySpeechEngine(IStreamingSpeechEngine):
158
  Args:
159
  cfg: An OmegaConf object containing 'model' and 'streaming' configs.
160
  """
 
161
  self.cfg = cfg.toOmegaConf() # Store the full config
162
 
163
  # Setup device and dtype from config
 
125
  """Create a CanaryConfig instance from parameters"""
126
  # Convert task type to model task
127
  task = "asr" if task_type == "Transcription" else "ast"
128
+ target_lang = source_lang if task_type == "Transcription" else target_lang
129
 
130
  return cls(
131
  chunk_secs=chunk_secs,
 
159
  Args:
160
  cfg: An OmegaConf object containing 'model' and 'streaming' configs.
161
  """
162
+ logging.debug(f"Initializing CanarySpeechEngine with config: {cfg}")
163
  self.cfg = cfg.toOmegaConf() # Store the full config
164
 
165
  # Setup device and dtype from config
app/ui_utils.py CHANGED
@@ -26,10 +26,10 @@ EXAMPLE_CONFIGS = {
26
  "exclude_sink_frames": 8, "xatt_scores_layer": -2, "hallucinations_detector": True
27
  },
28
  "data/french_news.wav": {
29
- "task_type": "Transcription", "lang_source": "French", "lang_target": "English",
30
- "chunk_secs": 1.0, "left_context_secs": 15.0, "right_context_secs": 0.3,
31
- "streaming_policy": "alignatt", "alignatt_thr": 10, "waitk_lagging": 3,
32
- "exclude_sink_frames": 6, "xatt_scores_layer": -1, "hallucinations_detector": True
33
  },
34
  "data/spanish_podcast.wav": {
35
  "task_type": "Translation", "lang_source": "Spanish", "lang_target": "English",
 
26
  "exclude_sink_frames": 8, "xatt_scores_layer": -2, "hallucinations_detector": True
27
  },
28
  "data/french_news.wav": {
29
+ "task_type": "Transcription", "lang_source": "French", "lang_target": "French",
30
+ "chunk_secs": 1.0, "left_context_secs": 15.0, "right_context_secs": 0.5,
31
+ "streaming_policy": "alignatt", "alignatt_thr": 8.0, "waitk_lagging": 3,
32
+ "exclude_sink_frames": 8, "xatt_scores_layer": -2, "hallucinations_detector": True
33
  },
34
  "data/spanish_podcast.wav": {
35
  "task_type": "Translation", "lang_source": "Spanish", "lang_target": "English",
app/utils.py CHANGED
@@ -18,7 +18,14 @@ from app.session_utils import (
18
  get_active_task_flag_file,
19
  get_folder_chunks
20
  )
21
-
 
 
 
 
 
 
 
22
 
23
  # --------------------------------------------------------
24
  # Utility functions
@@ -120,6 +127,216 @@ def read_and_stream_audio(filepath_to_stream: str, session_id: str, stop_streami
120
 
121
 
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  def handle_stream_error(session_id: str, error: Exception | str, stop_streaming_flags: dict | None = None):
124
  """
125
  Handle streaming errors:
 
18
  get_active_task_flag_file,
19
  get_folder_chunks
20
  )
21
+ from app.ui_utils import (
22
+ SUPPORTED_LANGS_MAP
23
+ )
24
+ from app.canary_speech_engine import CanarySpeechEngine,CanaryConfig
25
+ from app.silero_vad_engine import Silero_Vad_Engine
26
+ from app.streaming_audio_processor import StreamingAudioProcessor,StreamingAudioProcessorConfig
27
+ import nemo.collections.asr as nemo_asr
28
+ READ_SIZE=4000
29
 
30
  # --------------------------------------------------------
31
  # Utility functions
 
127
 
128
 
129
 
130
+ # asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/canary-1b-v2")
131
+ asr_model = None
132
+
133
+ @spaces.GPU
134
+ def task_fake(session_id: str,
135
+ task_type, lang_source, lang_target,
136
+ chunk_secs, left_context_secs, right_context_secs,
137
+ streaming_policy, alignatt_thr, waitk_lagging,
138
+ exclude_sink_frames, xatt_scores_layer, hallucinations_detector
139
+ ):
140
+ """Continuously read and delete .npz chunks while task is active."""
141
+ global asr_model
142
+ yield ("initializing the CanarySpeechEngine and Silero_Vad_Engine", "info", None)
143
+ ### TODO
144
+ ##-----------
145
+ # conf = CanaryConfig.from_params(
146
+ # task_type, SUPPORTED_LANGS_MAP.get(lang_source),SUPPORTED_LANGS_MAP.get(lang_target) ,
147
+ # chunk_secs, left_context_secs, right_context_secs,
148
+ # streaming_policy, alignatt_thr, waitk_lagging,
149
+ # exclude_sink_frames, xatt_scores_layer, hallucinations_detector
150
+ # )
151
+
152
+ # canary_speech_engine = CanarySpeechEngine(asr_model,conf)
153
+ # silero_vad_engine = Silero_Vad_Engine()
154
+ # streaming_audio_processor_config = StreamingAudioProcessorConfig(
155
+ # read_size=READ_SIZE,
156
+ # silence_threshold_chunks=1
157
+ # )
158
+ # streamer = StreamingAudioProcessor(speech_engine=canary_speech_engine,vad_engine=silero_vad_engine,cfg=streaming_audio_processor_config)
159
+ ##-----------
160
+ yield ("initialized the CanarySpeechEngine and Silero_Vad_Engine", "info", None)
161
+ yield (f"Task started for session {session_id}", "info", None)
162
+
163
+ active_flag = get_active_task_flag_file(session_id)
164
+ with open(active_flag, "w") as f:
165
+ f.write("1")
166
+ chunk_dir = get_folder_chunks(session_id)
167
+ logging.info(f"[{session_id}] task started. {chunk_dir}")
168
+
169
+ try:
170
+ logging.info(f"[{session_id}] task loop started.")
171
+ yield (f"Task started for session {session_id}", "info", None)
172
+
173
+ while os.path.exists(active_flag):
174
+ if not os.path.exists(chunk_dir):
175
+ logging.warning(f"[{session_id}] No chunk directory found for task.")
176
+ yield ("No audio chunks yet... waiting for stream.", "warning", None)
177
+ time.sleep(0.1)
178
+ continue
179
+
180
+ files = sorted(f for f in os.listdir(chunk_dir) if f.endswith(".npz"))
181
+ if not files:
182
+ time.sleep(0.1)
183
+ continue
184
+
185
+ for fname in files:
186
+ fpath = os.path.join(chunk_dir, fname)
187
+ try:
188
+ npz = np.load(fpath)
189
+ samples = npz["data"]
190
+ rate = int(npz["rate"])
191
+ ##-----------
192
+ # new_texts = streamer.process_chunk(samples)
193
+ # for text in new_texts:
194
+ # print(text, end='', flush=True)
195
+ # yield (text, "success", text)
196
+ # logging.debug(f"[{session_id}] {new_texts}")
197
+ ##-----------
198
+ ### TODO
199
+ text = f"Transcribed {fname}: {len(samples)} samples @ {rate}Hz\n"
200
+ yield (text, "success", fname)
201
+ os.remove(fpath)
202
+ logging.debug(f"[{session_id}] Deleted processed chunk: {fname}")
203
+ except Exception as e:
204
+ logging.warning(f"[{session_id}] Error processing {fname}: {e}")
205
+ yield (f"Error processing {fname}: {e}", "warning", fname)
206
+ continue
207
+ time.sleep(0.1)
208
+
209
+ # TODO
210
+ ##-----------
211
+ # final_text = streamer.finalize_stream()
212
+ # yield (text, "success", final_text)
213
+ ##-----------
214
+ yield ("DONE", "done", None)
215
+ logging.info(f"[{session_id}] task loop ended (flag removed).")
216
+
217
+ except Exception as e:
218
+ logging.error(f"[{session_id}] task error: {e}", exc_info=True)
219
+ yield (f"Unexpected error: {e}", "error", None)
220
+
221
+ finally:
222
+ if os.path.exists(active_flag):
223
+ os.remove(active_flag)
224
+ logging.info(f"[{session_id}] task stopped.")
225
+
226
+ try:
227
+ if os.path.exists(chunk_dir) and not os.listdir(chunk_dir):
228
+ os.rmdir(chunk_dir)
229
+ logging.debug(f"[{session_id}] Cleaned up empty chunk dir.")
230
+ except Exception as e:
231
+ logging.error(f"[{session_id}] Cleanup error: {e}")
232
+ yield (f"Cleanup error: {e}", "error", None)
233
+
234
+ logging.info(f"[{session_id}] Exiting task loop.")
235
+ yield ("Task finished and cleaned up.", "done", None)
236
+
237
+
238
+
239
+ def task(session_id: str,
240
+ task_type, lang_source, lang_target,
241
+ chunk_secs, left_context_secs, right_context_secs,
242
+ streaming_policy, alignatt_thr, waitk_lagging,
243
+ exclude_sink_frames, xatt_scores_layer, hallucinations_detector
244
+ ):
245
+ """Continuously read and delete .npz chunks while task is active."""
246
+ global asr_model
247
+ yield ("initializing the CanarySpeechEngine and Silero_Vad_Engine", "info", None)
248
+ conf = CanaryConfig.from_params(
249
+ task_type, SUPPORTED_LANGS_MAP.get(lang_source),SUPPORTED_LANGS_MAP.get(lang_target) ,
250
+ chunk_secs, left_context_secs, right_context_secs,
251
+ streaming_policy, alignatt_thr, waitk_lagging,
252
+ exclude_sink_frames, xatt_scores_layer, hallucinations_detector
253
+ )
254
+
255
+ canary_speech_engine = CanarySpeechEngine(asr_model,conf)
256
+ silero_vad_engine = Silero_Vad_Engine()
257
+ streaming_audio_processor_config = StreamingAudioProcessorConfig(
258
+ read_size=READ_SIZE,
259
+ silence_threshold_chunks=1
260
+ )
261
+ streamer = StreamingAudioProcessor(speech_engine=canary_speech_engine,vad_engine=silero_vad_engine,cfg=streaming_audio_processor_config)
262
+ yield ("initialized the CanarySpeechEngine and Silero_Vad_Engine", "info", None)
263
+ yield (f"Task started for session {session_id}", "info", None)
264
+
265
+ active_flag = get_active_task_flag_file(session_id)
266
+ with open(active_flag, "w") as f:
267
+ f.write("1")
268
+ chunk_dir = get_folder_chunks(session_id)
269
+ logging.info(f"[{session_id}] task started. {chunk_dir}")
270
+
271
+ try:
272
+ logging.info(f"[{session_id}] task loop started.")
273
+ yield (f"Task started for session {session_id}", "info", None)
274
+
275
+ while os.path.exists(active_flag):
276
+ if not os.path.exists(chunk_dir):
277
+ logging.warning(f"[{session_id}] No chunk directory found for task.")
278
+ yield ("No audio chunks yet... waiting for stream.", "warning", None)
279
+ time.sleep(0.1)
280
+ continue
281
+
282
+ files = sorted(f for f in os.listdir(chunk_dir) if f.endswith(".npz"))
283
+ if not files:
284
+ time.sleep(0.1)
285
+ continue
286
+
287
+ for fname in files:
288
+ fpath = os.path.join(chunk_dir, fname)
289
+ try:
290
+ npz = np.load(fpath)
291
+ samples = npz["data"]
292
+ rate = int(npz["rate"])
293
+ new_texts = streamer.process_chunk(samples)
294
+ for text in new_texts:
295
+ print(text, end='', flush=True)
296
+ yield (text, "success", text)
297
+ logging.debug(f"[{session_id}] {new_texts}")
298
+ ### TODO
299
+ # text = f"Transcribed {fname}: {len(samples)} samples @ {rate}Hz\n"
300
+ # yield (text, "success", fname)
301
+ os.remove(fpath)
302
+ logging.debug(f"[{session_id}] Deleted processed chunk: {fname}")
303
+ except Exception as e:
304
+ logging.warning(f"[{session_id}] Error processing {fname}: {e}")
305
+ yield (f"Error processing {fname}: {e}", "warning", fname)
306
+ continue
307
+ time.sleep(0.1)
308
+
309
+ # TODO
310
+ final_text = streamer.finalize_stream()
311
+ yield (text, "success", final_text)
312
+ # if final_text:
313
+ # print(final_text, end='', flush=True)
314
+ # yield f"\n{final_text}"
315
+ ##
316
+ yield ("DONE", "done", None)
317
+ logging.info(f"[{session_id}] task loop ended (flag removed).")
318
+
319
+ except Exception as e:
320
+ logging.error(f"[{session_id}] task error: {e}", exc_info=True)
321
+ yield (f"Unexpected error: {e}", "error", None)
322
+
323
+ finally:
324
+ if os.path.exists(active_flag):
325
+ os.remove(active_flag)
326
+ logging.info(f"[{session_id}] task stopped.")
327
+
328
+ try:
329
+ if os.path.exists(chunk_dir) and not os.listdir(chunk_dir):
330
+ os.rmdir(chunk_dir)
331
+ logging.debug(f"[{session_id}] Cleaned up empty chunk dir.")
332
+ except Exception as e:
333
+ logging.error(f"[{session_id}] Cleanup error: {e}")
334
+ yield (f"Cleanup error: {e}", "error", None)
335
+
336
+ logging.info(f"[{session_id}] Exiting task loop.")
337
+ yield ("Task finished and cleaned up.", "done", None)
338
+
339
+
340
  def handle_stream_error(session_id: str, error: Exception | str, stop_streaming_flags: dict | None = None):
341
  """
342
  Handle streaming errors:
assets/custom_style.css CHANGED
@@ -144,4 +144,55 @@ body {
144
  padding: 0.75rem;
145
  color: #991B1B;
146
  font-weight: 500;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  }
 
144
  padding: 0.75rem;
145
  color: #991B1B;
146
  font-weight: 500;
147
+ }
148
+
149
+ #status-message-task {
150
+ padding: 0.75rem;
151
+ border-radius: 8px; /* Coins arrondis */
152
+ margin-top: 10px;
153
+ font-weight: 500; /* Un peu plus gras que la normale */
154
+ border: 1px solid transparent;
155
+ display: none; /* Caché par défaut */
156
+ }
157
+
158
+ /* Le style .info (bleu) */
159
+ #status-message-task.info{
160
+ color: #0c5464; /* Texte bleu foncé */
161
+ background-color: #d1ecf1; /* Fond bleu clair */
162
+ border-color: #bee5eb; /* Bordure bleue */
163
+ display: block; /* Le rend visible */
164
+ }
165
+
166
+ /* Le style .warning (jaune/orange) */
167
+ #status-message-task.warning {
168
+ color: #856404; /* Texte ocre */
169
+ background-color: #fff3cd; /* Fond jaune clair */
170
+ border-color: #ffeeba; /* Bordure jaune */
171
+ display: block; /* Le rend visible */
172
+ }
173
+
174
+ /* Le style .error (rouge) */
175
+ #status-message-task.error {
176
+ color: #721c24; /* Texte rouge foncé */
177
+ background-color: #f8d7da; /* Fond rouge clair */
178
+ border-color: #f5c6cb; /* Bordure rouge */
179
+ display: block; /* Le rend visible */
180
+ }
181
+
182
+ /* Styles personnalisés pour le WebRTC */
183
+ #webcam-stream {
184
+ border: 2px solid #007bff;
185
+ border-radius: 10px;
186
+ box-shadow: 0 4px 8px rgba(0,0,0,0.1);
187
+ background-color: #f8f9fa;
188
+ margin: 10px 0;
189
+ }
190
+
191
+ #webcam-stream .gr-webRTC {
192
+ background-color: #e9ecef;
193
+ }
194
+
195
+ #task-output-box textarea {
196
+ font-size: 1.15em; /* 'Moyenne taille' - ajustez au besoin */
197
+ font-weight: bold; /* 'En gras' */
198
  }