Archime committed on
Commit
8417fa3
·
1 Parent(s): 7b84154

add task_fake

Browse files
Files changed (5) hide show
  1. app.py +146 -175
  2. app/canary_speech_engine.py +2 -0
  3. app/ui_utils.py +4 -4
  4. app/utils.py +218 -1
  5. assets/custom_style.css +51 -0
app.py CHANGED
@@ -14,10 +14,12 @@ from gradio.utils import get_space
14
 
15
  from app.utils import (
16
  raise_function,
 
17
  generate_coturn_config,
18
  read_and_stream_audio,
19
  stop_streaming,
20
- # task
 
21
  )
22
  from app.session_utils import (
23
  on_load,
@@ -53,106 +55,15 @@ reset_all_active_session_hash_code()
53
  theme,css_style = get_custom_theme()
54
 
55
  # logger.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
56
- from app.canary_speech_engine import CanarySpeechEngine,CanaryConfig
57
- from app.silero_vad_engine import Silero_Vad_Engine
58
- from app.streaming_audio_processor import StreamingAudioProcessor,StreamingAudioProcessorConfig
 
 
 
59
 
60
 
61
- asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/canary-1b-v2")
62
- streaming_audio_processor_config = StreamingAudioProcessorConfig(
63
- read_size=4000,
64
- silence_threshold_chunks=1
65
- )
66
 
67
- @spaces.GPU
68
- def task(session_id: str,
69
- task_type, lang_source, lang_target,
70
- chunk_secs, left_context_secs, right_context_secs,
71
- streaming_policy, alignatt_thr, waitk_lagging,
72
- exclude_sink_frames, xatt_scores_layer, hallucinations_detector
73
- ):
74
- """Continuously read and delete .npz chunks while task is active."""
75
- yield f"initializing the CanarySpeechEngine and Silero_Vad_Engine\n\n"
76
- # initialize the CanarySpeechEngine and Silero_Vad_Engine
77
- conf = CanaryConfig.from_params(
78
- task_type, lang_source, lang_target,
79
- chunk_secs, left_context_secs, right_context_secs,
80
- streaming_policy, alignatt_thr, waitk_lagging,
81
- exclude_sink_frames, xatt_scores_layer, hallucinations_detector
82
- )
83
- canary_speech_engine = CanarySpeechEngine(asr_model,conf)
84
- silero_vad_engine = Silero_Vad_Engine()
85
- streamer = StreamingAudioProcessor(speech_engine=canary_speech_engine,vad_engine=silero_vad_engine,cfg=streaming_audio_processor_config)
86
- yield f"initialized the CanarySpeechEngine and Silero_Vad_Engine\n\n"
87
- yield f"Task started for session {session_id}\n\n"
88
- active_flag = get_active_task_flag_file(session_id)
89
- with open(active_flag, "w") as f:
90
- f.write("1")
91
- chunk_dir = get_folder_chunks(session_id)
92
- logging.info(f"[{session_id}] task started. {chunk_dir}")
93
-
94
-
95
- try:
96
- logging.info(f"[{session_id}] task loop started.")
97
- yield f"Task started for session {session_id}\n\n"
98
- while os.path.exists(active_flag):
99
- if not os.path.exists(chunk_dir):
100
- logging.warning(f"[{session_id}] No chunk directory found for task.")
101
- yield "No audio chunks yet... waiting for stream.\n"
102
- time.sleep(0.1)
103
- continue
104
- files = sorted(f for f in os.listdir(chunk_dir) if f.endswith(".npz"))
105
- if not files:
106
- time.sleep(0.1)
107
- continue
108
-
109
- for fname in files:
110
- fpath = os.path.join(chunk_dir, fname)
111
- try:
112
- npz = np.load(fpath)
113
- samples = npz["data"]
114
- rate = int(npz["rate"])
115
-
116
- text = f"Transcribed {fname}: {len(samples)} samples @ {rate}Hz"
117
- new_texts = streamer.process_chunk(samples)
118
- for text in new_texts:
119
- print(text, end='', flush=True)
120
- yield f"{text}"
121
- logging.debug(f"[{session_id}] {new_texts}")
122
- # yield f"{text}\n"
123
- os.remove(fpath)
124
- logging.debug(f"[{session_id}] Deleted processed chunk: {fname}")
125
- except Exception as e:
126
- logging.error(f"[{session_id}] Error processing {fname}: {e}")
127
- yield f"Error processing {fname}: {e}\n"
128
- continue
129
-
130
- time.sleep(0.1)
131
- # raise_function()
132
- final_text = streamer.finalize_stream()
133
- if final_text:
134
- print(final_text, end='', flush=True)
135
- yield f"\n{final_text}"
136
- # yield f"\n"
137
- logging.info(f"[{session_id}] task loop ended (flag removed).")
138
-
139
- except Exception as e:
140
- logging.error(f"[{session_id}] task error: {e}", exc_info=True)
141
- yield f"Unexpected error: {e}\n"
142
- finally:
143
- # active_flag = os.path.join(TMP_DIR, f"transcribe_active_{session_id}.txt")
144
- if os.path.exists(active_flag):
145
- os.remove(active_flag)
146
- logging.info(f"[{session_id}] task stopped.")
147
- try:
148
- if os.path.exists(chunk_dir) and not os.listdir(chunk_dir):
149
- os.rmdir(chunk_dir)
150
- logging.debug(f"[{session_id}] Cleaned up empty chunk dir.")
151
- except Exception as e:
152
- logging.error(f"[{session_id}] Cleanup error: {e}")
153
- yield "\nCleanup error: {e}"
154
- logging.info(f"[{session_id}] Exiting task loop.")
155
- yield "\nTask finished and cleaned up.\n"
156
 
157
 
158
  with gr.Blocks(theme=theme, css=css_style) as demo:
@@ -227,46 +138,60 @@ with gr.Blocks(theme=theme, css=css_style) as demo:
227
  modality="audio",
228
  rtc_configuration=generate_coturn_config(),
229
  visible=True,
230
- inputs=main_audio
231
  )
232
- start_stream_button = gr.Button("Start Streaming")
233
 
234
  webrtc_stream.stream(
235
  fn=read_and_stream_audio,
236
- inputs=[active_filepath, session_hash_code, stop_streaming_flags,gr.State(streaming_audio_processor_config.read_size)],
237
  outputs=[webrtc_stream],
238
  trigger=start_stream_button.click,
239
  concurrency_id="audio_stream",
240
  concurrency_limit=10,
241
  )
242
  status_message_stream = gr.Markdown("", elem_id="status-message-stream", visible=False)
243
- go_to_config = gr.Button("Go to Configuration", visible=False)
244
  go_to_config.click(lambda: gr.Walkthrough(selected=2), outputs=walkthrough)
245
 
246
  # === STEP 3 ===
247
  with gr.Step("Configuration", id=2):
248
- gr.Markdown("## Step 3: Configure the Task")
249
-
250
- task_type = gr.Radio(["Transcription", "Translation"], value="Transcription", label="Task Type")
251
- lang_source = gr.Dropdown(list(SUPPORTED_LANGS_MAP.keys()), value="French", label="Source Language")
252
- lang_target = gr.Dropdown(list(SUPPORTED_LANGS_MAP.keys()), value="English", label="Target Language", visible=False)
253
-
 
254
  with gr.Accordion("Advanced Configuration", open=False):
255
- chunk_secs = gr.Number(value=1.0, label="chunk_secs", precision=1)
256
- left_context_secs = gr.Number(value=20.0, label="left_context_secs", precision=1)
257
- right_context_secs = gr.Number(value=0.5, label="right_context_secs", precision=1)
258
- streaming_policy = gr.Dropdown(["waitk", "alignatt"], value="waitk", label="decoding.streaming_policy")
259
- alignatt_thr = gr.Number(value=8, label="alignatt_thr", precision=0)
260
- waitk_lagging = gr.Number(value=2, label="waitk_lagging", precision=0)
261
- exclude_sink_frames = gr.Number(value=8, label="exclude_sink_frames", precision=0)
262
- xatt_scores_layer = gr.Number(value=-2, label="xatt_scores_layer", precision=0)
263
- hallucinations_detector = gr.Checkbox(value=True, label="hallucinations_detector")
264
-
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  with gr.Row():
266
  auto_apply_presets = gr.Checkbox(value=True, label="Auto-apply presets for sample audios")
267
  reset_btn = gr.Button("Reset to defaults")
268
-
269
- summary_box = gr.Textbox(label="Configuration Summary", lines=10, interactive=False)
270
 
271
  # --- Events ---
272
  task_type.change(
@@ -323,37 +248,47 @@ with gr.Blocks(theme=theme, css=css_style) as demo:
323
 
324
  # === STEP 4 ===
325
  with gr.Step("Task", id=3) as task_step:
326
- gr.Markdown("## Step 4: Start the Task")
327
- with gr.Group():
328
- with gr.Column():
329
- status_slider = gr.Slider(
330
- 0, 100,
331
- value=0,
332
- label="Streaming Progress",
333
- interactive=False,
334
- visible=False
335
- )
336
-
337
- stop_stream_button = gr.Button("Stop Streaming", visible=False)
338
-
339
- transcription_output = gr.Textbox(
 
 
 
 
 
 
 
340
  label="Transcription / Translation Result",
341
- placeholder="Waiting for output...",
342
  lines=10,
343
  max_lines= 10,
344
  interactive=False,
345
  visible=True,
346
- autoscroll=True
347
- )
348
-
349
- start_task_button = gr.Button("Start Task", visible=True)
350
- stop_task_button = gr.Button("Stop Task", visible=False)
 
 
 
351
 
352
  stop_stream_button.click(
353
- fn=stop_streaming,
354
- inputs=[session_hash_code, stop_streaming_flags],
355
- outputs=[stop_streaming_flags],
356
- )
357
 
358
  def stop_task_fn(session_hash_code):
359
  transcribe_active = get_active_task_flag_file(session_hash_code)
@@ -365,47 +300,83 @@ with gr.Blocks(theme=theme, css=css_style) as demo:
365
  stop_task_button.click(
366
  fn=stop_task_fn,
367
  inputs=session_hash_code,
368
- outputs=transcription_output
369
- )
370
- # task(session_hash_code)
371
-
 
 
 
372
  def start_transcription(
373
  session_hash_code, stop_streaming_flags,
374
  task_type, lang_source, lang_target,
375
  chunk_secs, left_context_secs, right_context_secs,
376
  streaming_policy, alignatt_thr, waitk_lagging,
377
  exclude_sink_frames, xatt_scores_layer, hallucinations_detector
378
- ):
379
  """Stream transcription or translation results in real time."""
380
 
381
  accumulated = ""
382
  yield f"Starting {task_type.lower()}...\n\n",gr.update(visible=False),gr.update(visible=True)
383
 
 
384
  # Boucle sur le générateur de `task()`
385
- for msg in task(
386
- session_hash_code,
387
- task_type, lang_source, lang_target,
388
- chunk_secs, left_context_secs, right_context_secs,
389
- streaming_policy, alignatt_thr, waitk_lagging,
390
- exclude_sink_frames, xatt_scores_layer, hallucinations_detector
391
- ):
392
  accumulated += msg
393
  yield accumulated,gr.update(visible=False),gr.update(visible=True)
394
 
395
  yield accumulated + "\nDone.",gr.update(visible=True),gr.update(visible=False)
396
 
397
- start_task_button.click(
398
- fn=start_transcription,
399
- inputs=[
400
- session_hash_code, stop_streaming_flags,
401
  task_type, lang_source, lang_target,
402
  chunk_secs, left_context_secs, right_context_secs,
403
  streaming_policy, alignatt_thr, waitk_lagging,
404
  exclude_sink_frames, xatt_scores_layer, hallucinations_detector
405
- ],
406
- outputs=[transcription_output,start_task_button,stop_task_button]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
407
  )
408
 
 
 
 
 
 
 
 
 
 
 
 
 
409
  ui_components = [
410
  start_stream_button, stop_stream_button,
411
  go_to_config, audio_source_step, status_slider,walkthrough,status_message_stream
@@ -419,19 +390,19 @@ with gr.Blocks(theme=theme, css=css_style) as demo:
419
  concurrency_limit=10,
420
  )
421
 
422
- # def toggle_task_buttons():
423
- # return (
424
- # gr.update(visible=False),
425
- # gr.update(visible=True),
426
- # gr.update(visible=True)
427
- # )
428
-
429
- # start_task_button.click(
430
- # fn=toggle_task_buttons,
431
- # inputs=None,
432
- # outputs=[start_task_button, stop_task_button, stop_stream_button],
433
- # queue=False
434
- # )
435
 
436
 
437
  if __name__ == "__main__":
 
14
 
15
  from app.utils import (
16
  raise_function,
17
+ READ_SIZE,
18
  generate_coturn_config,
19
  read_and_stream_audio,
20
  stop_streaming,
21
+ task,
22
+ task_fake
23
  )
24
  from app.session_utils import (
25
  on_load,
 
55
  theme,css_style = get_custom_theme()
56
 
57
  # logger.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
58
+
59
+ from app.streaming_audio_processor import StreamingAudioProcessorConfig
60
+
61
+
62
+ # asr_model = None
63
+
64
 
65
 
 
 
 
 
 
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
 
69
  with gr.Blocks(theme=theme, css=css_style) as demo:
 
138
  modality="audio",
139
  rtc_configuration=generate_coturn_config(),
140
  visible=True,
141
+ inputs=main_audio,
142
  )
143
+ start_stream_button = gr.Button("▶️ Start Streaming", variant="primary")
144
 
145
  webrtc_stream.stream(
146
  fn=read_and_stream_audio,
147
+ inputs=[active_filepath, session_hash_code, stop_streaming_flags,gr.State(READ_SIZE)],
148
  outputs=[webrtc_stream],
149
  trigger=start_stream_button.click,
150
  concurrency_id="audio_stream",
151
  concurrency_limit=10,
152
  )
153
  status_message_stream = gr.Markdown("", elem_id="status-message-stream", visible=False)
154
+ go_to_config = gr.Button("Go to Configuration", visible=False, variant="secondary")
155
  go_to_config.click(lambda: gr.Walkthrough(selected=2), outputs=walkthrough)
156
 
157
  # === STEP 3 ===
158
  with gr.Step("Configuration", id=2):
159
+ gr.Markdown("### Step 3: Configure the Task")
160
+ with gr.Group():
161
+ with gr.Row():
162
+ task_type = gr.Radio(["Transcription", "Translation"], value="Transcription", label="Task Type")
163
+ with gr.Row():
164
+ lang_source = gr.Dropdown(list(SUPPORTED_LANGS_MAP.keys()), value="French", label="Source Language")
165
+ lang_target = gr.Dropdown(list(SUPPORTED_LANGS_MAP.keys()), value="English", label="Target Language", visible=False)
166
  with gr.Accordion("Advanced Configuration", open=False):
167
+ with gr.Group():
168
+ with gr.Row():
169
+ gr.Markdown("##### Chunks ")
170
+ with gr.Row():
171
+ left_context_secs = gr.Slider(value=20.0, label="left_context_secs",info="Streaming chunk duration in seconds (left context)", minimum=1.0, maximum=60.0, step=1.0, show_reset_button=False)
172
+ chunk_secs = gr.Slider(value=1.0, label="chunk_secs", info="Streaming chunk duration in seconds (chunk)", minimum=0.1, maximum=5.0, step=0.1, show_reset_button=False)
173
+ right_context_secs = gr.Slider(value=0.5, label="right_context_secs", info="Streaming chunk duration in seconds (right context)", minimum=0.1, maximum=10.0, step=0.1, show_reset_button=False)
174
+ gr.Markdown("---")
175
+ with gr.Group():
176
+ with gr.Row():
177
+ gr.Markdown("##### Decoding ")
178
+ with gr.Row():
179
+ streaming_policy = gr.Dropdown(["waitk", "alignatt"], value="waitk", label="streaming_policy", elem_classes="full-width",
180
+ info="“Wait-k: Higher accuracy, requires larger left context, higher latency” \n”AlignAtt: Lower latency, suitable for production, predicts multiple tokens per chunk”")
181
+
182
+ with gr.Row():
183
+ alignatt_thr = gr.Number(value=8, label="alignatt_thr", info="Cross-attention threshold for AlignAtt policy (default: 8), alignatt only", precision=0)
184
+ waitk_lagging = gr.Number(value=2, label="waitk_lagging", info="Number of chunks to wait in the beginning (default: 2), works for both policies", precision=0)
185
+ with gr.Row():
186
+ exclude_sink_frames = gr.Number(value=8, label="exclude_sink_frames", info="Number of frames to exclude from the xatt scores calculation (default: 8), alignatt only", precision=0)
187
+ xatt_scores_layer = gr.Number(value=-2, label="xatt_scores_layer", info="Layer to get cross-attention (xatt) scores from (default: -2), alignatt only", precision=0)
188
+ with gr.Row():
189
+ hallucinations_detector = gr.Checkbox(value=True, label="hallucinations_detector" , info="Detect hallucinations in the predicted tokens (default: True), works for both policies" )
190
  with gr.Row():
191
  auto_apply_presets = gr.Checkbox(value=True, label="Auto-apply presets for sample audios")
192
  reset_btn = gr.Button("Reset to defaults")
193
+ with gr.Accordion("Configuration Summary", open=False):
194
+ summary_box = gr.Textbox(lines=15, interactive=False,show_label=False)
195
 
196
  # --- Events ---
197
  task_type.change(
 
248
 
249
  # === STEP 4 ===
250
  with gr.Step("Task", id=3) as task_step:
251
+ with gr.Row():
252
+ gr.Markdown("## Step 4: Start the Task")
253
+ with gr.Row():
254
+ with gr.Column():
255
+ status_slider = gr.Slider(
256
+ 0, 100,
257
+ value=0,
258
+ label="Streaming Progress",
259
+ show_label=True,
260
+ interactive=False,
261
+ visible=False,
262
+ show_reset_button=False
263
+ )
264
+ stop_stream_button = gr.Button("⏹️ Stop Streaming", visible=False,variant="stop")
265
+ with gr.Row():
266
+ gr.Markdown("---")
267
+ with gr.Row():
268
+ gr.Markdown("##### Transcription / Translation Result")
269
+ with gr.Row():
270
+
271
+ task_output = gr.Textbox(
272
  label="Transcription / Translation Result",
273
+ show_label=False,
274
  lines=10,
275
  max_lines= 10,
276
  interactive=False,
277
  visible=True,
278
+ autoscroll=True,
279
+ elem_id="task-output-box"
280
+ )
281
+ with gr.Row():
282
+ status_message_task = gr.Markdown("", elem_id="status-message-task",elem_classes=["info"], visible=False)
283
+ with gr.Row():
284
+ start_task_button = gr.Button("▶️ Start Task", visible=True, variant="primary")
285
+ stop_task_button = gr.Button("⏹️ Stop Task", visible=False,variant="stop")
286
 
287
  stop_stream_button.click(
288
+ fn=stop_streaming,
289
+ inputs=[session_hash_code, stop_streaming_flags],
290
+ outputs=[stop_streaming_flags],
291
+ )
292
 
293
  def stop_task_fn(session_hash_code):
294
  transcribe_active = get_active_task_flag_file(session_hash_code)
 
300
  stop_task_button.click(
301
  fn=stop_task_fn,
302
  inputs=session_hash_code,
303
+ outputs=task_output
304
+ )
305
+ # task(session_hash_code)
306
+ config_task_ui = [session_hash_code,task_type, lang_source, lang_target,
307
+ chunk_secs, left_context_secs, right_context_secs,
308
+ streaming_policy, alignatt_thr, waitk_lagging,
309
+ exclude_sink_frames, xatt_scores_layer, hallucinations_detector]
310
  def start_transcription(
311
  session_hash_code, stop_streaming_flags,
312
  task_type, lang_source, lang_target,
313
  chunk_secs, left_context_secs, right_context_secs,
314
  streaming_policy, alignatt_thr, waitk_lagging,
315
  exclude_sink_frames, xatt_scores_layer, hallucinations_detector
316
+ ):
317
  """Stream transcription or translation results in real time."""
318
 
319
  accumulated = ""
320
  yield f"Starting {task_type.lower()}...\n\n",gr.update(visible=False),gr.update(visible=True)
321
 
322
+
323
  # Boucle sur le générateur de `task()`
324
+ for msg in task(session_hash_code,config_task_ui):
 
 
 
 
 
 
325
  accumulated += msg
326
  yield accumulated,gr.update(visible=False),gr.update(visible=True)
327
 
328
  yield accumulated + "\nDone.",gr.update(visible=True),gr.update(visible=False)
329
 
330
+
331
+ def start_task(
332
+ session_hash_code,
 
333
  task_type, lang_source, lang_target,
334
  chunk_secs, left_context_secs, right_context_secs,
335
  streaming_policy, alignatt_thr, waitk_lagging,
336
  exclude_sink_frames, xatt_scores_layer, hallucinations_detector
337
+ ):
338
+ """Stream transcription or translation results in real time."""
339
+ accumulated = ""
340
+ # Boucle sur le générateur de `task2()`
341
+ for result, status, current_chunk in task_fake(
342
+ session_hash_code,
343
+ task_type, lang_source, lang_target,
344
+ chunk_secs, left_context_secs, right_context_secs,
345
+ streaming_policy, alignatt_thr, waitk_lagging,
346
+ exclude_sink_frames, xatt_scores_layer, hallucinations_detector
347
+ ):
348
+ if status == "success":
349
+ yield accumulated + result, gr.update(visible=True,value=current_chunk , elem_classes=["info"]), gr.update(visible=False), gr.update(visible=True)
350
+ accumulated += result
351
+ elif status in ["error", "warning", "info", "done"]:
352
+ yield accumulated, gr.update(visible=True,value=result , elem_classes=[status]), gr.update(visible=True), gr.update(visible=False)
353
+
354
+
355
+ start_task_button.click(
356
+ fn=start_task,
357
+ inputs=[
358
+ session_hash_code,
359
+ task_type, lang_source, lang_target,
360
+ chunk_secs, left_context_secs, right_context_secs,
361
+ streaming_policy, alignatt_thr, waitk_lagging,
362
+ exclude_sink_frames, xatt_scores_layer, hallucinations_detector
363
+
364
+ ],
365
+ outputs=[task_output,status_message_task,start_task_button,stop_task_button]
366
  )
367
 
368
+ # start_task_button.click(
369
+ # fn=start_task,
370
+ # inputs=[
371
+ # session_hash_code, stop_streaming_flags,
372
+ # task_type, lang_source, lang_target,
373
+ # chunk_secs, left_context_secs, right_context_secs,
374
+ # streaming_policy, alignatt_thr, waitk_lagging,
375
+ # exclude_sink_frames, xatt_scores_layer, hallucinations_detector
376
+ # ],
377
+ # outputs=[task_output,status_message_task,start_task_button,stop_task_button]
378
+ # )
379
+
380
  ui_components = [
381
  start_stream_button, stop_stream_button,
382
  go_to_config, audio_source_step, status_slider,walkthrough,status_message_stream
 
390
  concurrency_limit=10,
391
  )
392
 
393
+ # def toggle_task_buttons():
394
+ # return (
395
+ # gr.update(visible=False),
396
+ # gr.update(visible=True),
397
+ # gr.update(visible=True)
398
+ # )
399
+
400
+ # start_task_button.click(
401
+ # fn=toggle_task_buttons,
402
+ # inputs=None,
403
+ # outputs=[start_task_button, stop_task_button, stop_stream_button],
404
+ # queue=False
405
+ # )
406
 
407
 
408
  if __name__ == "__main__":
app/canary_speech_engine.py CHANGED
@@ -125,6 +125,7 @@ class CanaryConfig:
125
  """Create a CanaryConfig instance from parameters"""
126
  # Convert task type to model task
127
  task = "asr" if task_type == "Transcription" else "ast"
 
128
 
129
  return cls(
130
  chunk_secs=chunk_secs,
@@ -158,6 +159,7 @@ class CanarySpeechEngine(IStreamingSpeechEngine):
158
  Args:
159
  cfg: An OmegaConf object containing 'model' and 'streaming' configs.
160
  """
 
161
  self.cfg = cfg.toOmegaConf() # Store the full config
162
 
163
  # Setup device and dtype from config
 
125
  """Create a CanaryConfig instance from parameters"""
126
  # Convert task type to model task
127
  task = "asr" if task_type == "Transcription" else "ast"
128
+ target_lang = source_lang if task_type == "Transcription" else target_lang
129
 
130
  return cls(
131
  chunk_secs=chunk_secs,
 
159
  Args:
160
  cfg: An OmegaConf object containing 'model' and 'streaming' configs.
161
  """
162
+ logging.debug(f"Initializing CanarySpeechEngine with config: {cfg}")
163
  self.cfg = cfg.toOmegaConf() # Store the full config
164
 
165
  # Setup device and dtype from config
app/ui_utils.py CHANGED
@@ -26,10 +26,10 @@ EXAMPLE_CONFIGS = {
26
  "exclude_sink_frames": 8, "xatt_scores_layer": -2, "hallucinations_detector": True
27
  },
28
  "data/french_news.wav": {
29
- "task_type": "Transcription", "lang_source": "French", "lang_target": "English",
30
- "chunk_secs": 1.0, "left_context_secs": 15.0, "right_context_secs": 0.3,
31
- "streaming_policy": "alignatt", "alignatt_thr": 10, "waitk_lagging": 3,
32
- "exclude_sink_frames": 6, "xatt_scores_layer": -1, "hallucinations_detector": True
33
  },
34
  "data/spanish_podcast.wav": {
35
  "task_type": "Translation", "lang_source": "Spanish", "lang_target": "English",
 
26
  "exclude_sink_frames": 8, "xatt_scores_layer": -2, "hallucinations_detector": True
27
  },
28
  "data/french_news.wav": {
29
+ "task_type": "Transcription", "lang_source": "French", "lang_target": "French",
30
+ "chunk_secs": 1.0, "left_context_secs": 15.0, "right_context_secs": 0.5,
31
+ "streaming_policy": "alignatt", "alignatt_thr": 8.0, "waitk_lagging": 3,
32
+ "exclude_sink_frames": 8, "xatt_scores_layer": -2, "hallucinations_detector": True
33
  },
34
  "data/spanish_podcast.wav": {
35
  "task_type": "Translation", "lang_source": "Spanish", "lang_target": "English",
app/utils.py CHANGED
@@ -18,7 +18,14 @@ from app.session_utils import (
18
  get_active_task_flag_file,
19
  get_folder_chunks
20
  )
21
-
 
 
 
 
 
 
 
22
 
23
  # --------------------------------------------------------
24
  # Utility functions
@@ -120,6 +127,216 @@ def read_and_stream_audio(filepath_to_stream: str, session_id: str, stop_streami
120
 
121
 
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  def handle_stream_error(session_id: str, error: Exception | str, stop_streaming_flags: dict | None = None):
124
  """
125
  Handle streaming errors:
 
18
  get_active_task_flag_file,
19
  get_folder_chunks
20
  )
21
+ from app.ui_utils import (
22
+ SUPPORTED_LANGS_MAP
23
+ )
24
+ from app.canary_speech_engine import CanarySpeechEngine,CanaryConfig
25
+ from app.silero_vad_engine import Silero_Vad_Engine
26
+ from app.streaming_audio_processor import StreamingAudioProcessor,StreamingAudioProcessorConfig
27
+ import nemo.collections.asr as nemo_asr
28
+ READ_SIZE=4000
29
 
30
  # --------------------------------------------------------
31
  # Utility functions
 
127
 
128
 
129
 
130
+ # asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/canary-1b-v2")
131
+ asr_model = None
132
+
133
+ @spaces.GPU
134
+ def task_fake(session_id: str,
135
+ task_type, lang_source, lang_target,
136
+ chunk_secs, left_context_secs, right_context_secs,
137
+ streaming_policy, alignatt_thr, waitk_lagging,
138
+ exclude_sink_frames, xatt_scores_layer, hallucinations_detector
139
+ ):
140
+ """Continuously read and delete .npz chunks while task is active."""
141
+ global asr_model
142
+ yield ("initializing the CanarySpeechEngine and Silero_Vad_Engine", "info", None)
143
+ ### TODO
144
+ ##-----------
145
+ # conf = CanaryConfig.from_params(
146
+ # task_type, SUPPORTED_LANGS_MAP.get(lang_source),SUPPORTED_LANGS_MAP.get(lang_target) ,
147
+ # chunk_secs, left_context_secs, right_context_secs,
148
+ # streaming_policy, alignatt_thr, waitk_lagging,
149
+ # exclude_sink_frames, xatt_scores_layer, hallucinations_detector
150
+ # )
151
+
152
+ # canary_speech_engine = CanarySpeechEngine(asr_model,conf)
153
+ # silero_vad_engine = Silero_Vad_Engine()
154
+ # streaming_audio_processor_config = StreamingAudioProcessorConfig(
155
+ # read_size=READ_SIZE,
156
+ # silence_threshold_chunks=1
157
+ # )
158
+ # streamer = StreamingAudioProcessor(speech_engine=canary_speech_engine,vad_engine=silero_vad_engine,cfg=streaming_audio_processor_config)
159
+ ##-----------
160
+ yield ("initialized the CanarySpeechEngine and Silero_Vad_Engine", "info", None)
161
+ yield (f"Task started for session {session_id}", "info", None)
162
+
163
+ active_flag = get_active_task_flag_file(session_id)
164
+ with open(active_flag, "w") as f:
165
+ f.write("1")
166
+ chunk_dir = get_folder_chunks(session_id)
167
+ logging.info(f"[{session_id}] task started. {chunk_dir}")
168
+
169
+ try:
170
+ logging.info(f"[{session_id}] task loop started.")
171
+ yield (f"Task started for session {session_id}", "info", None)
172
+
173
+ while os.path.exists(active_flag):
174
+ if not os.path.exists(chunk_dir):
175
+ logging.warning(f"[{session_id}] No chunk directory found for task.")
176
+ yield ("No audio chunks yet... waiting for stream.", "warning", None)
177
+ time.sleep(0.1)
178
+ continue
179
+
180
+ files = sorted(f for f in os.listdir(chunk_dir) if f.endswith(".npz"))
181
+ if not files:
182
+ time.sleep(0.1)
183
+ continue
184
+
185
+ for fname in files:
186
+ fpath = os.path.join(chunk_dir, fname)
187
+ try:
188
+ npz = np.load(fpath)
189
+ samples = npz["data"]
190
+ rate = int(npz["rate"])
191
+ ##-----------
192
+ # new_texts = streamer.process_chunk(samples)
193
+ # for text in new_texts:
194
+ # print(text, end='', flush=True)
195
+ # yield (text, "success", text)
196
+ # logging.debug(f"[{session_id}] {new_texts}")
197
+ ##-----------
198
+ ### TODO
199
+ text = f"Transcribed {fname}: {len(samples)} samples @ {rate}Hz\n"
200
+ yield (text, "success", fname)
201
+ os.remove(fpath)
202
+ logging.debug(f"[{session_id}] Deleted processed chunk: {fname}")
203
+ except Exception as e:
204
+ logging.warning(f"[{session_id}] Error processing {fname}: {e}")
205
+ yield (f"Error processing {fname}: {e}", "warning", fname)
206
+ continue
207
+ time.sleep(0.1)
208
+
209
+ # TODO
210
+ ##-----------
211
+ # final_text = streamer.finalize_stream()
212
+ # yield (text, "success", final_text)
213
+ ##-----------
214
+ yield ("DONE", "done", None)
215
+ logging.info(f"[{session_id}] task loop ended (flag removed).")
216
+
217
+ except Exception as e:
218
+ logging.error(f"[{session_id}] task error: {e}", exc_info=True)
219
+ yield (f"Unexpected error: {e}", "error", None)
220
+
221
+ finally:
222
+ if os.path.exists(active_flag):
223
+ os.remove(active_flag)
224
+ logging.info(f"[{session_id}] task stopped.")
225
+
226
+ try:
227
+ if os.path.exists(chunk_dir) and not os.listdir(chunk_dir):
228
+ os.rmdir(chunk_dir)
229
+ logging.debug(f"[{session_id}] Cleaned up empty chunk dir.")
230
+ except Exception as e:
231
+ logging.error(f"[{session_id}] Cleanup error: {e}")
232
+ yield (f"Cleanup error: {e}", "error", None)
233
+
234
+ logging.info(f"[{session_id}] Exiting task loop.")
235
+ yield ("Task finished and cleaned up.", "done", None)
236
+
237
+
238
+
239
+ def task(session_id: str,
240
+ task_type, lang_source, lang_target,
241
+ chunk_secs, left_context_secs, right_context_secs,
242
+ streaming_policy, alignatt_thr, waitk_lagging,
243
+ exclude_sink_frames, xatt_scores_layer, hallucinations_detector
244
+ ):
245
+ """Continuously read and delete .npz chunks while task is active."""
246
+ global asr_model
247
+ yield ("initializing the CanarySpeechEngine and Silero_Vad_Engine", "info", None)
248
+ conf = CanaryConfig.from_params(
249
+ task_type, SUPPORTED_LANGS_MAP.get(lang_source),SUPPORTED_LANGS_MAP.get(lang_target) ,
250
+ chunk_secs, left_context_secs, right_context_secs,
251
+ streaming_policy, alignatt_thr, waitk_lagging,
252
+ exclude_sink_frames, xatt_scores_layer, hallucinations_detector
253
+ )
254
+
255
+ canary_speech_engine = CanarySpeechEngine(asr_model,conf)
256
+ silero_vad_engine = Silero_Vad_Engine()
257
+ streaming_audio_processor_config = StreamingAudioProcessorConfig(
258
+ read_size=READ_SIZE,
259
+ silence_threshold_chunks=1
260
+ )
261
+ streamer = StreamingAudioProcessor(speech_engine=canary_speech_engine,vad_engine=silero_vad_engine,cfg=streaming_audio_processor_config)
262
+ yield ("initialized the CanarySpeechEngine and Silero_Vad_Engine", "info", None)
263
+ yield (f"Task started for session {session_id}", "info", None)
264
+
265
+ active_flag = get_active_task_flag_file(session_id)
266
+ with open(active_flag, "w") as f:
267
+ f.write("1")
268
+ chunk_dir = get_folder_chunks(session_id)
269
+ logging.info(f"[{session_id}] task started. {chunk_dir}")
270
+
271
+ try:
272
+ logging.info(f"[{session_id}] task loop started.")
273
+ yield (f"Task started for session {session_id}", "info", None)
274
+
275
+ while os.path.exists(active_flag):
276
+ if not os.path.exists(chunk_dir):
277
+ logging.warning(f"[{session_id}] No chunk directory found for task.")
278
+ yield ("No audio chunks yet... waiting for stream.", "warning", None)
279
+ time.sleep(0.1)
280
+ continue
281
+
282
+ files = sorted(f for f in os.listdir(chunk_dir) if f.endswith(".npz"))
283
+ if not files:
284
+ time.sleep(0.1)
285
+ continue
286
+
287
+ for fname in files:
288
+ fpath = os.path.join(chunk_dir, fname)
289
+ try:
290
+ npz = np.load(fpath)
291
+ samples = npz["data"]
292
+ rate = int(npz["rate"])
293
+ new_texts = streamer.process_chunk(samples)
294
+ for text in new_texts:
295
+ print(text, end='', flush=True)
296
+ yield (text, "success", text)
297
+ logging.debug(f"[{session_id}] {new_texts}")
298
+ ### TODO
299
+ # text = f"Transcribed {fname}: {len(samples)} samples @ {rate}Hz\n"
300
+ # yield (text, "success", fname)
301
+ os.remove(fpath)
302
+ logging.debug(f"[{session_id}] Deleted processed chunk: {fname}")
303
+ except Exception as e:
304
+ logging.warning(f"[{session_id}] Error processing {fname}: {e}")
305
+ yield (f"Error processing {fname}: {e}", "warning", fname)
306
+ continue
307
+ time.sleep(0.1)
308
+
309
+ # TODO
310
+ final_text = streamer.finalize_stream()
311
+ yield (text, "success", final_text)
312
+ # if final_text:
313
+ # print(final_text, end='', flush=True)
314
+ # yield f"\n{final_text}"
315
+ ##
316
+ yield ("DONE", "done", None)
317
+ logging.info(f"[{session_id}] task loop ended (flag removed).")
318
+
319
+ except Exception as e:
320
+ logging.error(f"[{session_id}] task error: {e}", exc_info=True)
321
+ yield (f"Unexpected error: {e}", "error", None)
322
+
323
+ finally:
324
+ if os.path.exists(active_flag):
325
+ os.remove(active_flag)
326
+ logging.info(f"[{session_id}] task stopped.")
327
+
328
+ try:
329
+ if os.path.exists(chunk_dir) and not os.listdir(chunk_dir):
330
+ os.rmdir(chunk_dir)
331
+ logging.debug(f"[{session_id}] Cleaned up empty chunk dir.")
332
+ except Exception as e:
333
+ logging.error(f"[{session_id}] Cleanup error: {e}")
334
+ yield (f"Cleanup error: {e}", "error", None)
335
+
336
+ logging.info(f"[{session_id}] Exiting task loop.")
337
+ yield ("Task finished and cleaned up.", "done", None)
338
+
339
+
340
  def handle_stream_error(session_id: str, error: Exception | str, stop_streaming_flags: dict | None = None):
341
  """
342
  Handle streaming errors:
assets/custom_style.css CHANGED
@@ -144,4 +144,55 @@ body {
144
  padding: 0.75rem;
145
  color: #991B1B;
146
  font-weight: 500;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  }
 
144
  padding: 0.75rem;
145
  color: #991B1B;
146
  font-weight: 500;
147
+ }
148
+
149
+ #status-message-task {
150
+ padding: 0.75rem;
151
+ border-radius: 8px; /* Coins arrondis */
152
+ margin-top: 10px;
153
+ font-weight: 500; /* Un peu plus gras que la normale */
154
+ border: 1px solid transparent;
155
+ display: none; /* Caché par défaut */
156
+ }
157
+
158
+ /* Le style .info (bleu) */
159
+ #status-message-task.info{
160
+ color: #0c5464; /* Texte bleu foncé */
161
+ background-color: #d1ecf1; /* Fond bleu clair */
162
+ border-color: #bee5eb; /* Bordure bleue */
163
+ display: block; /* Le rend visible */
164
+ }
165
+
166
+ /* Le style .warning (jaune/orange) */
167
+ #status-message-task.warning {
168
+ color: #856404; /* Texte ocre */
169
+ background-color: #fff3cd; /* Fond jaune clair */
170
+ border-color: #ffeeba; /* Bordure jaune */
171
+ display: block; /* Le rend visible */
172
+ }
173
+
174
+ /* Le style .error (rouge) */
175
+ #status-message-task.error {
176
+ color: #721c24; /* Texte rouge foncé */
177
+ background-color: #f8d7da; /* Fond rouge clair */
178
+ border-color: #f5c6cb; /* Bordure rouge */
179
+ display: block; /* Le rend visible */
180
+ }
181
+
182
+ /* Styles personnalisés pour le WebRTC */
183
+ #webcam-stream {
184
+ border: 2px solid #007bff;
185
+ border-radius: 10px;
186
+ box-shadow: 0 4px 8px rgba(0,0,0,0.1);
187
+ background-color: #f8f9fa;
188
+ margin: 10px 0;
189
+ }
190
+
191
+ #webcam-stream .gr-webRTC {
192
+ background-color: #e9ecef;
193
+ }
194
+
195
+ #task-output-box textarea {
196
+ font-size: 1.15em; /* 'Moyenne taille' - ajustez au besoin */
197
+ font-weight: bold; /* 'En gras' */
198
  }