Spaces:

kasimali
/

new-asr-voxlingua

Running

App Files Files Community

kasimali committed on Oct 8

Commit

aa5b88b

verified ·

1 Parent(s): e7e41d4

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

README.md +3 -6
app.py +484 -0
requirements.txt +6 -0

README.md CHANGED Viewed

@@ -1,10 +1,7 @@
 ---
-title: New Asr Voxlingua
-emoji: 🌍
-colorFrom: yellow
-colorTo: yellow
 sdk: static
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: NEW-ASR-VOXLINGUA
+emoji: 🚀
 sdk: static
 ---
+# NEW-ASR-VOXLINGUA

app.py ADDED Viewed

	@@ -0,0 +1,484 @@

+# NEW-ASR-VOXLINGUA
+# ==============================================================================
+# Cell 1: Environment Setup & Dependencies
+#
+# CORRECTED: Forcing SpeechBrain to version 0.5.16 to ensure backward
+# compatibility with the old TalTechNLP XLS-R model.
+# ==============================================================================
+print("CELL 1: Setting up the environment with specific SpeechBrain version...")
+# --- CORE CORRECTION ---
+# The old TalTechNLP model needs a SpeechBrain release whose file paths it
+# understands (the header above names 0.5.16).
+# NOTE(review): nothing in this file uninstalls or installs SpeechBrain; the
+# install step presumably lived in a notebook cell that was not exported.
+# Pin the version in requirements.txt instead — TODO confirm.
+# --- END CORRECTION ---
+import torch
+print("\n--- System Check ---")
+if torch.cuda.is_available():
+    print(f"✅ GPU found: {torch.cuda.get_device_name(0)}")
+    print(f"   CUDA Version: {torch.version.cuda}")
+else:
+    print("⚠️ GPU not found. Using CPU. This will be significantly slower.")
+print("--- End System Check ---\n")
+# The notebook export left a bare `pip show speechbrain.inference` here. That
+# is a shell command, not Python, and makes the whole file a SyntaxError.
+# Report the installed distribution version from Python instead.
+from importlib.metadata import PackageNotFoundError, version
+try:
+    print(f"SpeechBrain version: {version('speechbrain')}")
+except PackageNotFoundError:
+    print("⚠️ SpeechBrain is not installed.")
+print("CELL 2: Importing libraries and setting up language maps...")
+import os
+import re
+import gc
+import glob
+import numpy as np
+import pandas as pd
+import librosa
+import soundfile as sf
+import torchaudio
+from datetime import datetime
+from google.colab import files
+import subprocess
+import shutil
+# Transformers and ML libraries
+from transformers import AutoModel, Wav2Vec2Processor, Wav2Vec2ForCTC
+from speechbrain.inference.classifiers import EncoderClassifier
+from speechbrain.pretrained.interfaces import foreign_class
+from tokenizers import Tokenizer, models, trainers, pre_tokenizers
+import warnings
+warnings.filterwarnings('ignore')
+# Language-family tables. Sets are used so membership tests are constant-time.
+INDO_ARYAN_LANGS = {
+    'hi', 'bn', 'mr', 'gu', 'pa', 'or',
+    'as', 'ur', 'ks', 'sd', 'ne', 'kok',
+}
+DRAVIDIAN_LANGS = {'ta', 'te', 'kn', 'ml'}
+LOW_RESOURCE_LANGS = {'brx', 'mni', 'sat', 'doi'}
+# Each low-resource language is transcribed with the model of the language
+# it is mapped to here (cross-lingual transfer).
+TRANSFER_MAPPING = {
+    'brx': 'hi',
+    'sat': 'hi',
+    'doi': 'pa',
+    'mni': 'bn',
+}
+ALL_SUPPORTED_LANGS = set().union(INDO_ARYAN_LANGS, DRAVIDIAN_LANGS, LOW_RESOURCE_LANGS)
+print("✅ Libraries imported successfully.")
+print(f"📊 Total languages supported: {len(ALL_SUPPORTED_LANGS)}\n")
+print("CELL 3: Defining audio preprocessing functions...")
+SUPPORTED_FORMATS = {'.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac'}
+def validate_audio_format(audio_path):
+    """Return True when the path's extension is a supported audio format.
+
+    The extension is lower-cased before it is compared against
+    SUPPORTED_FORMATS, so the check is case-insensitive.
+
+    Raises:
+        ValueError: if the extension is not in SUPPORTED_FORMATS.
+    """
+    _, suffix = os.path.splitext(audio_path)
+    ext = suffix.lower()
+    if ext in SUPPORTED_FORMATS:
+        return True
+    raise ValueError(f"Unsupported audio format: {ext}. Supported: {SUPPORTED_FORMATS}")
+def preprocess_audio(audio_path, target_sr=16000):
+    """Load an audio file, down-mix it and resample it to ``target_sr``.
+
+    The extension is validated first. torchaudio is tried as the loader and
+    librosa is used when torchaudio raises. The first tensor dimension is
+    treated as the channel axis and averaged away when it is larger than 1.
+
+    Returns:
+        A ``(waveform, target_sr)`` tuple.
+
+    Raises:
+        ValueError: from validate_audio_format for an unsupported extension.
+    """
+    validate_audio_format(audio_path)
+    try:
+        waveform, sr = torchaudio.load(audio_path)
+    except Exception:
+        # torchaudio could not read the file; librosa returns a NumPy array
+        # at the file's native rate (sr=None), wrapped here as (1, samples).
+        samples, sr = librosa.load(audio_path, sr=None)
+        waveform = torch.tensor(samples).unsqueeze(0)
+    channel_count = waveform.shape[0]
+    if channel_count > 1:
+        waveform = torch.mean(waveform, dim=0, keepdim=True)
+    if sr == target_sr:
+        return waveform, target_sr
+    resample = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
+    return resample(waveform), target_sr
+print("✅ Audio preprocessing functions ready.\n")
+print("CELL 4: Defining file handling functions...")
+def extract_file_id_from_link(share_link):
+    """Pull the Drive file or folder ID out of a sharing link.
+
+    Three URL shapes are tried in order: ``/file/d/<id>``, ``/folders/<id>``
+    and ``id=<id>``. The first one that matches wins.
+
+    Returns:
+        The captured ID string, or None when no pattern matches.
+    """
+    patterns = [r'/file/d/([a-zA-Z0-9-_]+)', r'/folders/([a-zA-Z0-9-_]+)', r'id=([a-zA-Z0-9-_]+)']
+    hits = (re.search(candidate, share_link) for candidate in patterns)
+    return next((hit.group(1) for hit in hits if hit), None)
+def download_from_shared_drive(share_link, max_files_per_lang=20):
+    """Download a shared Google Drive folder and return a capped list of audio paths.
+
+    The folder is downloaded into /content/shared_dataset (emptied first).
+    Audio files are grouped by the name of their immediate parent directory,
+    but only when that name is a top-level directory of the download. At most
+    ``max_files_per_lang`` files are kept per group.
+
+    Args:
+        share_link: Google Drive sharing URL.
+        max_files_per_lang: upper bound on files kept per language folder.
+
+    Returns:
+        A list of file paths; empty when the ID cannot be extracted or the
+        download fails.
+    """
+    file_id = extract_file_id_from_link(share_link)
+    if not file_id:
+        print("❌ Could not extract file ID. Please check your sharing link.")
+        return []
+    download_dir = "/content/shared_dataset"
+    # Start from an empty directory so files from an earlier run are not mixed in.
+    if os.path.exists(download_dir): shutil.rmtree(download_dir)
+    os.makedirs(download_dir, exist_ok=True)
+    print(f"✅ Extracted ID: {file_id}. Starting download...")
+    # NOTE(review): an ID taken from a `/file/d/` link is still passed to
+    # download_folder below; single-file links presumably fail here — confirm.
+    try:
+        import gdown
+        gdown.download_folder(f"https://drive.google.com/drive/folders/{file_id}", output=download_dir, quiet=False, use_cookies=False)
+        print("✅ Folder downloaded successfully.")
+    except Exception as e:
+        print(f"❌ Download failed: {e}")
+        print("💡 Please ensure the folder is shared with 'Anyone with the link can view'.")
+        return []
+    print("\n🔍 Scanning for audio files...")
+    # glob returns matches in arbitrary order; sorting makes the per-language
+    # cap below keep the same files on every run.
+    all_audio_files = sorted(
+        p
+        for ext in SUPPORTED_FORMATS
+        for p in glob.glob(os.path.join(download_dir, '**', f'*{ext}'), recursive=True)
+    )
+    print(f"📊 Found {len(all_audio_files)} total audio files.")
+    lang_folders = {
+        d: []
+        for d in sorted(os.listdir(download_dir))
+        if os.path.isdir(os.path.join(download_dir, d))
+    }
+    for audio_file in all_audio_files:
+        lang_code = os.path.basename(os.path.dirname(audio_file))
+        if lang_code in lang_folders: lang_folders[lang_code].append(audio_file)
+    final_file_list = []
+    print("\nLimiting files per language:")
+    # Named `lang_files`, not `files`: the latter would shadow the
+    # google.colab `files` module imported at the top of the file.
+    for lang, lang_files in lang_folders.items():
+        if len(lang_files) > max_files_per_lang:
+            print(f"   {lang}: Limiting to {max_files_per_lang} files (from {len(lang_files)})")
+        else:
+            print(f"   {lang}: Found {len(lang_files)} files")
+        # Slicing covers both cases: a short list is returned whole.
+        final_file_list.extend(lang_files[:max_files_per_lang])
+    return final_file_list
+def get_audio_files():
+    """Ask the user for an audio source and return the resulting file paths.
+
+    Choice 1 opens the Colab upload dialog and returns the uploaded names
+    prefixed with /content/. Choice 2 asks for a Google Drive folder link and
+    delegates to download_from_shared_drive. Anything else returns an empty list.
+    """
+    print("\n🎯 Choose your audio source:")
+    print("1. Upload files from computer")
+    print("2. Download from Google Drive sharing link")
+    choice = input("Enter choice (1/2): ").strip()
+    if choice == '1':
+        uploaded = files.upload()
+        paths = []
+        for fname in uploaded.keys():
+            paths.append(f"/content/{fname}")
+        return paths
+    if choice == '2':
+        share_link = input("\nPaste your Google Drive folder sharing link: ").strip()
+        return download_from_shared_drive(share_link)
+    print("Invalid choice.")
+    return []
+print("✅ File handling functions ready.\n")
+print("CELL 5: Loading Language Identification (LID) Models...")
+# Both handles stay None when loading fails, so later code can test availability.
+voxlingua_model = xlsr_lid_model = None
+try:
+    print("Loading VoxLingua107 ECAPA-TDNN...")
+    voxlingua_model = EncoderClassifier.from_hparams(
+        source="speechbrain/lang-id-voxlingua107-ecapa",
+        savedir="pretrained_models/voxlingua107",
+    )
+    print("✅ VoxLingua107 loaded.")
+except Exception as e:
+    print(f"❌ VoxLingua107 error: {e}")
+try:
+    print("\nLoading TalTechNLP XLS-R LID...")
+    # The XLS-R classifier ships its own Python class, so it is loaded through
+    # foreign_class rather than a built-in SpeechBrain interface.
+    xlsr_lid_model = foreign_class(
+        source="TalTechNLP/voxlingua107-xls-r-300m-wav2vec",
+        pymodule_file="encoder_wav2vec_classifier.py",
+        classname="EncoderWav2vecClassifier",
+        hparams_file="inference_wav2vec.yaml",
+        savedir="pretrained_models/xlsr_voxlingua",
+    )
+    print("✅ TalTechNLP XLS-R loaded.")
+except Exception as e:
+    print(f"❌ XLS-R error: {e}. Pipeline will proceed with primary LID model only.")
+models_loaded = len([m for m in (voxlingua_model, xlsr_lid_model) if m is not None])
+print(f"\n📊 LID Models Status: {models_loaded}/2 loaded.\n")
+print("CELL 6: Defining hybrid language detection system...")
+def hybrid_language_detection(audio_path):
+    """Identify the spoken language using up to two LID models.
+
+    Every loaded model classifies the file. When both produce the same label,
+    that label is returned with the mean of the two confidences. Otherwise the
+    label of the model with the higher confidence is returned.
+
+    Returns:
+        ``(lang_code, confidence)``; ``("unknown", 0.0)`` when no model
+        produced a result.
+
+    Raises:
+        ValueError: from preprocess_audio for an unsupported file extension.
+    """
+    # Called only to validate that the file is supported and loadable;
+    # the classifiers below read the file themselves.
+    preprocess_audio(audio_path)
+    results, confidences = {}, {}
+    if voxlingua_model is not None:
+        try:
+            pred = voxlingua_model.classify_file(audio_path)
+            # The label is taken as the text before the first ':'.
+            lang_code = str(pred[3][0]).split(':')[0].strip()
+            confidence = float(pred[1].exp().item())
+            results['voxlingua'], confidences['voxlingua'] = lang_code, confidence
+        except Exception as e:
+            # Best effort: carry on with the other model, but surface the failure
+            # so "unknown" can be told apart from a crashed classifier.
+            print(f"   ⚠️ VoxLingua107 LID failed: {e}")
+    if xlsr_lid_model is not None:
+        try:
+            out_prob, score, index, text_lab = xlsr_lid_model.classify_file(audio_path)
+            lang_code = str(text_lab[0]).strip().lower()
+            confidence = float(out_prob.exp().max().item())
+            results['xlsr'], confidences['xlsr'] = lang_code, confidence
+        except Exception as e:
+            print(f"   ⚠️ XLS-R LID failed: {e}")
+    if not results:
+        return "unknown", 0.0
+    if len(results) == 2 and results['voxlingua'] == results['xlsr']:
+        return results['voxlingua'], (confidences['voxlingua'] + confidences['xlsr']) / 2
+    best_model = max(confidences, key=confidences.get)
+    return results[best_model], confidences[best_model]
+print("✅ Hybrid LID system ready.\n")
+print("CELL 7: Loading Automatic Speech Recognition (ASR) Models...")
+# All three handles stay None when loading fails.
+indicconformer_model = indicwav2vec_processor = indicwav2vec_model = None
+try:
+    print("Loading IndicConformer for Indo-Aryan...")
+    indicconformer_model = AutoModel.from_pretrained(
+        "ai4bharat/indic-conformer-600m-multilingual",
+        trust_remote_code=True,
+    )
+    print("✅ IndicConformer loaded.")
+except Exception as e:
+    print(f"❌ IndicConformer Error: {e}. Indo-Aryan transcription will be unavailable.")
+# A Tamil fine-tuned checkpoint stands in for all Dravidian languages.
+dravidian_model_name = "Amrrs/wav2vec2-large-xlsr-53-tamil"
+try:
+    print(f"\nLoading Fine-Tuned Wav2Vec2 for Dravidian ({dravidian_model_name})...")
+    indicwav2vec_processor = Wav2Vec2Processor.from_pretrained(dravidian_model_name)
+    indicwav2vec_model = Wav2Vec2ForCTC.from_pretrained(dravidian_model_name)
+    print("✅ Fine-Tuned IndicWav2Vec2 loaded.")
+except Exception as e:
+    print(f"❌ IndicWav2Vec2 Error: {e}. Dravidian transcription will be unavailable.")
+asr_models_loaded = len([m for m in (indicconformer_model, indicwav2vec_model) if m is not None])
+print(f"\n📊 ASR Models Status: {asr_models_loaded}/2 loaded.\n")
+# ==============================================================================
+# Cell 8: BPE and Syllable-BPE Tokenization Classes
+#
+# This version correctly handles untrained tokenizers and has improved
+# regex for more accurate syllable segmentation.
+# ==============================================================================
+print("CELL 8: Defining tokenization classes...")
+import re
+from tokenizers import Tokenizer, models, trainers, pre_tokenizers
+class BPETokenizer:
+    """Thin wrapper around a whitespace-pre-tokenized BPE model (used for Indo-Aryan text)."""
+    def __init__(self, vocab_size=5000):
+        bpe = Tokenizer(models.BPE())
+        bpe.pre_tokenizer = pre_tokenizers.Whitespace()
+        self.tokenizer = bpe
+        self.trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["<unk>", "<pad>"])
+        # Flipped by train(); encode() falls back to whitespace splitting until then.
+        self.trained = False
+    def train(self, texts):
+        """Fit the BPE model on an iterable of strings."""
+        self.tokenizer.train_from_iterator(texts, self.trainer)
+        self.trained = True
+    def encode(self, text):
+        """Return the token strings for ``text``.
+
+        Before train() has been called this is simply ``text.split()``.
+        """
+        if self.trained:
+            return self.tokenizer.encode(text).tokens
+        return text.split()
+class SyllableBPETokenizer:
+    """BPE tokenizer that operates on regex-segmented syllables (Dravidian scripts)."""
+    def __init__(self, vocab_size=3000):
+        self.vocab_size = vocab_size
+        # Per-language pattern: a consonant with an optional vowel sign, or an
+        # independent vowel.
+        self.patterns = {
+            'ta': r'[க-ஹ][ா-ௌ]?|[அ-ஔ]',  # Tamil
+            'te': r'[క-హ][ా-ౌ]?|[అ-ఔ]',  # Telugu
+            'kn': r'[ಕ-ಹ][ಾ-ೌ]?|[ಅ-ಔ]',  # Kannada
+            'ml': r'[ക-ഹ][ാ-ൌ]?|[അ-ഔ]'   # Malayalam
+        }
+        self.trained = False
+    def syllable_segment(self, text, lang):
+        """Split ``text`` into syllable strings for ``lang``.
+
+        Languages without a pattern are split into runs of non-whitespace.
+        When nothing matches, the whole text is returned as a single item.
+        """
+        pattern = self.patterns.get(lang, r'\S+')
+        return re.findall(pattern, text) or [text]
+    def train_sbpe(self, texts, lang):
+        """Fit a fresh BPE model on the space-joined syllables of ``texts``."""
+        syllable_texts = []
+        for raw in texts:
+            syllable_texts.append(' '.join(self.syllable_segment(raw, lang)))
+        self.tokenizer = Tokenizer(models.BPE())
+        trainer = trainers.BpeTrainer(vocab_size=self.vocab_size, special_tokens=["<unk>", "<pad>"])
+        self.tokenizer.train_from_iterator(syllable_texts, trainer)
+        self.trained = True
+    def encode(self, text, lang):
+        """Return BPE tokens for ``text``, or its raw syllables when untrained."""
+        syllables = self.syllable_segment(text, lang)
+        if self.trained:
+            return self.tokenizer.encode(' '.join(syllables)).tokens
+        return syllables
+print("✅ BPE and S-BPE tokenization classes implemented and verified.\n")
+# --- Example Usage (Demonstration) ---
+# Runs at import time: trains both tokenizers on two tiny sentences each and
+# prints the tokens produced for a third sentence.
+print("--- Tokenizer Demonstration ---")
+# BPE Example (one Hindi and one Bengali sentence as the training corpus)
+bpe_texts = ["यह एक वाक्य है।", "এটি একটি বাক্য।"]
+bpe_tokenizer = BPETokenizer(vocab_size=50)
+bpe_tokenizer.train(bpe_texts)
+print(f"BPE Tokens: {bpe_tokenizer.encode('यह दूसरा वाक्य है।')}")
+# S-BPE Example (Tamil training corpus; 'ta' selects the Tamil syllable pattern)
+sbpe_texts = ["வணக்கம் உலகம்", "மொழி ஆய்வு"]
+sbpe_tokenizer = SyllableBPETokenizer(vocab_size=30)
+sbpe_tokenizer.train_sbpe(sbpe_texts, 'ta')
+print(f"S-BPE Tokens (Tamil): {sbpe_tokenizer.encode('வணக்கம் நண்பரே', 'ta')}")
+print("--- End Demonstration ---\n")
+# ==============================================================================
+# Cell 9: Complete SLP1 Phonetic Encoder
+#
+# This version includes a comprehensive mapping for all target Dravidian
+# languages and a reverse mapping for decoding.
+# ==============================================================================
+print("CELL 9: Defining the SLP1 phonetic encoder...")
+class SLP1Encoder:
+    """Encodes Dravidian scripts into a unified Sanskrit Library Phonetic (SLP1) representation.
+
+    Encoding is a per-character table lookup: characters present in
+    ``slp1_mapping`` are replaced, all others are passed through unchanged.
+    The table is many-to-one (the same SLP1 symbol is used for the
+    corresponding letter of each script), so decoding cannot recover the
+    original script in general.
+    """
+    def __init__(self):
+        # Comprehensive mapping covering Tamil, Telugu, Kannada, and Malayalam
+        # NOTE(review): dependent vowel signs (e.g. 'ா') have no entry here, so
+        # encode() leaves them as native-script characters in the output.
+        self.slp1_mapping = {
+            # Vowels (Common and specific)
+            'அ': 'a', 'ஆ': 'A', 'இ': 'i', 'ஈ': 'I', 'உ': 'u', 'ஊ': 'U', 'எ': 'e', 'ஏ': 'E', 'ஐ': 'E', 'ஒ': 'o', 'ஓ': 'O', 'ஔ': 'O',
+            'అ': 'a', 'ఆ': 'A', 'ఇ': 'i', 'ఈ': 'I', 'ఉ': 'u', 'ఊ': 'U', 'ఋ': 'f', 'ౠ': 'F', 'ఎ': 'e', 'ఏ': 'E', 'ఐ': 'E', 'ఒ': 'o', 'ఓ': 'O', 'ఔ': 'O',
+            'ಅ': 'a', 'ಆ': 'A', 'ಇ': 'i', 'ಈ': 'I', 'ಉ': 'u', 'ಊ': 'U', 'ಋ': 'f', 'ಎ': 'e', 'ಏ': 'E', 'ಐ': 'E', 'ಒ': 'o', 'ಓ': 'O', 'ಔ': 'O',
+            'അ': 'a', 'ആ': 'A', 'ഇ': 'i', 'ഈ': 'I', 'ഉ': 'u', 'ഊ': 'U', 'ഋ': 'f', 'എ': 'e', 'ഏ': 'E', 'ഐ': 'E', 'ഒ': 'o', 'ഓ': 'O', 'ഔ': 'O',
+            # Consonants (Common and specific)
+            'க': 'k', 'ங': 'N', 'ச': 'c', 'ஞ': 'J', 'ட': 'w', 'ண': 'R', 'த': 't', 'ந': 'n', 'ப': 'p', 'ம': 'm', 'ய': 'y', 'ர': 'r', 'ல': 'l', 'வ': 'v', 'ழ': 'L', 'ள': 'x', 'ற': 'f', 'ன': 'F',
+            'క': 'k', 'ఖ': 'K', 'గ': 'g', 'ఘ': 'G', 'ఙ': 'N', 'చ': 'c', 'ఛ': 'C', 'జ': 'j', 'ఝ': 'J', 'ఞ': 'Y', 'ట': 'w', 'ఠ': 'W', 'డ': 'q', 'ఢ': 'Q', 'ణ': 'R', 'త': 't', 'థ': 'T', 'ద': 'd', 'ధ': 'D', 'న': 'n', 'ప': 'p', 'ఫ': 'P', 'బ': 'b', 'భ': 'B', 'మ': 'm', 'య': 'y', 'ర': 'r', 'ల': 'l', 'వ': 'v', 'శ': 'S', 'ష': 's', 'స': 'z', 'హ': 'h',
+            'ಕ': 'k', 'ಖ': 'K', 'ಗ': 'g', 'ಘ': 'G', 'ಙ': 'N', 'ಚ': 'c', 'ಛ': 'C', 'ಜ': 'j', 'ಝ': 'J', 'ಞ': 'Y', 'ಟ': 'w', 'ಠ': 'W', 'ಡ': 'q', 'ಢ': 'Q', 'ಣ': 'R', 'ತ': 't', 'ಥ': 'T', 'ದ': 'd', 'ಧ': 'D', 'ನ': 'n', 'ಪ': 'p', 'ಫ': 'P', 'ಬ': 'b', 'ಭ': 'B', 'ಮ': 'm', 'ಯ': 'y', 'ರ': 'r', 'ಲ': 'l', 'ವ': 'v', 'ಶ': 'S', 'ಷ': 's', 'ಸ': 'z', 'ಹ': 'h',
+            'ക': 'k', 'ഖ': 'K', 'ഗ': 'g', 'ഘ': 'G', 'ങ': 'N', 'ച': 'c', 'ഛ': 'C', 'ജ': 'j', 'ഝ': 'J', 'ഞ': 'Y', 'ട': 'w', 'ഠ': 'W', 'ഡ': 'q', 'ഢ': 'Q', 'ണ': 'R', 'ത': 't', 'ഥ': 'T', 'ദ': 'd', 'ധ': 'D', 'ന': 'n', 'പ': 'p', 'ഫ': 'P', 'ബ': 'b', 'ഭ': 'B', 'മ': 'm', 'യ': 'y', 'ര': 'r', 'ല': 'l', 'വ': 'v', 'ശ': 'S', 'ഷ': 's', 'സ': 'z', 'ഹ': 'h',
+            # Grantha script consonants often used in Tamil and Malayalam
+            'ஜ': 'j', 'ஷ': 'S', 'ஸ': 's', 'ஹ': 'h',
+            # Common diacritics ('்' maps to the empty string, i.e. it is dropped)
+            '்': '', 'ಂ': 'M', 'ः': 'H', 'ം': 'M'
+        }
+        # Build the reverse mapping for decoding. Conflicts are NOT resolved:
+        # when several native characters share one SLP1 symbol, the character
+        # that appears last in slp1_mapping wins (plain dict overwrite). The
+        # '' key produced by the '்' entry is never looked up, because
+        # decode() queries one character at a time.
+        self.reverse_mapping = {v: k for k, v in self.slp1_mapping.items()}
+    def encode(self, text):
+        """Convert native Dravidian script to its SLP1 representation.
+
+        Returns "" for empty or None input; unmapped characters are kept as-is.
+        """
+        if not text:
+            return ""
+        return "".join([self.slp1_mapping.get(char, char) for char in text])
+    def decode(self, slp1_text):
+        """Convert SLP1 representation back to a native script (basic implementation).
+
+        Each symbol is replaced with whichever native character reverse_mapping
+        holds for it, so the output can mix scripts and is not an exact inverse
+        of encode(). Returns "" for empty or None input.
+        """
+        if not slp1_text:
+            return ""
+        return "".join([self.reverse_mapping.get(char, char) for char in slp1_text])
+# Module-level encoder instance; process_dravidian_asr uses it later in the file.
+slp1_encoder = SLP1Encoder()
+print("✅ Complete SLP1 encoder ready.")
+print(f"🔤 Total character mappings: {len(slp1_encoder.slp1_mapping)}\n")
+# --- Example Usage (Demonstration) ---
+# Runs at import time: encodes one sample word per script and prints the result.
+print("--- SLP1 Encoder Demonstration ---")
+test_cases = [
+    ("கல்வி", "Tamil"),
+    ("విద్య", "Telugu"),
+    ("ಶಿಕ್ಷಣ", "Kannada"),
+    ("വിദ്യാഭ്യാസം", "Malayalam")
+]
+for text, lang in test_cases:
+    encoded = slp1_encoder.encode(text)
+    print(f"   {lang}: {text} → {encoded}")
+print("--- End Demonstration ---\n")
+print("CELL 10: Defining family-specific ASR processing functions...")
+def process_indo_aryan_asr(audio_path, detected_lang):
+    """Transcribe ``audio_path`` with IndicConformer for language ``detected_lang``.
+
+    Returns:
+        The transcription, or a placeholder/error string when the model is not
+        loaded or any step raises. Failures are reported in the return value,
+        never raised.
+    """
+    if indicconformer_model is None:
+        return "[IndicConformer model not loaded]"
+    try:
+        waveform, _ = preprocess_audio(audio_path)
+        # Positional arguments: waveform, language code, decoding strategy
+        # ("ctc" or "rnnt").
+        # NOTE(review): confirm the model returns a sequence of transcriptions;
+        # if it returns a plain str, `[0]` yields only its first character.
+        outputs = indicconformer_model(waveform, detected_lang, "ctc")
+        return outputs[0]
+    except Exception as e:
+        return f"Error in Indo-Aryan ASR: {e}"
+def process_dravidian_asr(audio_path, detected_lang):
+    """Transcribe ``audio_path`` with the Wav2Vec2 CTC model and SLP1-encode the result.
+
+    Args:
+        audio_path: path of the audio file.
+        detected_lang: language code, used to choose the syllable pattern for
+            the analysis-only S-BPE tokenization.
+
+    Returns:
+        ``(transcription, slp1_encoding)``. When the model is not loaded or
+        transcription fails, the first item is a placeholder/error string and
+        the second is "".
+    """
+    if indicwav2vec_model is None or indicwav2vec_processor is None:
+        return "[Dravidian ASR model not loaded]", ""
+    try:
+        waveform, sr = preprocess_audio(audio_path)
+        input_values = indicwav2vec_processor(waveform.squeeze().numpy(), sampling_rate=sr, return_tensors="pt").input_values
+        with torch.no_grad():
+            logits = indicwav2vec_model(input_values).logits
+        # Greedy CTC decoding: most likely token id per frame.
+        predicted_ids = torch.argmax(logits, dim=-1)
+        transcription = indicwav2vec_processor.batch_decode(predicted_ids)[0]
+    except Exception as e:
+        return f"Error in Dravidian ASR: {e}", ""
+    # The S-BPE tokens are printed for analysis only. Keep this step in its own
+    # try block so that a tokenizer failure cannot discard a good transcription.
+    try:
+        analysis_tokenizer = SyllableBPETokenizer()
+        analysis_tokenizer.train_sbpe([transcription], detected_lang)
+        syllable_tokens = analysis_tokenizer.encode(transcription, detected_lang)
+        print(f"   S-BPE Tokens (for analysis): {syllable_tokens}")
+    except Exception as e:
+        print(f"   ⚠️ S-BPE analysis skipped: {e}")
+    return transcription, slp1_encoder.encode(transcription)
+def process_low_resource_asr(audio_path, detected_lang):
+    """Transcribe a low-resource language through its mapped transfer language.
+
+    The transfer language comes from TRANSFER_MAPPING, defaulting to 'hi', and
+    the work is delegated to process_indo_aryan_asr.
+    """
+    if detected_lang in TRANSFER_MAPPING:
+        transfer_lang = TRANSFER_MAPPING[detected_lang]
+    else:
+        transfer_lang = 'hi'
+    print(f"   Using transfer learning: {detected_lang} -> {transfer_lang}")
+    return process_indo_aryan_asr(audio_path, transfer_lang)
+print("✅ Family-specific ASR functions ready.\n")
+print("CELL 11: Defining the main processing pipeline...")
+def complete_speech_to_text_pipeline(audio_path):
+    """Run language identification and family-specific ASR on one file.
+
+    The detected language selects the Indo-Aryan, Dravidian or low-resource
+    branch. Any other language yields a "not supported" message.
+
+    Returns:
+        A dict with keys audio_file, full_path, detected_language,
+        language_family, confidence, transcription, slp1_encoding, status
+        ("Success" or "Failed") and timestamp.
+    """
+    print(f"\n🎵 Processing: {os.path.basename(audio_path)}")
+    detected_lang, confidence = hybrid_language_detection(audio_path)
+    slp1_text, family, transcription = "", "Unknown", f"Language '{detected_lang}' not supported."
+    if detected_lang in INDO_ARYAN_LANGS:
+        family = "Indo-Aryan"
+        transcription = process_indo_aryan_asr(audio_path, detected_lang)
+    elif detected_lang in DRAVIDIAN_LANGS:
+        family = "Dravidian"
+        transcription, slp1_text = process_dravidian_asr(audio_path, detected_lang)
+    elif detected_lang in LOW_RESOURCE_LANGS:
+        family = "Low-Resource"
+        transcription = process_low_resource_asr(audio_path, detected_lang)
+    # The ASR helpers report problems through their return strings. Besides
+    # "error" and "not supported", the "[... model not loaded]" placeholders
+    # must count as failures too. The emptiness check comes first so that an
+    # empty result is never passed to .lower().
+    failure_markers = ("error", "not supported", "not loaded")
+    lowered = transcription.lower() if transcription else ""
+    failed = not transcription or any(marker in lowered for marker in failure_markers)
+    status = "Failed" if failed else "Success"
+    print(f"   Transcription: {transcription}")
+    return {
+        'audio_file': os.path.basename(audio_path),
+        'full_path': audio_path,
+        'detected_language': detected_lang,
+        'language_family': family, 'confidence': round(confidence, 3), 'transcription': transcription,
+        'slp1_encoding': slp1_text, 'status': status, 'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    }
+def batch_process_audio_files(audio_files):
+    """Run the full pipeline on every file and print the overall success rate.
+
+    Returns:
+        The list of per-file result dicts; an empty list when ``audio_files``
+        is empty or None.
+    """
+    if not audio_files:
+        print("❌ No audio files to process.")
+        return []
+    results = []
+    for path in audio_files:
+        results.append(complete_speech_to_text_pipeline(path))
+    success_count = len([r for r in results if r['status'] == 'Success'])
+    total = len(results)
+    success_rate = (success_count / total) * 100 if results else 0
+    print(f"\n🎉 Batch processing completed! Success rate: {success_rate:.1f}% ({success_count}/{total})")
+    return results
+print("✅ Main pipeline ready.\n")
+print("CELL 12: Defining report generation and main execution logic...")
+def generate_excel_report(results):
+    """Write the pipeline results to an Excel workbook and try to download it.
+
+    The workbook has a 'Detailed_Results' sheet (one row per file, plus the
+    ground-truth language read from the file path and an is_correct flag) and
+    a 'Summary' sheet with totals and the overall LID accuracy.
+
+    Args:
+        results: list of result dicts from complete_speech_to_text_pipeline.
+
+    Returns:
+        The workbook file name, or None when ``results`` is empty.
+    """
+    if not results: return None
+    df = pd.DataFrame(results)
+    def get_ground_truth(path):
+        # The path component closest to the file that equals a supported
+        # language code is the label. No length restriction, so the 3-letter
+        # codes (kok, brx, mni, sat, doi) are recognised as well.
+        for part in reversed(path.split('/')):
+            if part in ALL_SUPPORTED_LANGS: return part
+        return "unknown"
+    df['ground_truth'] = df['full_path'].apply(get_ground_truth)
+    df['is_correct'] = df['detected_language'] == df['ground_truth']
+    filename = f"ASR_Evaluation_Report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
+    with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
+        df.to_excel(writer, sheet_name='Detailed_Results', index=False)
+        # Summary Sheet
+        summary_data = {
+            'Metric': ['Total Files', 'Successful Transcriptions', 'Overall LID Accuracy'],
+            'Value': [len(df), df['status'].eq('Success').sum(), f"{df['is_correct'].mean()*100:.2f}%"]
+        }
+        pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary', index=False)
+    print(f"\n✅ Comprehensive Excel report generated: {filename}")
+    # The export kept only the `except` half of the auto-download step, which
+    # is a SyntaxError. Restore the `try`, using the google.colab `files`
+    # helper imported at the top of the file. The download stays best effort.
+    try:
+        files.download(filename)
+    except Exception as e: print(f"   Could not auto-download file: {e}")
+    return filename
+# --- MAIN EXECUTION ---
+# Runs at import time: prompt for audio, process every file, write the report.
+print("\n🚀🚀🚀 Starting the Full ASR Pipeline 🚀🚀🚀")
+audio_files_to_process = get_audio_files()
+if not audio_files_to_process:
+    print("\nNo audio files were selected. Exiting.")
+else:
+    pipeline_results = batch_process_audio_files(audio_files_to_process)
+    generate_excel_report(pipeline_results)

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+datasets
+numpy
+pandas
+sentencepiece
+torch
+transformers