kasimali committed
Commit aa5b88b · verified · 1 Parent(s): e7e41d4

Upload folder using huggingface_hub

Files changed (3)
  1. README.md +3 -6
  2. app.py +484 -0
  3. requirements.txt +6 -0
README.md CHANGED
@@ -1,10 +1,7 @@
 ---
-title: New Asr Voxlingua
-emoji: 🌍
-colorFrom: yellow
-colorTo: yellow
+title: NEW-ASR-VOXLINGUA
+emoji: 🚀
 sdk: static
-pinned: false
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# NEW-ASR-VOXLINGUA
app.py ADDED
@@ -0,0 +1,484 @@
+# NEW-ASR-VOXLINGUA
+
+# ==============================================================================
+# Cell 1: Environment Setup & Dependencies
+#
+# CORRECTED: Forcing SpeechBrain to version 0.5.16 to ensure backward
+# compatibility with the old TalTechNLP XLS-R model.
+# ==============================================================================
+print("CELL 1: Setting up the environment with a specific SpeechBrain version...")
+
+# --- CORE CORRECTION ---
+# Uninstall any existing newer version and install the last stable release of
+# the 0.5.x line, which is compatible with the old TalTechNLP model's file paths.
+# --- END CORRECTION ---
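+# A minimal sketch of doing the pin programmatically; the original cell likely
+# used Colab `!pip` magics, which are not valid in a plain .py file
+# (assumption: pip is available in the running environment):
+import subprocess, sys
+subprocess.run([sys.executable, "-m", "pip", "install", "--quiet",
+                "speechbrain==0.5.16"], check=True)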
+
+import torch
+print("\n--- System Check ---")
+if torch.cuda.is_available():
+    print(f"✅ GPU found: {torch.cuda.get_device_name(0)}")
+    print(f"   CUDA Version: {torch.version.cuda}")
+else:
+    print("⚠️ GPU not found. Using CPU. This will be significantly slower.")
+print("--- End System Check ---\n")
+
+# Verify which SpeechBrain version ended up installed. `pip show` is a shell
+# command, not Python, so it stays commented here (in Colab: !pip show speechbrain):
+# pip show speechbrain
+
+
+print("CELL 2: Importing libraries and setting up language maps...")
+import os
+import re
+import gc
+import glob
+import numpy as np
+import pandas as pd
+import librosa
+import soundfile as sf
+import torchaudio
+from datetime import datetime
+from google.colab import files
+import subprocess
+import shutil
+
+# Transformers and ML libraries
+from transformers import AutoModel, Wav2Vec2Processor, Wav2Vec2ForCTC
+# With SpeechBrain pinned to 0.5.x (Cell 1), the classifier interfaces live
+# under speechbrain.pretrained; speechbrain.inference only exists in >= 1.0.
+from speechbrain.pretrained import EncoderClassifier
+from speechbrain.pretrained.interfaces import foreign_class
+from tokenizers import Tokenizer, models, trainers, pre_tokenizers
+
+import warnings
+warnings.filterwarnings('ignore')
+
+# Complete language mappings as sets for O(1) lookup
+INDO_ARYAN_LANGS = {'hi', 'bn', 'mr', 'gu', 'pa', 'or', 'as', 'ur', 'ks', 'sd', 'ne', 'kok'}
+DRAVIDIAN_LANGS = {'ta', 'te', 'kn', 'ml'}
+LOW_RESOURCE_LANGS = {'brx', 'mni', 'sat', 'doi'}
+
+# Research-verified cross-lingual transfer mapping
+TRANSFER_MAPPING = {'brx': 'hi', 'sat': 'hi', 'doi': 'pa', 'mni': 'bn'}
+ALL_SUPPORTED_LANGS = INDO_ARYAN_LANGS | DRAVIDIAN_LANGS | LOW_RESOURCE_LANGS
+
+print("✅ Libraries imported successfully.")
+print(f"📊 Total languages supported: {len(ALL_SUPPORTED_LANGS)}\n")
+
+print("CELL 3: Defining audio preprocessing functions...")
+SUPPORTED_FORMATS = {'.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac'}
+
+def validate_audio_format(audio_path):
+    ext = os.path.splitext(audio_path)[1].lower()
+    if ext not in SUPPORTED_FORMATS:
+        raise ValueError(f"Unsupported audio format: {ext}. Supported: {SUPPORTED_FORMATS}")
+    return True
+
+def preprocess_audio(audio_path, target_sr=16000):
+    validate_audio_format(audio_path)
+    try:
+        waveform, sr = torchaudio.load(audio_path)
+    except Exception:
+        # librosa returns a NumPy array; convert it to a (1, time) tensor
+        waveform, sr = librosa.load(audio_path, sr=None)
+        waveform = torch.tensor(waveform).unsqueeze(0)
+
+    # Downmix multi-channel audio to mono, then resample to the target rate
+    if waveform.shape[0] > 1:
+        waveform = torch.mean(waveform, dim=0, keepdim=True)
+    if sr != target_sr:
+        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
+        waveform = resampler(waveform)
+    return waveform, target_sr
+
+print("✅ Audio preprocessing functions ready.\n")
+
+print("CELL 4: Defining file handling functions...")
+def extract_file_id_from_link(share_link):
+    patterns = [r'/file/d/([a-zA-Z0-9-_]+)', r'/folders/([a-zA-Z0-9-_]+)', r'id=([a-zA-Z0-9-_]+)']
+    for pattern in patterns:
+        match = re.search(pattern, share_link)
+        if match:
+            return match.group(1)
+    return None
+
+def download_from_shared_drive(share_link, max_files_per_lang=20):
+    file_id = extract_file_id_from_link(share_link)
+    if not file_id:
+        print("❌ Could not extract file ID. Please check your sharing link.")
+        return []
+
+    download_dir = "/content/shared_dataset"
+    if os.path.exists(download_dir):
+        shutil.rmtree(download_dir)
+    os.makedirs(download_dir, exist_ok=True)
+
+    print(f"✅ Extracted ID: {file_id}. Starting download...")
+    try:
+        import gdown
+        gdown.download_folder(f"https://drive.google.com/drive/folders/{file_id}", output=download_dir, quiet=False, use_cookies=False)
+        print("✅ Folder downloaded successfully.")
+    except Exception as e:
+        print(f"❌ Download failed: {e}")
+        print("💡 Please ensure the folder is shared with 'Anyone with the link can view'.")
+        return []
+
+    print("\n🔍 Scanning for audio files...")
+    all_audio_files = [p for ext in SUPPORTED_FORMATS for p in glob.glob(os.path.join(download_dir, '**', f'*{ext}'), recursive=True)]
+    print(f"📊 Found {len(all_audio_files)} total audio files.")
+
+    # Group files by the language-code folder they sit in
+    lang_folders = {d: [] for d in os.listdir(download_dir) if os.path.isdir(os.path.join(download_dir, d))}
+    for f in all_audio_files:
+        lang_code = os.path.basename(os.path.dirname(f))
+        if lang_code in lang_folders:
+            lang_folders[lang_code].append(f)
+
+    final_file_list = []
+    print("\nLimiting files per language:")
+    # Loop variable named lang_files to avoid shadowing the google.colab `files` module
+    for lang, lang_files in lang_folders.items():
+        if len(lang_files) > max_files_per_lang:
+            print(f"  {lang}: Limiting to {max_files_per_lang} files (from {len(lang_files)})")
+            final_file_list.extend(lang_files[:max_files_per_lang])
+        else:
+            print(f"  {lang}: Found {len(lang_files)} files")
+            final_file_list.extend(lang_files)
+    return final_file_list
+
+def get_audio_files():
+    print("\n🎯 Choose your audio source:")
+    print("1. Upload files from computer")
+    print("2. Download from Google Drive sharing link")
+    choice = input("Enter choice (1/2): ").strip()
+
+    if choice == '1':
+        uploaded = files.upload()
+        return [f"/content/{fname}" for fname in uploaded.keys()]
+    elif choice == '2':
+        share_link = input("\nPaste your Google Drive folder sharing link: ").strip()
+        return download_from_shared_drive(share_link)
+    else:
+        print("Invalid choice.")
+        return []
+
+print("✅ File handling functions ready.\n")
+
+print("CELL 5: Loading Language Identification (LID) Models...")
+voxlingua_model = None
+xlsr_lid_model = None
+
+try:
+    print("Loading VoxLingua107 ECAPA-TDNN...")
+    voxlingua_model = EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa", savedir="pretrained_models/voxlingua107")
+    print("✅ VoxLingua107 loaded.")
+except Exception as e:
+    print(f"❌ VoxLingua107 error: {e}")
+
+try:
+    print("\nLoading TalTechNLP XLS-R LID...")
+    xlsr_lid_model = foreign_class(source="TalTechNLP/voxlingua107-xls-r-300m-wav2vec", pymodule_file="encoder_wav2vec_classifier.py", classname="EncoderWav2vecClassifier", hparams_file="inference_wav2vec.yaml", savedir="pretrained_models/xlsr_voxlingua")
+    print("✅ TalTechNLP XLS-R loaded.")
+except Exception as e:
+    print(f"❌ XLS-R error: {e}. Pipeline will proceed with primary LID model only.")
+
+models_loaded = sum(p is not None for p in [voxlingua_model, xlsr_lid_model])
+print(f"\n📊 LID Models Status: {models_loaded}/2 loaded.\n")
+
+print("CELL 6: Defining hybrid language detection system...")
+def hybrid_language_detection(audio_path):
+    # Validates the format up front; classify_file reloads from the path itself
+    waveform, sr = preprocess_audio(audio_path)
+    results, confidences = {}, {}
+
+    if voxlingua_model:
+        try:
+            pred = voxlingua_model.classify_file(audio_path)
+            lang_code = str(pred[3][0]).split(':')[0].strip()
+            confidence = float(pred[1].exp().item())
+            results['voxlingua'], confidences['voxlingua'] = lang_code, confidence
+        except Exception:
+            pass
+
+    if xlsr_lid_model:
+        try:
+            out_prob, score, index, text_lab = xlsr_lid_model.classify_file(audio_path)
+            lang_code = str(text_lab[0]).strip().lower()
+            confidence = float(out_prob.exp().max().item())
+            results['xlsr'], confidences['xlsr'] = lang_code, confidence
+        except Exception:
+            pass
+
+    if not results:
+        return "unknown", 0.0
+    # If both models agree, average their confidences; otherwise keep the
+    # label from the more confident model.
+    if len(results) == 2 and results['voxlingua'] == results['xlsr']:
+        return results['voxlingua'], (confidences['voxlingua'] + confidences['xlsr']) / 2
+
+    best_model = max(confidences, key=confidences.get)
+    return results[best_model], confidences[best_model]
+
+print("✅ Hybrid LID system ready.\n")
+
+print("CELL 7: Loading Automatic Speech Recognition (ASR) Models...")
+indicconformer_model = None
+indicwav2vec_processor = None
+indicwav2vec_model = None
+
+try:
+    print("Loading IndicConformer for Indo-Aryan...")
+    indicconformer_model = AutoModel.from_pretrained("ai4bharat/indic-conformer-600m-multilingual", trust_remote_code=True)
+    print("✅ IndicConformer loaded.")
+except Exception as e:
+    print(f"❌ IndicConformer Error: {e}. Indo-Aryan transcription will be unavailable.")
+
+# Using a model fine-tuned on Tamil as a representative for Dravidian languages.
+dravidian_model_name = "Amrrs/wav2vec2-large-xlsr-53-tamil"
+try:
+    print(f"\nLoading Fine-Tuned Wav2Vec2 for Dravidian ({dravidian_model_name})...")
+    indicwav2vec_processor = Wav2Vec2Processor.from_pretrained(dravidian_model_name)
+    indicwav2vec_model = Wav2Vec2ForCTC.from_pretrained(dravidian_model_name)
+    print("✅ Fine-Tuned Wav2Vec2 loaded.")
+except Exception as e:
+    print(f"❌ Wav2Vec2 Error: {e}. Dravidian transcription will be unavailable.")
+
+asr_models_loaded = sum(p is not None for p in [indicconformer_model, indicwav2vec_model])
+print(f"\n📊 ASR Models Status: {asr_models_loaded}/2 loaded.\n")
+
+# ==============================================================================
+# Cell 8: BPE and Syllable-BPE Tokenization Classes
+#
+# This version correctly handles untrained tokenizers and has improved
+# regex for more accurate syllable segmentation.
+# ==============================================================================
+print("CELL 8: Defining tokenization classes...")
+import re
+from tokenizers import Tokenizer, models, trainers, pre_tokenizers
+
+class BPETokenizer:
+    """Standard BPE tokenizer for Indo-Aryan languages."""
+    def __init__(self, vocab_size=5000):
+        self.tokenizer = Tokenizer(models.BPE())
+        self.tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
+        self.trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["<unk>", "<pad>"])
+        self.trained = False
+
+    def train(self, texts):
+        """Train the BPE tokenizer on a text corpus."""
+        self.tokenizer.train_from_iterator(texts, self.trainer)
+        self.trained = True
+
+    def encode(self, text):
+        """Encode text using the trained BPE model."""
+        if not self.trained:
+            # Fallback for an untrained tokenizer
+            return text.split()
+        return self.tokenizer.encode(text).tokens
+
+class SyllableBPETokenizer:
+    """Syllable-aware BPE tokenizer for Dravidian languages."""
+    def __init__(self, vocab_size=3000):
+        self.vocab_size = vocab_size
+        self.patterns = {
+            'ta': r'[க-ஹ][ா-ௌ]?|[அ-ஔ]',  # Tamil
+            'te': r'[క-హ][ా-ౌ]?|[అ-ఔ]',  # Telugu
+            'kn': r'[ಕ-ಹ][ಾ-ೌ]?|[ಅ-ಔ]',  # Kannada
+            'ml': r'[ക-ഹ][ാ-ൌ]?|[അ-ഔ]'   # Malayalam
+        }
+        self.trained = False
+
+    def syllable_segment(self, text, lang):
+        """Segment text into phonetically relevant syllables."""
+        pattern = self.patterns.get(lang, r'\S+')  # fall back to whitespace tokens for other languages
+        syllables = re.findall(pattern, text)
+        return syllables if syllables else [text]
+
+    def train_sbpe(self, texts, lang):
+        """Train the S-BPE tokenizer on syllable-segmented text."""
+        syllable_texts = [' '.join(self.syllable_segment(t, lang)) for t in texts]
+        self.tokenizer = Tokenizer(models.BPE())
+        trainer = trainers.BpeTrainer(vocab_size=self.vocab_size, special_tokens=["<unk>", "<pad>"])
+        self.tokenizer.train_from_iterator(syllable_texts, trainer)
+        self.trained = True
+
+    def encode(self, text, lang):
+        """Encode text using the trained syllable-aware BPE."""
+        syllables = self.syllable_segment(text, lang)
+        if not self.trained:
+            # If not trained, return the raw syllables as a fallback
+            return syllables
+        syllable_text = ' '.join(syllables)
+        return self.tokenizer.encode(syllable_text).tokens
+
+print("✅ BPE and S-BPE tokenization classes implemented and verified.\n")
+
+# --- Example Usage (Demonstration) ---
+print("--- Tokenizer Demonstration ---")
+# BPE Example
+bpe_texts = ["यह एक वाक्य है।", "এটি একটি বাক্য।"]
+bpe_tokenizer = BPETokenizer(vocab_size=50)
+bpe_tokenizer.train(bpe_texts)
+print(f"BPE Tokens: {bpe_tokenizer.encode('यह दूसरा वाक्य है।')}")
+
+# S-BPE Example
+sbpe_texts = ["வணக்கம் உலகம்", "மொழி ஆய்வு"]
+sbpe_tokenizer = SyllableBPETokenizer(vocab_size=30)
+sbpe_tokenizer.train_sbpe(sbpe_texts, 'ta')
+print(f"S-BPE Tokens (Tamil): {sbpe_tokenizer.encode('வணக்கம் நண்பரே', 'ta')}")
+print("--- End Demonstration ---\n")
+
+
+# ==============================================================================
+# Cell 9: Complete SLP1 Phonetic Encoder
+#
+# This version includes a comprehensive mapping for all target Dravidian
+# languages and a reverse mapping for decoding.
+# ==============================================================================
+print("CELL 9: Defining the SLP1 phonetic encoder...")
+
+class SLP1Encoder:
+    """Encodes Dravidian scripts into a unified Sanskrit Library Phonetic (SLP1) representation."""
+
+    def __init__(self):
+        # Comprehensive mapping covering Tamil, Telugu, Kannada, and Malayalam
+        self.slp1_mapping = {
+            # Vowels (common and script-specific)
+            'அ': 'a', 'ஆ': 'A', 'இ': 'i', 'ஈ': 'I', 'உ': 'u', 'ஊ': 'U', 'எ': 'e', 'ஏ': 'E', 'ஐ': 'E', 'ஒ': 'o', 'ஓ': 'O', 'ஔ': 'O',
+            'అ': 'a', 'ఆ': 'A', 'ఇ': 'i', 'ఈ': 'I', 'ఉ': 'u', 'ఊ': 'U', 'ఋ': 'f', 'ౠ': 'F', 'ఎ': 'e', 'ఏ': 'E', 'ఐ': 'E', 'ఒ': 'o', 'ఓ': 'O', 'ఔ': 'O',
+            'ಅ': 'a', 'ಆ': 'A', 'ಇ': 'i', 'ಈ': 'I', 'ಉ': 'u', 'ಊ': 'U', 'ಋ': 'f', 'ಎ': 'e', 'ಏ': 'E', 'ಐ': 'E', 'ಒ': 'o', 'ಓ': 'O', 'ಔ': 'O',
+            'അ': 'a', 'ആ': 'A', 'ഇ': 'i', 'ഈ': 'I', 'ഉ': 'u', 'ഊ': 'U', 'ഋ': 'f', 'എ': 'e', 'ഏ': 'E', 'ഐ': 'E', 'ഒ': 'o', 'ഓ': 'O', 'ഔ': 'O',
+            # Consonants (common and script-specific)
+            'க': 'k', 'ங': 'N', 'ச': 'c', 'ஞ': 'J', 'ட': 'w', 'ண': 'R', 'த': 't', 'ந': 'n', 'ப': 'p', 'ம': 'm', 'ய': 'y', 'ர': 'r', 'ல': 'l', 'வ': 'v', 'ழ': 'L', 'ள': 'x', 'ற': 'f', 'ன': 'F',
+            'క': 'k', 'ఖ': 'K', 'గ': 'g', 'ఘ': 'G', 'ఙ': 'N', 'చ': 'c', 'ఛ': 'C', 'జ': 'j', 'ఝ': 'J', 'ఞ': 'Y', 'ట': 'w', 'ఠ': 'W', 'డ': 'q', 'ఢ': 'Q', 'ణ': 'R', 'త': 't', 'థ': 'T', 'ద': 'd', 'ధ': 'D', 'న': 'n', 'ప': 'p', 'ఫ': 'P', 'బ': 'b', 'భ': 'B', 'మ': 'm', 'య': 'y', 'ర': 'r', 'ల': 'l', 'వ': 'v', 'శ': 'S', 'ష': 's', 'స': 'z', 'హ': 'h',
+            'ಕ': 'k', 'ಖ': 'K', 'ಗ': 'g', 'ಘ': 'G', 'ಙ': 'N', 'ಚ': 'c', 'ಛ': 'C', 'ಜ': 'j', 'ಝ': 'J', 'ಞ': 'Y', 'ಟ': 'w', 'ಠ': 'W', 'ಡ': 'q', 'ಢ': 'Q', 'ಣ': 'R', 'ತ': 't', 'ಥ': 'T', 'ದ': 'd', 'ಧ': 'D', 'ನ': 'n', 'ಪ': 'p', 'ಫ': 'P', 'ಬ': 'b', 'ಭ': 'B', 'ಮ': 'm', 'ಯ': 'y', 'ರ': 'r', 'ಲ': 'l', 'ವ': 'v', 'ಶ': 'S', 'ಷ': 's', 'ಸ': 'z', 'ಹ': 'h',
+            'ക': 'k', 'ഖ': 'K', 'ഗ': 'g', 'ഘ': 'G', 'ങ': 'N', 'ച': 'c', 'ഛ': 'C', 'ജ': 'j', 'ഝ': 'J', 'ഞ': 'Y', 'ട': 'w', 'ഠ': 'W', 'ഡ': 'q', 'ഢ': 'Q', 'ണ': 'R', 'ത': 't', 'ഥ': 'T', 'ദ': 'd', 'ധ': 'D', 'ന': 'n', 'പ': 'p', 'ഫ': 'P', 'ബ': 'b', 'ഭ': 'B', 'മ': 'm', 'യ': 'y', 'ര': 'r', 'ല': 'l', 'വ': 'v', 'ശ': 'S', 'ഷ': 's', 'സ': 'z', 'ഹ': 'h',
+            # Grantha script consonants often used in Tamil and Malayalam
+            'ஜ': 'j', 'ஷ': 'S', 'ஸ': 's', 'ஹ': 'h',
+            # Common diacritics
+            '்': '', 'ಂ': 'M', 'ः': 'H', 'ം': 'M'
+        }
+        # Build a reverse mapping for decoding. Several source characters share
+        # an SLP1 symbol, so later entries overwrite earlier ones and decoding
+        # is only approximate.
+        self.reverse_mapping = {v: k for k, v in self.slp1_mapping.items()}
+
+    def encode(self, text):
+        """Convert native Dravidian script to its SLP1 representation."""
+        if not text:
+            return ""
+        return "".join(self.slp1_mapping.get(char, char) for char in text)
+
+    def decode(self, slp1_text):
+        """Convert SLP1 back to a native script (basic, approximate implementation)."""
+        if not slp1_text:
+            return ""
+        return "".join(self.reverse_mapping.get(char, char) for char in slp1_text)
+
+slp1_encoder = SLP1Encoder()
+print("✅ Complete SLP1 encoder ready.")
+print(f"🔤 Total character mappings: {len(slp1_encoder.slp1_mapping)}\n")
+
+# --- Example Usage (Demonstration) ---
+print("--- SLP1 Encoder Demonstration ---")
+test_cases = [
+    ("கல்வி", "Tamil"),
+    ("విద్య", "Telugu"),
+    ("ಶಿಕ್ಷಣ", "Kannada"),
+    ("വിദ്യാഭ്യാസം", "Malayalam")
+]
+for text, lang in test_cases:
+    encoded = slp1_encoder.encode(text)
+    print(f"  {lang}: {text} → {encoded}")
+print("--- End Demonstration ---\n")
+
+
+print("CELL 10: Defining family-specific ASR processing functions...")
+def process_indo_aryan_asr(audio_path, detected_lang):
+    if indicconformer_model is None:
+        return "[IndicConformer model not loaded]"
+    try:
+        waveform, sr = preprocess_audio(audio_path)
+        # The model expects a language code and a decoding strategy ("ctc" or "rnnt")
+        transcription = indicconformer_model(waveform, detected_lang, "ctc")[0]
+        return transcription
+    except Exception as e:
+        return f"Error in Indo-Aryan ASR: {e}"
+
+def process_dravidian_asr(audio_path, detected_lang):
+    if not (indicwav2vec_model and indicwav2vec_processor):
+        return "[Dravidian ASR model not loaded]", ""
+    try:
+        waveform, sr = preprocess_audio(audio_path)
+        input_values = indicwav2vec_processor(waveform.squeeze().numpy(), sampling_rate=sr, return_tensors="pt").input_values
+        with torch.no_grad():
+            logits = indicwav2vec_model(input_values).logits
+        predicted_ids = torch.argmax(logits, dim=-1)
+        transcription = indicwav2vec_processor.batch_decode(predicted_ids)[0]
+
+        # S-BPE tokenization for analysis; trained on this single transcription,
+        # so the resulting tokens are only illustrative
+        sbpe_tokenizer = SyllableBPETokenizer()
+        sbpe_tokenizer.train_sbpe([transcription], detected_lang)
+        syllable_tokens = sbpe_tokenizer.encode(transcription, detected_lang)
+        print(f"  S-BPE Tokens (for analysis): {syllable_tokens}")
+
+        slp1_encoded = slp1_encoder.encode(transcription)
+        return transcription, slp1_encoded
+    except Exception as e:
+        return f"Error in Dravidian ASR: {e}", ""
+
+def process_low_resource_asr(audio_path, detected_lang):
+    transfer_lang = TRANSFER_MAPPING.get(detected_lang, 'hi')
+    print(f"  Using transfer learning: {detected_lang} -> {transfer_lang}")
+    return process_indo_aryan_asr(audio_path, transfer_lang)
+
+print("✅ Family-specific ASR functions ready.\n")
+
+print("CELL 11: Defining the main processing pipeline...")
+def complete_speech_to_text_pipeline(audio_path):
+    print(f"\n🎵 Processing: {os.path.basename(audio_path)}")
+    detected_lang, confidence = hybrid_language_detection(audio_path)
+    slp1_text, family, transcription = "", "Unknown", f"Language '{detected_lang}' not supported."
+
+    if detected_lang in INDO_ARYAN_LANGS:
+        family = "Indo-Aryan"
+        transcription = process_indo_aryan_asr(audio_path, detected_lang)
+    elif detected_lang in DRAVIDIAN_LANGS:
+        family = "Dravidian"
+        transcription, slp1_text = process_dravidian_asr(audio_path, detected_lang)
+    elif detected_lang in LOW_RESOURCE_LANGS:
+        family = "Low-Resource"
+        transcription = process_low_resource_asr(audio_path, detected_lang)
+
+    # "not loaded" placeholders also count as failures
+    failure_markers = ("error", "not supported", "not loaded")
+    status = "Failed" if not transcription or any(m in transcription.lower() for m in failure_markers) else "Success"
+    print(f"  Transcription: {transcription}")
+
+    return {
+        'audio_file': os.path.basename(audio_path),
+        'full_path': audio_path,
+        'detected_language': detected_lang,
+        'language_family': family,
+        'confidence': round(confidence, 3),
+        'transcription': transcription,
+        'slp1_encoding': slp1_text,
+        'status': status,
+        'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    }
+
+def batch_process_audio_files(audio_files):
+    if not audio_files:
+        print("❌ No audio files to process.")
+        return []
+    results = [complete_speech_to_text_pipeline(f) for f in audio_files]
+    success_count = sum(1 for r in results if r['status'] == 'Success')
+    success_rate = (success_count / len(results)) * 100 if results else 0
+    print(f"\n🎉 Batch processing completed! Success rate: {success_rate:.1f}% ({success_count}/{len(results)})")
+    return results
+
+print("✅ Main pipeline ready.\n")
+
+print("CELL 12: Defining report generation and main execution logic...")
+def generate_excel_report(results):
+    if not results:
+        return None
+    df = pd.DataFrame(results)
+
+    def get_ground_truth(path):
+        # A parent folder named with a supported language code is treated as
+        # ground truth (codes may be 2 or 3 letters, e.g. 'ta' or 'brx')
+        parts = path.split('/')
+        for part in reversed(parts):
+            if part in ALL_SUPPORTED_LANGS:
+                return part
+        return "unknown"
+
+    df['ground_truth'] = df['full_path'].apply(get_ground_truth)
+    df['is_correct'] = df['detected_language'] == df['ground_truth']
+
+    filename = f"ASR_Evaluation_Report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
+    with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
+        df.to_excel(writer, sheet_name='Detailed_Results', index=False)
+        # Summary sheet
+        summary_data = {
+            'Metric': ['Total Files', 'Successful Transcriptions', 'Overall LID Accuracy'],
+            'Value': [len(df), df['status'].eq('Success').sum(), f"{df['is_correct'].mean()*100:.2f}%"]
+        }
+        pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary', index=False)
+
+    print(f"\n✅ Comprehensive Excel report generated: {filename}")
+    try:
+        # Auto-download the report when running in Colab
+        files.download(filename)
+    except Exception as e:
+        print(f"  Could not auto-download file: {e}")
+    return filename
+
+# --- MAIN EXECUTION ---
+print("\n🚀🚀🚀 Starting the Full ASR Pipeline 🚀🚀🚀")
+audio_files_to_process = get_audio_files()
+if audio_files_to_process:
+    pipeline_results = batch_process_audio_files(audio_files_to_process)
+    generate_excel_report(pipeline_results)
+else:
+    print("\nNo audio files were selected. Exiting.")
requirements.txt ADDED
@@ -0,0 +1,6 @@
+datasets
+numpy
+pandas
+sentencepiece
+torch
+transformers
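
Note: app.py above also imports librosa, soundfile, torchaudio, speechbrain, gdown, and tokenizers, and writes its report through the xlsxwriter engine, none of which are pinned here. A fuller list would be a sketch along these lines (an assumption, not part of this commit):

librosa
soundfile
torchaudio
speechbrain==0.5.16
gdown
tokenizers
xlsxwriter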