import gradio as gr
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import pandas as pd
import time
import re
import tempfile
import os
import shutil
import uuid

# Model loading
model_name = "dsfsi/nso-en-m2m100-gov"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
tokenizer.src_lang = "ns"  # M2M100 language code for Northern Sotho
model.config.forced_bos_token_id = tokenizer.get_lang_id("en")  # always decode into English
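# Optional tweak (not part of the original Space, shown only as a hedged sketch):
# the model runs on CPU by default. On a GPU-enabled machine it could be moved to
# CUDA, but the tokenized inputs in translate_nso_en() would then also need
# .to(device) before generation:
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model.to(device)
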
# Translation function (single)
def translate_nso_en(text):
    if not text.strip():
        return "Please enter Northern Sotho (Sepedi) text."
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    translated_tokens = model.generate(
        **inputs,
        max_length=512,
        forced_bos_token_id=tokenizer.get_lang_id("en")
    )
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
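# Usage sketch (illustrative only; the Gradio callbacks below are what actually
# call this function):
#   english = translate_nso_en("Ke leboga thušo ya gago.")
# model.generate() also accepts beam search (e.g. num_beams=5) alongside the
# arguments above, if higher-quality output is worth the extra latency.
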
# Linguistic analysis
def calculate_metrics(text):
    words = text.split()
    word_count = len(words)
    char_count = len(text)
    sentence_count = len([s for s in re.split(r'[.!?]+', text) if s.strip()])
    unique_words = len(set(words))
    avg_word_length = sum(len(w) for w in words) / word_count if word_count else 0
    lexical_div = unique_words / word_count if word_count else 0
    return {
        'char_count': char_count,
        'word_count': word_count,
        'sentence_count': sentence_count,
        'unique_words': unique_words,
        'avg_word_length': avg_word_length,
        'lexical_diversity': lexical_div
    }
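# Worked example: calculate_metrics("Ke a leboga.") tokenizes on whitespace into
# ["Ke", "a", "leboga."], giving word_count=3, char_count=12, sentence_count=1,
# unique_words=3, avg_word_length=(2+1+7)/3 ≈ 3.3 and lexical_diversity=3/3=1.0.
# Punctuation stays attached to tokens, so "leboga." contributes 7 characters.
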
def create_metrics_table(src_metrics, tgt_metrics):
    data = {
        'Metric': ['Words', 'Characters', 'Sentences', 'Unique Words', 'Avg Word Length', 'Lexical Diversity'],
        'Source Text': [
            src_metrics.get('word_count', 0),
            src_metrics.get('char_count', 0),
            src_metrics.get('sentence_count', 0),
            src_metrics.get('unique_words', 0),
            f"{src_metrics.get('avg_word_length', 0):.1f}",
            f"{src_metrics.get('lexical_diversity', 0):.3f}"
        ],
        'Target Text': [
            tgt_metrics.get('word_count', 0),
            tgt_metrics.get('char_count', 0),
            tgt_metrics.get('sentence_count', 0),
            tgt_metrics.get('unique_words', 0),
            f"{tgt_metrics.get('avg_word_length', 0):.1f}",
            f"{tgt_metrics.get('lexical_diversity', 0):.3f}"
        ]
    }
    return pd.DataFrame(data)
def translate_and_analyze(text):
    if not text.strip():
        return "Please enter Northern Sotho (Sepedi) text.", "No analysis available.", create_metrics_table({}, {})
    start = time.time()
    translated = translate_nso_en(text)
    src_metrics = calculate_metrics(text)
    tgt_metrics = calculate_metrics(translated)
    elapsed = time.time() - start
    report = f"""## Linguistic Analysis Report

### Translation Details
- **Processing Time**: {elapsed:.2f} seconds

### Text Complexity Metrics
| Metric | Source | Target | Ratio |
|--------|--------|--------|-------|
| Word Count | {src_metrics.get('word_count', 0)} | {tgt_metrics.get('word_count', 0)} | {tgt_metrics.get('word_count', 0) / max(src_metrics.get('word_count', 1), 1):.2f} |
| Character Count | {src_metrics.get('char_count', 0)} | {tgt_metrics.get('char_count', 0)} | {tgt_metrics.get('char_count', 0) / max(src_metrics.get('char_count', 1), 1):.2f} |
| Sentence Count | {src_metrics.get('sentence_count', 0)} | {tgt_metrics.get('sentence_count', 0)} | {tgt_metrics.get('sentence_count', 0) / max(src_metrics.get('sentence_count', 1), 1):.2f} |
| Avg Word Length | {src_metrics.get('avg_word_length', 0):.1f} | {tgt_metrics.get('avg_word_length', 0):.1f} | {tgt_metrics.get('avg_word_length', 0) / max(src_metrics.get('avg_word_length', 1), 1):.2f} |
| Lexical Diversity | {src_metrics.get('lexical_diversity', 0):.3f} | {tgt_metrics.get('lexical_diversity', 0):.3f} | {tgt_metrics.get('lexical_diversity', 0) / max(src_metrics.get('lexical_diversity', 0.001), 0.001):.2f} |
"""
    table = create_metrics_table(src_metrics, tgt_metrics)
    return translated, report, table
# Batch processing
def secure_batch_processing(file_obj):
    if file_obj is None:
        return "Please upload a file.", pd.DataFrame()
    temp_dir = None
    try:
        # gr.File(type="filepath") passes a plain path string; older Gradio versions
        # pass a file wrapper with a .name attribute, so handle both.
        file_path = file_obj if isinstance(file_obj, str) else file_obj.name
        session_id = str(uuid.uuid4())
        temp_dir = tempfile.mkdtemp(prefix=f"translation_{session_id}_")
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext not in ['.txt', '.csv']:
            return "Only .txt and .csv files are supported.", pd.DataFrame()
        # Enforce the 5MB limit advertised in the upload label.
        if os.path.getsize(file_path) > 5 * 1024 * 1024:
            return "The uploaded file exceeds the 5MB limit.", pd.DataFrame()
        temp_file_path = os.path.join(temp_dir, f"upload_{session_id}{file_ext}")
        shutil.copy2(file_path, temp_file_path)
        texts = []
        if file_ext == '.csv':
            df = pd.read_csv(temp_file_path)
            if df.empty:
                return "The uploaded CSV file is empty.", pd.DataFrame()
            texts = df.iloc[:, 0].dropna().astype(str).tolist()
        else:
            with open(temp_file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            texts = [line.strip() for line in content.split('\n') if line.strip()]
        if not texts:
            return "No text found in the uploaded file.", pd.DataFrame()
        max_batch_size = 10
        if len(texts) > max_batch_size:
            texts = texts[:max_batch_size]
            warning_msg = f"Processing limited to first {max_batch_size} entries for performance."
        else:
            warning_msg = ""
        results = []
        for i, text in enumerate(texts):
            if len(text.strip()) == 0:
                continue
            if len(text) > 1000:
                text = text[:1000] + "..."
            translated = translate_nso_en(text)
            results.append({
                'Index': i + 1,
                'Original': text[:100] + '...' if len(text) > 100 else text,
                'Translation': translated[:100] + '...' if len(translated) > 100 else translated
            })
        if not results:
            return "No valid text entries found to translate.", pd.DataFrame()
        results_df = pd.DataFrame(results)
        summary = f"Successfully processed {len(results)} text entries."
        if warning_msg:
            summary = f"{summary} {warning_msg}"
        return summary, results_df
    except Exception as e:
        return f"Error processing file: {str(e)}", pd.DataFrame()
    finally:
        if temp_dir and os.path.exists(temp_dir):
            try:
                shutil.rmtree(temp_dir)
            except Exception as e:
                print(f"Warning: Could not clean up temporary directory: {e}")
# Examples
EXAMPLES = [
    ["Leina la ka ke Vukosi."],
    ["Ke leboga thušo ya gago."],
    ["Re a go amogela mo Pretoria."],
    ["Go tloga ka letšatši la lehono, dilo di tlo kaonafala."],
    ["O swanetše go hwetša thušo ge go kgonega."],
    ["Ngwana o ya sekolong letšatšing le lengwe le le lengwe."]
]
# Research tools
def detailed_analysis(text):
    if not text.strip():
        return {}
    metrics = calculate_metrics(text)
    return {
        "basic_metrics": metrics,
        "text_length": len(text),
        "analysis_completed": True
    }
def create_gradio_interface():
    with gr.Blocks(
        title="Northern Sotho-English Linguistic Translation Tool",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;}
        .main-header {text-align: center; padding: 2rem 0;}
        .dsfsi-logo {text-align: center; margin-bottom: 1rem;}
        .dsfsi-logo img {max-width: 300px; height: auto;}
        .metric-table {font-size: 0.9em;}
        """
    ) as demo:
        gr.HTML("""
        <div class="dsfsi-logo">
            <img src="https://www.dsfsi.co.za/images/logo_transparent_expanded.png" alt="DSFSI Logo" />
        </div>
        <div class="main-header">
            <h1>Northern Sotho-English Linguistic Translation Tool</h1>
            <p style="font-size: 1.1em; color: #666; max-width: 800px; margin: 0 auto;">
                AI-powered translation system for Northern Sotho (Sepedi) to English with detailed linguistic analysis, designed for linguists, researchers, and language documentation projects.
            </p>
        </div>
        """)
        with gr.Tabs():
            with gr.Tab("Translation & Analysis"):
                gr.Markdown("""
                ### Real-time Translation with Linguistic Analysis
                Translate from Northern Sotho (Sepedi) to English and get detailed linguistic insights.
                """)
                with gr.Row():
                    with gr.Column(scale=1):
                        input_text = gr.Textbox(
                            label="Northern Sotho (Sepedi) Input",
                            placeholder="Enter text to translate...",
                            lines=4,
                            max_lines=10
                        )
                        translate_btn = gr.Button("Translate & Analyze", variant="primary", size="lg")
                    with gr.Column(scale=1):
                        output_text = gr.Textbox(
                            label="Translation (English)",
                            lines=4,
                            interactive=False
                        )
                gr.Markdown("### Example Translations")
                gr.Examples(
                    examples=EXAMPLES,
                    inputs=[input_text],
                    label="Click an example to try it:"
                )
                with gr.Accordion("Detailed Linguistic Analysis", open=False):
                    analysis_output = gr.Markdown(label="Analysis Report")
                with gr.Accordion("Metrics Table", open=False):
                    metrics_table = gr.Dataframe(
                        label="Comparative Metrics",
                        headers=["Metric", "Source Text", "Target Text"],
                        interactive=False
                    )
                translate_btn.click(
                    fn=translate_and_analyze,
                    inputs=input_text,
                    outputs=[output_text, analysis_output, metrics_table]
                )
            with gr.Tab("Batch Processing"):
                gr.Markdown("""
                ### Secure Corpus Analysis & Batch Translation
                Upload text or CSV files for batch translation and analysis. Files are processed securely and temporarily.
                """)
                with gr.Row():
                    with gr.Column():
                        file_upload = gr.File(
                            label="Upload File (Max 5MB)",
                            file_types=[".txt", ".csv"],
                            type="filepath",
                            file_count="single"
                        )
                        batch_btn = gr.Button("Process Batch", variant="primary")
                        gr.Markdown("""
                        **Supported formats:**
                        - `.txt` files: One text per line
                        - `.csv` files: Text in first column
                        - **Security limits**: Max 10 entries, 1000 chars per text
                        - **Privacy**: Files are deleted after processing
                        """)
                    with gr.Column():
                        batch_summary = gr.Textbox(
                            label="Processing Summary",
                            lines=3,
                            interactive=False
                        )
                        batch_results = gr.Dataframe(
                            label="Translation Results",
                            interactive=False,
                            wrap=True
                        )
                batch_btn.click(
                    fn=secure_batch_processing,
                    inputs=file_upload,
                    outputs=[batch_summary, batch_results]
                )
            with gr.Tab("Research Tools"):
                gr.Markdown("""
                ### Advanced Linguistic Analysis Tools
                Analyze text for linguistic features.
                """)
                with gr.Row():
                    with gr.Column():
                        research_text = gr.Textbox(
                            label="Text for Analysis",
                            lines=6,
                            placeholder="Enter Northern Sotho (Sepedi) or English text...",
                            max_lines=15
                        )
                        analyze_btn = gr.Button("Analyze Text", variant="primary")
                    with gr.Column():
                        research_output = gr.JSON(
                            label="Detailed Analysis Results"
                        )
                analyze_btn.click(
                    fn=detailed_analysis,
                    inputs=research_text,
                    outputs=research_output
                )
        gr.Markdown("""
        ### About Northern Sotho (Sepedi) Language

        **Northern Sotho (Sepedi)** is a Bantu language spoken by millions of people, primarily in:
        - **South Africa**: Official language

        **Key Linguistic Features:**
        - **Language Family**: Niger-Congo → Bantu → Sotho-Tswana
        - **Script**: Latin alphabet
        - **Characteristics**: Agglutinative, noun-class system
        - **ISO Code**: nso (ISO 639-2/3)
        """)
        gr.Markdown("""
        ---
        ### Model Information & Citation

        **Model Used:** [`dsfsi/nso-en-m2m100-gov`](https://huggingface.co/dsfsi/nso-en-m2m100-gov)

        Based on Meta's M2M100, fine-tuned specifically for Northern Sotho-English by the **Data Science for Social Impact Research Group**.

        **Training Data:** Vuk'uzenzele and ZA-gov-multilingual South African corpora.

        ### Privacy & Security
        - No conversation history stored
        - Uploaded files deleted after processing
        - All processing in isolated temporary environments
        - No user data persistence

        ### Acknowledgments
        We thank **Thapelo Sindani** and **Zion Nia Van Wyk** for their assistance in creating this space.

        ### Citation
        ```bibtex
        @inproceedings{lastrucci-etal-2023-preparing,
            title = "Preparing the Vuk'uzenzele and ZA-gov-multilingual South African multilingual corpora",
            author = "Richard Lastrucci and Isheanesu Dzingirai and Jenalea Rajab and Andani Madodonga and Matimba Shingange and Daniel Njini and Vukosi Marivate",
            booktitle = "Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)",
            pages = "18--25",
            year = "2023"
        }
        ```

        **Links**:
        - [DSFSI](https://www.dsfsi.co.za/)
        - [Model](https://huggingface.co/dsfsi/nso-en-m2m100-gov)
        - [Vuk'uzenzele Data](https://github.com/dsfsi/vukuzenzele-nlp)
        - [ZA-gov Data](https://github.com/dsfsi/gov-za-multilingual)
        - [Research Feedback](https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform)

        ---
        **Built for the African NLP community**
        """)
    return demo
if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )
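# To run outside Hugging Face Spaces (assuming this script is saved as app.py and
# the model weights can be downloaded from the Hub), a setup along these lines
# should be sufficient:
#   pip install gradio transformers torch sentencepiece pandas
#   python app.py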