"""
Arabic Tokenizer Arena Pro - Main Application
==============================================
Advanced research & production platform for Arabic tokenization analysis
Run with: python app.py
"""

import gradio as gr

# Import modules
from config import SAMPLE_TEXTS, LEADERBOARD_DATASETS
from styles import CUSTOM_CSS
from tokenizer_manager import tokenizer_manager
from analysis import analyze_single_tokenizer, compare_tokenizers
from leaderboard import (
    run_leaderboard_evaluation,
    evaluate_submitted_tokenizer,
    get_cached_leaderboard,
)
from ui_components import generate_about_html


def create_interface():
    """Create and return the Gradio Blocks interface.

    Builds six tabs — single-tokenizer analysis, side-by-side comparison,
    leaderboard, metrics guide, tokenizer submission, and about — wires
    each to its callback from ``analysis`` / ``leaderboard``, and registers
    a page-load hook that fills the leaderboard from cached results.

    Returns:
        gr.Blocks: the fully wired demo, ready for ``.launch()``.
    """
    available_tokenizers = tokenizer_manager.get_tokenizer_choices()
    tokenizers_by_type = tokenizer_manager.get_tokenizers_by_type()

    with gr.Blocks(
        css=CUSTOM_CSS,
        title="Arabic Tokenizer Arena Pro",
        theme=gr.themes.Base(
            primary_hue="green",
            secondary_hue="blue",
            neutral_hue="slate",
            font=["IBM Plex Sans Arabic", "system-ui", "sans-serif"],
        ),
    ) as demo:
        # Header
        # NOTE(review): the header markup below appears to have lost its HTML
        # tags in this copy of the file — confirm against the original source.
        gr.HTML("""
 
🏟️ Arabic Tokenizer Arena Pro
 
Advanced research & production platform for Arabic tokenization analysis
 
""")

        with gr.Tabs():
            # ===== TAB 1: Single Tokenizer Analysis =====
            with gr.TabItem("🔬 Single Analysis", id="single"):
                with gr.Row():
                    with gr.Column(scale=1):
                        tokenizer_dropdown = gr.Dropdown(
                            choices=available_tokenizers,
                            value=available_tokenizers[0] if available_tokenizers else None,
                            label="Select Tokenizer",
                            info="Choose a tokenizer to analyze",
                        )
                        sample_dropdown = gr.Dropdown(
                            choices=list(SAMPLE_TEXTS.keys()),
                            label="Sample Texts",
                            info="Select a sample or enter custom text",
                        )
                        input_text = gr.Textbox(
                            lines=4,
                            placeholder="اكتب النص العربي هنا...\nEnter Arabic text here...",
                            label="Input Text",
                            rtl=True,
                        )
                        analyze_btn = gr.Button("🔍 Analyze", variant="primary", size="lg")

                    with gr.Column(scale=2):
                        info_output = gr.HTML(label="Tokenizer Information")
                        metrics_output = gr.HTML(label="Evaluation Metrics")
                        tokens_output = gr.HTML(label="Token Visualization")
                        decoded_output = gr.HTML(label="Decoded Output")

                # Selecting a sample fills the input box with its text.
                sample_dropdown.change(
                    lambda x: SAMPLE_TEXTS.get(x, ""),
                    inputs=[sample_dropdown],
                    outputs=[input_text],
                )
                analyze_btn.click(
                    analyze_single_tokenizer,
                    inputs=[tokenizer_dropdown, input_text],
                    outputs=[info_output, metrics_output, tokens_output, decoded_output],
                )

            # ===== TAB 2: Comparison Mode =====
            with gr.TabItem("⚖️ Compare Tokenizers", id="compare"):
                with gr.Row():
                    with gr.Column(scale=1):
                        compare_tokenizers_select = gr.CheckboxGroup(
                            choices=available_tokenizers,
                            # Slicing is safe on lists shorter than 5, so no
                            # explicit length check is needed (same behavior).
                            value=available_tokenizers[:5],
                            label="Select Tokenizers to Compare",
                            info="Choose 2 or more tokenizers",
                        )
                        compare_sample = gr.Dropdown(
                            choices=list(SAMPLE_TEXTS.keys()),
                            label="Sample Texts",
                        )
                        compare_text = gr.Textbox(
                            lines=4,
                            placeholder="اكتب النص العربي هنا...",
                            label="Input Text",
                            rtl=True,
                        )
                        compare_btn = gr.Button("⚖️ Compare", variant="primary", size="lg")

                    with gr.Column(scale=2):
                        comparison_output = gr.HTML(label="Comparison Results")

                # Same sample-to-textbox behavior as the single-analysis tab.
                compare_sample.change(
                    lambda x: SAMPLE_TEXTS.get(x, ""),
                    inputs=[compare_sample],
                    outputs=[compare_text],
                )
                compare_btn.click(
                    compare_tokenizers,
                    inputs=[compare_tokenizers_select, compare_text],
                    outputs=[comparison_output],
                )

            # ===== TAB 3: LEADERBOARD =====
            with gr.TabItem("🏆 Leaderboard", id="leaderboard"):
                gr.Markdown("""
## 🏆 Arabic Tokenizer Leaderboard

All tokenizers evaluated on **all 8 Arabic datasets** from HuggingFace (~36,000+ samples total).
""")

                with gr.Row():
                    status_output = gr.Markdown("⏳ Loading cached results...")
                    re_evaluate_btn = gr.Button("🔄 Re-evaluate All", variant="secondary", size="sm")

                gr.Markdown("### 📊 Leaderboard Results")
                leaderboard_output = gr.HTML()

                gr.Markdown("### 📈 Per-Dataset Breakdown")
                per_dataset_output = gr.HTML()

                # Full re-evaluation is slow; the page-load hook below serves
                # cached results instead, so this button is opt-in.
                re_evaluate_btn.click(
                    fn=run_leaderboard_evaluation,
                    inputs=[],
                    outputs=[leaderboard_output, per_dataset_output, status_output],
                )

                gr.Markdown("""
---
### 📖 Evaluation Datasets

| Dataset | Category | Samples |
|---------|----------|---------|
| ArabicMMLU | MSA Benchmark | 5,000 |
| ASTD | Egyptian Dialect | 5,000 |
| ATHAR | Classical Arabic | 5,000 |
| ARCD | QA Dataset | 1,395 |
| Ashaar | Poetry | 5,000 |
| Hadith | Religious | 5,000 |
| Arabic Sentiment | Social Media | 5,000 |
| SANAD | News | 5,000 |
""")

            # ===== TAB 4: Metrics Reference =====
            with gr.TabItem("📖 Metrics Guide", id="guide"):
                gr.Markdown("""
## Tokenization Evaluation Metrics Guide

### Efficiency Metrics

| Metric | Description | Ideal Value | Why It Matters |
|--------|-------------|-------------|----------------|
| **Fertility** | Tokens per word | 1.0 | Lower fertility = fewer tokens = faster inference & lower cost |
| **Compression Ratio** | Bytes per token | Higher is better | Better compression = more efficient encoding |
| **Chars/Token** | Characters per token | Higher is better | More characters per token = better vocabulary utilization |

### Coverage Metrics

| Metric | Description | Ideal Value | Why It Matters |
|--------|-------------|-------------|----------------|
| **OOV Rate** | Out-of-vocabulary percentage | 0% | Lower OOV = better vocabulary coverage |
| **STRR** | Single Token Retention Rate | Higher is better | More words preserved as single tokens = better semantic boundaries |
| **Continued Words Ratio** | Words split into multiple tokens | Lower is better | Fewer splits = better word boundary preservation |

### Arabic-Specific Metrics

| Metric | Description | Why It Matters |
|--------|-------------|----------------|
| **Arabic Fertility** | Tokens per Arabic word | Arabic-specific efficiency measure |
| **Diacritic Preservation** | Whether tashkeel is preserved | Important for religious & educational texts |

### Scoring Formula (Leaderboard)

```
Score = (Fertility Score × 0.45) + (Compression Score × 0.35) + (UNK Score × 0.20) × 100
```

Where:
- **Fertility Score** = 2.0 / fertility (capped 0-1, inverted - lower fertility = higher score)
- **Compression Score** = compression / 6 (capped 0-1)
- **UNK Score** = 1 - (unk_ratio × 20) (capped 0-1, inverted)

### Research Background

These metrics are based on recent research including:
- *"A Comprehensive Analysis of Various Tokenizers for Arabic LLMs"* (2024)
- *"Evaluating Various Tokenizers for Arabic Text Classification"* (Alyafeai et al.)
- *"Beyond Fertility: STRR as a Metric for Multilingual Tokenization"* (2025)
- *"Arabic Stable LM: Adapting Stable LM to Arabic"* (2024)
""")

            # ===== TAB 5: Submit Tokenizer =====
            with gr.TabItem("🚀 Submit", id="submit"):
                gr.Markdown("""
## 🚀 Submit Your Tokenizer

Evaluate any HuggingFace tokenizer on **all 8 Arabic datasets** and see how it compares.
""")

                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Model Information")
                        submit_model_id = gr.Textbox(
                            label="HuggingFace Model ID *",
                            placeholder="e.g., google/gemma-2-9b",
                            info="The model ID from HuggingFace Hub",
                        )
                        submit_model_name = gr.Textbox(
                            label="Display Name (optional)",
                            placeholder="e.g., My Custom Tokenizer",
                            info="Leave empty to use model name",
                        )
                        submit_organization = gr.Textbox(
                            label="Organization (optional)",
                            placeholder="e.g., My Organization",
                            info="Leave empty to auto-detect",
                        )
                        submit_model_type = gr.Dropdown(
                            choices=[
                                "Arabic LLM",
                                "Arabic BERT",
                                "Arabic Tokenizer",
                                "Multilingual LLM",
                                "Custom",
                            ],
                            value="Custom",
                            label="Model Type",
                        )
                        submit_btn = gr.Button("🚀 Evaluate Tokenizer", variant="primary", size="lg")
                        submit_status = gr.Markdown("")

                    with gr.Column(scale=2):
                        gr.Markdown("### Evaluation Results")
                        submit_results = gr.HTML()

                submit_btn.click(
                    fn=evaluate_submitted_tokenizer,
                    inputs=[submit_model_id, submit_model_name, submit_organization, submit_model_type],
                    outputs=[submit_results, submit_status],
                )

                gr.Markdown("""
---
### 📋 Submission Guidelines

- **Model ID**: Must be a valid HuggingFace model ID (e.g., `organization/model-name`)
- **Tokenizer**: The model must have a tokenizer that can be loaded with `AutoTokenizer`
- **Public Models**: Only public models on HuggingFace Hub are supported
- **Evaluation**: Your tokenizer will be evaluated on all 8 Arabic datasets (~36,000+ samples)

### 💡 Tips

- Lower fertility scores indicate better Arabic tokenization efficiency
- Compare your results with the leaderboard to see how your tokenizer ranks
""")

            # ===== TAB 6: About =====
            with gr.TabItem("ℹ️ About", id="about"):
                about_html = generate_about_html(
                    tokenizers_by_type,
                    len(available_tokenizers),
                )
                gr.HTML(about_html)

        # Load cached leaderboard results on page load (fast)
        demo.load(
            fn=get_cached_leaderboard,
            inputs=[],
            outputs=[leaderboard_output, per_dataset_output, status_output],
        )

    return demo


# ============================================================================
# MAIN
# ============================================================================

if __name__ == "__main__":
    demo = create_interface()
    demo.launch()