Commit 532fc72 · Parent(s): 1ea35a4

Add leaderboard caching and fix dataset configurations

- Add caching system to avoid re-evaluating on every page load
- Fix ArabicMMLU: add required subset "All" config
- Replace deprecated ArSenTD-LEV with arbml/ASTD (Egyptian dialect)
- Fix Arabic Sentiment: correct text column to "tweet"
- Fix SANAD: use default config with "Article" column
- Add "Re-evaluate All" button for manual refresh
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Files changed:

- app.py +13 -5
- config.py +11 -11
- leaderboard.py +52 -3
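The caching change is a simple read-through pattern: page load serves a saved JSON file when one exists, and only a cold start (or the explicit button) pays for the full evaluation. A minimal standalone sketch of the idea, with `expensive_evaluation` as a hypothetical stand-in for the real `run_leaderboard_evaluation` (file and field names mirror the diff below):

```python
import json
import os

CACHE_FILE = "leaderboard_cache.json"

def expensive_evaluation() -> dict:
    # Hypothetical stand-in for run_leaderboard_evaluation: slow, network-bound.
    return {"leaderboard_html": "<table>...</table>", "status": "done"}

def get_results() -> dict:
    # Fast path: reuse previously saved results on every page load.
    if os.path.exists(CACHE_FILE):
        try:
            with open(CACHE_FILE, "r", encoding="utf-8") as f:
                return json.load(f)
        except (json.JSONDecodeError, IOError):
            pass  # corrupt or unreadable cache: fall through and rebuild
    # Slow path: first load only; save so later loads skip the work.
    results = expensive_evaluation()
    with open(CACHE_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False)
    return results
```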
app.py CHANGED

@@ -13,7 +13,7 @@ from config import SAMPLE_TEXTS, LEADERBOARD_DATASETS
 from styles import CUSTOM_CSS
 from tokenizer_manager import tokenizer_manager
 from analysis import analyze_single_tokenizer, compare_tokenizers
-from leaderboard import run_leaderboard_evaluation, evaluate_submitted_tokenizer
+from leaderboard import run_leaderboard_evaluation, evaluate_submitted_tokenizer, get_cached_leaderboard
 from ui_components import generate_about_html
 
 
@@ -136,7 +136,9 @@ def create_interface():
             All tokenizers evaluated on **all 8 Arabic datasets** from HuggingFace (~36,000+ samples total).
             """)
 
-            …
+            with gr.Row():
+                status_output = gr.Markdown("⏳ Loading cached results...")
+                re_evaluate_btn = gr.Button("🔄 Re-evaluate All", variant="secondary", size="sm")
 
             gr.Markdown("### 📊 Leaderboard Results")
             leaderboard_output = gr.HTML()
@@ -144,6 +146,12 @@ def create_interface():
             gr.Markdown("### 📈 Per-Dataset Breakdown")
             per_dataset_output = gr.HTML()
 
+            re_evaluate_btn.click(
+                fn=run_leaderboard_evaluation,
+                inputs=[],
+                outputs=[leaderboard_output, per_dataset_output, status_output]
+            )
+
             gr.Markdown("""
             ---
             ### 📖 Evaluation Datasets
@@ -151,7 +159,7 @@ def create_interface():
             | Dataset | Category | Samples |
             |---------|----------|---------|
             | ArabicMMLU | MSA Benchmark | 5,000 |
-            | ArSenTD-LEV | … | … |
+            | ASTD | Egyptian Dialect | 5,000 |
             | ATHAR | Classical Arabic | 5,000 |
             | ARCD | QA Dataset | 1,395 |
             | Ashaar | Poetry | 5,000 |
@@ -287,9 +295,9 @@ def create_interface():
         )
         gr.HTML(about_html)
 
-        # …
+        # Load cached leaderboard results on page load (fast)
         demo.load(
-            fn=run_leaderboard_evaluation,
+            fn=get_cached_leaderboard,
             inputs=[],
             outputs=[leaderboard_output, per_dataset_output, status_output]
         )
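The UI wiring above follows a common Gradio split: `demo.load` fires when the page renders (now bound to the cheap cache read), while the button's `click` event keeps the expensive path available on demand. A runnable sketch of that pattern with placeholder handlers (not the app's real functions):

```python
import gradio as gr

def fast_cached() -> str:
    # Placeholder for get_cached_leaderboard: cheap file read.
    return "<p>cached results</p>"

def slow_recompute() -> str:
    # Placeholder for run_leaderboard_evaluation: full re-evaluation.
    return "<p>fresh results</p>"

with gr.Blocks() as demo:
    status = gr.Markdown("⏳ Loading cached results...")
    re_evaluate_btn = gr.Button("🔄 Re-evaluate All", variant="secondary", size="sm")
    leaderboard_output = gr.HTML()

    # Runs once when the page renders: cheap path.
    demo.load(fn=fast_cached, inputs=[], outputs=[leaderboard_output])
    # Runs only when the user asks for it: expensive path.
    re_evaluate_btn.click(fn=slow_recompute, inputs=[], outputs=[leaderboard_output])

if __name__ == "__main__":
    demo.launch()
```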
config.py CHANGED

@@ -456,19 +456,19 @@ LEADERBOARD_DATASETS = {
         "category": "MSA Benchmark",
         "text_column": "Question",
         "split": "test",
-        "subset": None,
+        "subset": "All",
         "samples": 5000,
         "description": "Multi-task benchmark from Arab school exams"
     },
-    "…": {
-        "hf_id": "…",
-        "name": "…",
-        "category": "…",
-        "text_column": "…",
+    "astd": {
+        "hf_id": "arbml/ASTD",
+        "name": "ASTD (Egyptian)",
+        "category": "Egyptian Dialect",
+        "text_column": "tweet",
         "split": "train",
         "subset": None,
-        "samples": …,
-        "description": "…"
+        "samples": 5000,
+        "description": "Egyptian Arabic sentiment tweets"
     },
     "athar": {
         "hf_id": "mohamed-khalil/ATHAR",
@@ -514,7 +514,7 @@ LEADERBOARD_DATASETS = {
         "hf_id": "arbml/Arabic_Sentiment_Twitter_Corpus",
         "name": "Arabic Sentiment",
         "category": "Social Media",
-        "text_column": "…",
+        "text_column": "tweet",
         "split": "train",
         "subset": None,
         "samples": 5000,
@@ -524,9 +524,9 @@ LEADERBOARD_DATASETS = {
         "hf_id": "arbml/SANAD",
         "name": "SANAD News",
         "category": "News",
-        "text_column": "…",
+        "text_column": "Article",
         "split": "train",
-        "subset": "…",
+        "subset": None,
         "samples": 5000,
         "description": "Arabic news articles"
     },
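Each `LEADERBOARD_DATASETS` entry maps onto a `datasets.load_dataset` call, which is why the `subset` and `text_column` fixes matter: a wrong config name or column raises at load time. A hedged sketch of how such an entry is plausibly consumed (the real loader lives in leaderboard.py's `HFDatasetLoader`, which this diff doesn't show; `load_entry` is hypothetical):

```python
from datasets import load_dataset

def load_entry(cfg: dict) -> list[str]:
    # "subset" is the HuggingFace config name; None selects the default config,
    # which is exactly what the SANAD fix switches to.
    ds = load_dataset(cfg["hf_id"], cfg["subset"], split=cfg["split"])
    # Take up to the configured number of samples from the configured column.
    return ds[cfg["text_column"]][: cfg["samples"]]

# Example with the fixed SANAD entry from above:
sanad = {"hf_id": "arbml/SANAD", "text_column": "Article",
         "split": "train", "subset": None, "samples": 5000}
# texts = load_entry(sanad)  # downloads from HuggingFace on first run
```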
leaderboard.py CHANGED

@@ -20,6 +20,9 @@ from tokenizer_manager import tokenizer_manager
 # File path for persistent storage of submitted tokenizers
 SUBMISSIONS_FILE = os.path.join(os.path.dirname(__file__), "submissions.json")
 
+# File path for cached leaderboard results
+LEADERBOARD_CACHE_FILE = os.path.join(os.path.dirname(__file__), "leaderboard_cache.json")
+
 
 def load_submitted_tokenizers() -> Dict[str, Dict]:
     """Load submitted tokenizers from persistent storage"""
@@ -43,6 +46,31 @@ def save_submitted_tokenizer(model_id: str, data: Dict) -> None:
         print(f"Warning: Could not save submission: {e}")
 
 
+def load_leaderboard_cache() -> Optional[Dict]:
+    """Load cached leaderboard results"""
+    if os.path.exists(LEADERBOARD_CACHE_FILE):
+        try:
+            with open(LEADERBOARD_CACHE_FILE, 'r', encoding='utf-8') as f:
+                return json.load(f)
+        except (json.JSONDecodeError, IOError):
+            return None
+    return None
+
+
+def save_leaderboard_cache(leaderboard_html: str, per_dataset_html: str, status: str) -> None:
+    """Save leaderboard results to cache"""
+    cache_data = {
+        "leaderboard_html": leaderboard_html,
+        "per_dataset_html": per_dataset_html,
+        "status": status
+    }
+    try:
+        with open(LEADERBOARD_CACHE_FILE, 'w', encoding='utf-8') as f:
+            json.dump(cache_data, f, ensure_ascii=False)
+    except IOError as e:
+        print(f"Warning: Could not save leaderboard cache: {e}")
+
+
 class HFDatasetLoader:
     """Load Arabic datasets from HuggingFace"""
 
@@ -159,6 +187,23 @@ def calculate_leaderboard_score(fertility: float, compression: float, unk_ratio:
     return round(score, 1)
 
 
+def get_cached_leaderboard(progress=gr.Progress()) -> Tuple[str, str, str]:
+    """
+    Get leaderboard results from cache if available.
+    If no cache exists, runs evaluation and caches results.
+    Returns: (leaderboard_html, per_dataset_html, status_message)
+    """
+    cache = load_leaderboard_cache()
+    if cache:
+        return (
+            cache.get("leaderboard_html", ""),
+            cache.get("per_dataset_html", ""),
+            cache.get("status", "") + "\n\n📦 *Loaded from cache. Click 'Re-evaluate All' to refresh.*"
+        )
+    # No cache exists, run evaluation (first time only)
+    return run_leaderboard_evaluation(progress)
+
+
 def run_leaderboard_evaluation(
     progress=gr.Progress()
 ) -> Tuple[str, str, str]:
@@ -309,10 +354,14 @@ def run_leaderboard_evaluation(
     # Create HTML tables
     leaderboard_html = generate_leaderboard_html(leaderboard_data)
     per_dataset_html = generate_per_dataset_html(per_dataset_data, selected_datasets)
-    …
+
     status_lines.append(f"\n✅ **Evaluation Complete!** Evaluated {len(results)} tokenizers on {len(loaded_datasets)} datasets.")
-    …
-    …
+    status_message = "\n".join(status_lines)
+
+    # Save results to cache
+    save_leaderboard_cache(leaderboard_html, per_dataset_html, status_message)
+
+    return leaderboard_html, per_dataset_html, status_message
 
 
 def generate_leaderboard_html(data: List[Dict]) -> str:
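Taken together, page loads now serve the cached HTML verbatim, and only the "Re-evaluate All" button re-runs the evaluation and rewrites the cache. The round trip of the two new helpers can be sanity-checked in isolation (hypothetical snippet; it writes leaderboard_cache.json next to leaderboard.py):

```python
from leaderboard import save_leaderboard_cache, load_leaderboard_cache

save_leaderboard_cache("<table>lb</table>", "<table>per-ds</table>", "✅ done")
cache = load_leaderboard_cache()
assert cache is not None
assert cache["leaderboard_html"] == "<table>lb</table>"
assert cache["status"] == "✅ done"
```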