Commit 532fc72 · Parent: 1ea35a4
Committed by HeshamHaroon and Claude

Add leaderboard caching and fix dataset configurations


- Add caching system to avoid re-evaluating on every page load
- Fix ArabicMMLU: add required subset "All" config
- Replace deprecated ArSenTD-LEV with arbml/ASTD (Egyptian dialect)
- Fix Arabic Sentiment: correct text column to "tweet"
- Fix SANAD: use default config with "Article" column
- Add "Re-evaluate All" button for manual refresh

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (3)

1. app.py  +13 -5
2. config.py  +11 -11
3. leaderboard.py  +52 -3
app.py CHANGED

```diff
@@ -13,7 +13,7 @@ from config import SAMPLE_TEXTS, LEADERBOARD_DATASETS
 from styles import CUSTOM_CSS
 from tokenizer_manager import tokenizer_manager
 from analysis import analyze_single_tokenizer, compare_tokenizers
-from leaderboard import run_leaderboard_evaluation, evaluate_submitted_tokenizer
+from leaderboard import run_leaderboard_evaluation, evaluate_submitted_tokenizer, get_cached_leaderboard
 from ui_components import generate_about_html
 
 
@@ -136,7 +136,9 @@ def create_interface():
     All tokenizers evaluated on **all 8 Arabic datasets** from HuggingFace (~36,000+ samples total).
     """)
 
-    status_output = gr.Markdown("⏳ Loading evaluation...")
+    with gr.Row():
+        status_output = gr.Markdown("⏳ Loading cached results...")
+        re_evaluate_btn = gr.Button("🔄 Re-evaluate All", variant="secondary", size="sm")
 
     gr.Markdown("### 📊 Leaderboard Results")
     leaderboard_output = gr.HTML()
@@ -144,6 +146,12 @@ def create_interface():
     gr.Markdown("### 📈 Per-Dataset Breakdown")
     per_dataset_output = gr.HTML()
 
+    re_evaluate_btn.click(
+        fn=run_leaderboard_evaluation,
+        inputs=[],
+        outputs=[leaderboard_output, per_dataset_output, status_output]
+    )
+
     gr.Markdown("""
     ---
     ### 📖 Evaluation Datasets
@@ -151,7 +159,7 @@ def create_interface():
     | Dataset | Category | Samples |
     |---------|----------|---------|
     | ArabicMMLU | MSA Benchmark | 5,000 |
-    | ArSenTD-LEV | Levantine Dialect | 4,000 |
+    | ASTD | Egyptian Dialect | 5,000 |
     | ATHAR | Classical Arabic | 5,000 |
     | ARCD | QA Dataset | 1,395 |
     | Ashaar | Poetry | 5,000 |
@@ -287,9 +295,9 @@ def create_interface():
     )
     gr.HTML(about_html)
 
-    # Auto-run leaderboard evaluation on load
+    # Load cached leaderboard results on page load (fast)
     demo.load(
-        fn=run_leaderboard_evaluation,
+        fn=get_cached_leaderboard,
         inputs=[],
         outputs=[leaderboard_output, per_dataset_output, status_output]
     )
```
config.py CHANGED

```diff
@@ -456,19 +456,19 @@ LEADERBOARD_DATASETS = {
         "category": "MSA Benchmark",
         "text_column": "Question",
         "split": "test",
-        "subset": None,
+        "subset": "All",
         "samples": 5000,
         "description": "Multi-task benchmark from Arab school exams"
     },
-    "arsentd_lev": {
-        "hf_id": "ramybaly/arsentd_lev",
-        "name": "ArSenTD-LEV",
-        "category": "Levantine Dialect",
-        "text_column": "Tweet",
+    "astd": {
+        "hf_id": "arbml/ASTD",
+        "name": "ASTD (Egyptian)",
+        "category": "Egyptian Dialect",
+        "text_column": "tweet",
         "split": "train",
         "subset": None,
-        "samples": 4000,
-        "description": "Levantine Arabic tweets"
+        "samples": 5000,
+        "description": "Egyptian Arabic sentiment tweets"
     },
     "athar": {
         "hf_id": "mohamed-khalil/ATHAR",
@@ -514,7 +514,7 @@ LEADERBOARD_DATASETS = {
         "hf_id": "arbml/Arabic_Sentiment_Twitter_Corpus",
         "name": "Arabic Sentiment",
         "category": "Social Media",
-        "text_column": "text",
+        "text_column": "tweet",
         "split": "train",
         "subset": None,
         "samples": 5000,
@@ -524,9 +524,9 @@ LEADERBOARD_DATASETS = {
         "hf_id": "arbml/SANAD",
         "name": "SANAD News",
         "category": "News",
-        "text_column": "text",
+        "text_column": "Article",
         "split": "train",
-        "subset": "alarabiya",
+        "subset": None,
         "samples": 5000,
         "description": "Arabic news articles"
     },
```
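For context on why these fields matter, here is a minimal sketch of how an entry is presumably consumed when sampling evaluation text. The actual loader (HFDatasetLoader in leaderboard.py) is not shown in this diff, so `load_texts` is a hypothetical stand-in:

```python
# Hypothetical sketch of how a LEADERBOARD_DATASETS entry might be consumed;
# the real loader (HFDatasetLoader) is not part of this diff.
from datasets import load_dataset

def load_texts(cfg: dict) -> list:
    # "subset" is passed through as the HF config name. ArabicMMLU only
    # publishes named configs, which is why "All" is now required.
    ds = load_dataset(cfg["hf_id"], cfg["subset"], split=cfg["split"])
    n = min(cfg["samples"], len(ds))
    # A wrong "text_column" (e.g. "text" where the dataset actually uses
    # "tweet" or "Article") fails here with a KeyError.
    return [row[cfg["text_column"]] for row in ds.select(range(n))]
```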
leaderboard.py CHANGED

```diff
@@ -20,6 +20,9 @@ from tokenizer_manager import tokenizer_manager
 # File path for persistent storage of submitted tokenizers
 SUBMISSIONS_FILE = os.path.join(os.path.dirname(__file__), "submissions.json")
 
+# File path for cached leaderboard results
+LEADERBOARD_CACHE_FILE = os.path.join(os.path.dirname(__file__), "leaderboard_cache.json")
+
 
 def load_submitted_tokenizers() -> Dict[str, Dict]:
     """Load submitted tokenizers from persistent storage"""
@@ -43,6 +46,31 @@ def save_submitted_tokenizer(model_id: str, data: Dict) -> None:
         print(f"Warning: Could not save submission: {e}")
 
 
+def load_leaderboard_cache() -> Optional[Dict]:
+    """Load cached leaderboard results"""
+    if os.path.exists(LEADERBOARD_CACHE_FILE):
+        try:
+            with open(LEADERBOARD_CACHE_FILE, 'r', encoding='utf-8') as f:
+                return json.load(f)
+        except (json.JSONDecodeError, IOError):
+            return None
+    return None
+
+
+def save_leaderboard_cache(leaderboard_html: str, per_dataset_html: str, status: str) -> None:
+    """Save leaderboard results to cache"""
+    cache_data = {
+        "leaderboard_html": leaderboard_html,
+        "per_dataset_html": per_dataset_html,
+        "status": status
+    }
+    try:
+        with open(LEADERBOARD_CACHE_FILE, 'w', encoding='utf-8') as f:
+            json.dump(cache_data, f, ensure_ascii=False)
+    except IOError as e:
+        print(f"Warning: Could not save leaderboard cache: {e}")
+
+
 class HFDatasetLoader:
     """Load Arabic datasets from HuggingFace"""
 
@@ -159,6 +187,23 @@ def calculate_leaderboard_score(fertility: float, compression: float, unk_ratio:
     return round(score, 1)
 
 
+def get_cached_leaderboard(progress=gr.Progress()) -> Tuple[str, str, str]:
+    """
+    Get leaderboard results from cache if available.
+    If no cache exists, runs evaluation and caches results.
+    Returns: (leaderboard_html, per_dataset_html, status_message)
+    """
+    cache = load_leaderboard_cache()
+    if cache:
+        return (
+            cache.get("leaderboard_html", ""),
+            cache.get("per_dataset_html", ""),
+            cache.get("status", "") + "\n\n📦 *Loaded from cache. Click 'Re-evaluate All' to refresh.*"
+        )
+    # No cache exists, run evaluation (first time only)
+    return run_leaderboard_evaluation(progress)
+
+
 def run_leaderboard_evaluation(
     progress=gr.Progress()
 ) -> Tuple[str, str, str]:
@@ -309,10 +354,14 @@ def run_leaderboard_evaluation(
     # Create HTML tables
     leaderboard_html = generate_leaderboard_html(leaderboard_data)
     per_dataset_html = generate_per_dataset_html(per_dataset_data, selected_datasets)
-
+
     status_lines.append(f"\n✅ **Evaluation Complete!** Evaluated {len(results)} tokenizers on {len(loaded_datasets)} datasets.")
-
-    return leaderboard_html, per_dataset_html, "\n".join(status_lines)
+    status_message = "\n".join(status_lines)
+
+    # Save results to cache
+    save_leaderboard_cache(leaderboard_html, per_dataset_html, status_message)
+
+    return leaderboard_html, per_dataset_html, status_message
 
 
 def generate_leaderboard_html(data: List[Dict]) -> str:
```
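As a quick sanity check, the new cache helpers round-trip like this (illustrative usage only, not part of the commit):

```python
# Illustrative round-trip of the new cache helpers (not part of the commit).
from leaderboard import save_leaderboard_cache, load_leaderboard_cache

save_leaderboard_cache("<table>…</table>", "<table>…</table>", "✅ done")
cache = load_leaderboard_cache()
assert cache is not None
assert cache["status"] == "✅ done"
```

Note that the cache stores the rendered HTML with no timestamp or dataset-config fingerprint, so stale results persist until "Re-evaluate All" (or a first uncached page load) overwrites the file.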