Commit 532fc72 · Parent(s): 1ea35a4

Add leaderboard caching and fix dataset configurations

- Add caching system to avoid re-evaluating on every page load
- Fix ArabicMMLU: add required subset "All" config
- Replace deprecated ArSenTD-LEV with arbml/ASTD (Egyptian dialect)
- Fix Arabic Sentiment: correct text column to "tweet"
- Fix SANAD: use default config with "Article" column
- Add "Re-evaluate All" button for manual refresh
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Files changed:

- app.py +13 -5
- config.py +11 -11
- leaderboard.py +52 -3
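The caching change is a simple read-through pattern: page load serves a saved JSON file when one exists, and only a cold start (or the explicit button) pays for the full evaluation. A minimal standalone sketch of the idea, with `expensive_evaluation` as a hypothetical stand-in for the real `run_leaderboard_evaluation` (file and field names mirror the diff below):

```python
import json
import os

CACHE_FILE = "leaderboard_cache.json"

def expensive_evaluation() -> dict:
    # Hypothetical stand-in for run_leaderboard_evaluation: slow, network-bound.
    return {"leaderboard_html": "<table>...</table>", "status": "done"}

def get_results() -> dict:
    # Fast path: reuse previously saved results on every page load.
    if os.path.exists(CACHE_FILE):
        try:
            with open(CACHE_FILE, "r", encoding="utf-8") as f:
                return json.load(f)
        except (json.JSONDecodeError, IOError):
            pass  # corrupt or unreadable cache: fall through and rebuild
    # Slow path: first load only; save so later loads skip the work.
    results = expensive_evaluation()
    with open(CACHE_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False)
    return results
```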
app.py CHANGED

@@ -13,7 +13,7 @@ from config import SAMPLE_TEXTS, LEADERBOARD_DATASETS
 from styles import CUSTOM_CSS
 from tokenizer_manager import tokenizer_manager
 from analysis import analyze_single_tokenizer, compare_tokenizers
-from leaderboard import run_leaderboard_evaluation, evaluate_submitted_tokenizer
+from leaderboard import run_leaderboard_evaluation, evaluate_submitted_tokenizer, get_cached_leaderboard
 from ui_components import generate_about_html
 
 
@@ -136,7 +136,9 @@ def create_interface():
             All tokenizers evaluated on **all 8 Arabic datasets** from HuggingFace (~36,000+ samples total).
             """)
 
-            …
+            with gr.Row():
+                status_output = gr.Markdown("⏳ Loading cached results...")
+                re_evaluate_btn = gr.Button("🔄 Re-evaluate All", variant="secondary", size="sm")
 
             gr.Markdown("### 📊 Leaderboard Results")
             leaderboard_output = gr.HTML()
@@ -144,6 +146,12 @@ def create_interface():
             gr.Markdown("### 📈 Per-Dataset Breakdown")
             per_dataset_output = gr.HTML()
 
+            re_evaluate_btn.click(
+                fn=run_leaderboard_evaluation,
+                inputs=[],
+                outputs=[leaderboard_output, per_dataset_output, status_output]
+            )
+
             gr.Markdown("""
             ---
             ### 📖 Evaluation Datasets
@@ -151,7 +159,7 @@ def create_interface():
             | Dataset | Category | Samples |
             |---------|----------|---------|
             | ArabicMMLU | MSA Benchmark | 5,000 |
-            | ArSenTD-LEV | … | … |
+            | ASTD | Egyptian Dialect | 5,000 |
             | ATHAR | Classical Arabic | 5,000 |
             | ARCD | QA Dataset | 1,395 |
             | Ashaar | Poetry | 5,000 |
@@ -287,9 +295,9 @@ def create_interface():
         )
         gr.HTML(about_html)
 
-        # …
+        # Load cached leaderboard results on page load (fast)
         demo.load(
-            fn=run_leaderboard_evaluation,
+            fn=get_cached_leaderboard,
             inputs=[],
             outputs=[leaderboard_output, per_dataset_output, status_output]
         )
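The UI wiring above follows a common Gradio split: `demo.load` fires when the page renders (now bound to the cheap cache read), while the button's `click` event keeps the expensive path available on demand. A runnable sketch of that pattern with placeholder handlers (not the app's real functions):

```python
import gradio as gr

def fast_cached() -> str:
    # Placeholder for get_cached_leaderboard: cheap file read.
    return "<p>cached results</p>"

def slow_recompute() -> str:
    # Placeholder for run_leaderboard_evaluation: full re-evaluation.
    return "<p>fresh results</p>"

with gr.Blocks() as demo:
    status = gr.Markdown("⏳ Loading cached results...")
    re_evaluate_btn = gr.Button("🔄 Re-evaluate All", variant="secondary", size="sm")
    leaderboard_output = gr.HTML()

    # Runs once when the page renders: cheap path.
    demo.load(fn=fast_cached, inputs=[], outputs=[leaderboard_output])
    # Runs only when the user asks for it: expensive path.
    re_evaluate_btn.click(fn=slow_recompute, inputs=[], outputs=[leaderboard_output])

if __name__ == "__main__":
    demo.launch()
```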
config.py CHANGED

@@ -456,19 +456,19 @@ LEADERBOARD_DATASETS = {
         "category": "MSA Benchmark",
         "text_column": "Question",
         "split": "test",
-        "subset": None,
+        "subset": "All",
         "samples": 5000,
         "description": "Multi-task benchmark from Arab school exams"
     },
-    "…": {
-        "hf_id": "…",
-        "name": "…",
-        "category": "…",
-        "text_column": "…",
+    "astd": {
+        "hf_id": "arbml/ASTD",
+        "name": "ASTD (Egyptian)",
+        "category": "Egyptian Dialect",
+        "text_column": "tweet",
         "split": "train",
         "subset": None,
-        "samples": …,
-        "description": "…"
+        "samples": 5000,
+        "description": "Egyptian Arabic sentiment tweets"
     },
     "athar": {
         "hf_id": "mohamed-khalil/ATHAR",
@@ -514,7 +514,7 @@ LEADERBOARD_DATASETS = {
         "hf_id": "arbml/Arabic_Sentiment_Twitter_Corpus",
         "name": "Arabic Sentiment",
         "category": "Social Media",
-        "text_column": "…",
+        "text_column": "tweet",
         "split": "train",
         "subset": None,
         "samples": 5000,
@@ -524,9 +524,9 @@ LEADERBOARD_DATASETS = {
         "hf_id": "arbml/SANAD",
         "name": "SANAD News",
         "category": "News",
-        "text_column": "…",
+        "text_column": "Article",
         "split": "train",
-        "subset": "…",
+        "subset": None,
         "samples": 5000,
         "description": "Arabic news articles"
     },
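Each `LEADERBOARD_DATASETS` entry maps onto a `datasets.load_dataset` call, which is why the `subset` and `text_column` fixes matter: a wrong config name or column raises at load time. A hedged sketch of how such an entry is plausibly consumed (the real loader lives in leaderboard.py's `HFDatasetLoader`, which this diff doesn't show; `load_entry` is hypothetical):

```python
from datasets import load_dataset

def load_entry(cfg: dict) -> list[str]:
    # "subset" is the HuggingFace config name; None selects the default config,
    # which is exactly what the SANAD fix switches to.
    ds = load_dataset(cfg["hf_id"], cfg["subset"], split=cfg["split"])
    # Take up to the configured number of samples from the configured column.
    return ds[cfg["text_column"]][: cfg["samples"]]

# Example with the fixed SANAD entry from above:
sanad = {"hf_id": "arbml/SANAD", "text_column": "Article",
         "split": "train", "subset": None, "samples": 5000}
# texts = load_entry(sanad)  # downloads from HuggingFace on first run
```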
leaderboard.py CHANGED

@@ -20,6 +20,9 @@ from tokenizer_manager import tokenizer_manager
 # File path for persistent storage of submitted tokenizers
 SUBMISSIONS_FILE = os.path.join(os.path.dirname(__file__), "submissions.json")
 
+# File path for cached leaderboard results
+LEADERBOARD_CACHE_FILE = os.path.join(os.path.dirname(__file__), "leaderboard_cache.json")
+
 
 def load_submitted_tokenizers() -> Dict[str, Dict]:
     """Load submitted tokenizers from persistent storage"""
@@ -43,6 +46,31 @@ def save_submitted_tokenizer(model_id: str, data: Dict) -> None:
         print(f"Warning: Could not save submission: {e}")
 
 
+def load_leaderboard_cache() -> Optional[Dict]:
+    """Load cached leaderboard results"""
+    if os.path.exists(LEADERBOARD_CACHE_FILE):
+        try:
+            with open(LEADERBOARD_CACHE_FILE, 'r', encoding='utf-8') as f:
+                return json.load(f)
+        except (json.JSONDecodeError, IOError):
+            return None
+    return None
+
+
+def save_leaderboard_cache(leaderboard_html: str, per_dataset_html: str, status: str) -> None:
+    """Save leaderboard results to cache"""
+    cache_data = {
+        "leaderboard_html": leaderboard_html,
+        "per_dataset_html": per_dataset_html,
+        "status": status
+    }
+    try:
+        with open(LEADERBOARD_CACHE_FILE, 'w', encoding='utf-8') as f:
+            json.dump(cache_data, f, ensure_ascii=False)
+    except IOError as e:
+        print(f"Warning: Could not save leaderboard cache: {e}")
+
+
 class HFDatasetLoader:
     """Load Arabic datasets from HuggingFace"""
 
@@ -159,6 +187,23 @@ def calculate_leaderboard_score(fertility: float, compression: float, unk_ratio:
     return round(score, 1)
 
 
+def get_cached_leaderboard(progress=gr.Progress()) -> Tuple[str, str, str]:
+    """
+    Get leaderboard results from cache if available.
+    If no cache exists, runs evaluation and caches results.
+    Returns: (leaderboard_html, per_dataset_html, status_message)
+    """
+    cache = load_leaderboard_cache()
+    if cache:
+        return (
+            cache.get("leaderboard_html", ""),
+            cache.get("per_dataset_html", ""),
+            cache.get("status", "") + "\n\n📦 *Loaded from cache. Click 'Re-evaluate All' to refresh.*"
+        )
+    # No cache exists, run evaluation (first time only)
+    return run_leaderboard_evaluation(progress)
+
+
 def run_leaderboard_evaluation(
     progress=gr.Progress()
 ) -> Tuple[str, str, str]:
@@ -309,10 +354,14 @@ def run_leaderboard_evaluation(
     # Create HTML tables
     leaderboard_html = generate_leaderboard_html(leaderboard_data)
     per_dataset_html = generate_per_dataset_html(per_dataset_data, selected_datasets)
-    …
+
     status_lines.append(f"\n✅ **Evaluation Complete!** Evaluated {len(results)} tokenizers on {len(loaded_datasets)} datasets.")
-    …
-    …
+    status_message = "\n".join(status_lines)
+
+    # Save results to cache
+    save_leaderboard_cache(leaderboard_html, per_dataset_html, status_message)
+
+    return leaderboard_html, per_dataset_html, status_message
 
 
 def generate_leaderboard_html(data: List[Dict]) -> str:
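Taken together, page loads now serve the cached HTML verbatim, and only the "Re-evaluate All" button re-runs the evaluation and rewrites the cache. The round trip of the two new helpers can be sanity-checked in isolation (hypothetical snippet; it writes leaderboard_cache.json next to leaderboard.py):

```python
from leaderboard import save_leaderboard_cache, load_leaderboard_cache

save_leaderboard_cache("<table>lb</table>", "<table>per-ds</table>", "✅ done")
cache = load_leaderboard_cache()
assert cache is not None
assert cache["leaderboard_html"] == "<table>lb</table>"
assert cache["status"] == "✅ done"
```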