Spaces:

McGill-NLP
/

msteb_leaderboard

Running

App Files Files Community

vivekvermaiit committed on Aug 7

Commit

8fc70f8

1 Parent(s): fed47e0

json file in results

Browse files

Files changed (9) hide show

app.py +13 -26
notes.txt +12 -0
src/about.py +1 -1
src/display/utils.py +13 -0
src/result_samples/speech.json +0 -0
src/result_samples/text.json +0 -0
src/submission/submit.py +100 -164
src/submission_samples/model_name_speech.csv +1 -1
src/submission_samples/model_name_text.csv +1 -1

app.py CHANGED Viewed

@@ -25,46 +25,47 @@ from src.display.utils import (
     ModelType,
     fields,
     WeightType,
-    Precision
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import handle_csv_submission
-from pathlib import Path
 text_sample_path = "src/submission_samples/model_name_text.csv"
 speech_sample_path = "src/submission_samples/model_name_speech.csv"
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 ### Space initialisation
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
 except Exception:
     restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
 except Exception:
     restart_space()
-# LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 (
     finished_eval_queue_df,
     running_eval_queue_df,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-def init_leaderboard(dataframe,result_type='text'):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
     column_class = AutoEvalColumn if result_type == "text" else AutoEvalColumnSpeech
@@ -98,19 +99,6 @@ def init_leaderboard(dataframe,result_type='text'):
         interactive=False,
     )
-REGION_MAP = {
-    "All": "All",
-    "Africa": "Africa",
-    "Americas/Oceania": "Americas_Oceania",
-    "Asia (S)": "Asia_S",
-    "Asia (SE)": "Asia_SE",
-    "Asia (W, C)": "Asia_W_C",
-    "Asia (E)": "Asia_E",
-    "Europe (W, N, S)": "Europe_W_N_S",
-    "Europe (E)": "Europe_E",
-}
-REGIONS = ["All", "Africa", "Americas_Oceania", "Asia_S", "Asia_SE", "Asia_W_C", "Asia_E", "Europe_W_N_S", "Europe_E"]
 leaderboard_dataframes = {
     region: get_leaderboard_df(
@@ -121,7 +109,7 @@ leaderboard_dataframes = {
         region if region != "All" else None,
         result_type="text"
     )
-    for region in REGIONS
 }
 leaderboard_dataframes_speech = {
@@ -133,7 +121,7 @@ leaderboard_dataframes_speech = {
         region if region != "All" else None,
         result_type="speech"
     )
-    for region in REGIONS
 }
 # Preload leaderboard blocks
 js_switch_code = """
@@ -164,7 +152,6 @@ js_switch_code = """
 }
 """
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -205,7 +192,7 @@ with demo:
                         elem_id=f"speech-leaderboard-{region_key}",
                         elem_classes=["visible"] if region_key == "All" else []
                 ):
-                    init_leaderboard(leaderboard_dataframes_speech[region_key],result_type='speech')
             speech_region_dropdown.change(
                 None,
@@ -267,4 +254,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()

     ModelType,
     fields,
     WeightType,
+    Precision, REGION_MAP
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import handle_csv_submission
 text_sample_path = "src/submission_samples/model_name_text.csv"
 speech_sample_path = "src/submission_samples/model_name_speech.csv"
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 ### Space initialisation
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
+        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
+        token=TOKEN
     )
 except Exception:
     restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
+        token=TOKEN
     )
 except Exception:
     restart_space()
 (
     finished_eval_queue_df,
     running_eval_queue_df,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+def init_leaderboard(dataframe, result_type='text'):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
     column_class = AutoEvalColumn if result_type == "text" else AutoEvalColumnSpeech
         interactive=False,
     )
 leaderboard_dataframes = {
     region: get_leaderboard_df(
         region if region != "All" else None,
         result_type="text"
     )
+    for region in REGION_MAP.values()
 }
 leaderboard_dataframes_speech = {
         region if region != "All" else None,
         result_type="speech"
     )
+    for region in REGION_MAP.values()
 }
 # Preload leaderboard blocks
 js_switch_code = """
 }
 """
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
                         elem_id=f"speech-leaderboard-{region_key}",
                         elem_classes=["visible"] if region_key == "All" else []
                 ):
+                    init_leaderboard(leaderboard_dataframes_speech[region_key], result_type='speech')
             speech_region_dropdown.change(
                 None,
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
+demo.queue(default_concurrency_limit=40).launch()

notes.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+The submission flow for the project is as follows:
+When a CSV is submitted, we store it in the mSTEB requests dataset, in the folder that matches
+the result type (text or speech).
+In the same flow, the CSV results are then converted into a JSON file and uploaded to the results dataset.
+This lets the leaderboard parse those results and display them.
+Validation checks ensure that the CSV is formatted correctly and that at least one result value is present.

src/about.py CHANGED Viewed

@@ -51,7 +51,7 @@ https://github.com/McGill-NLP/mSTEB
 EVALUATION_QUEUE_TEXT = """
 ## Submit your results
-Please select the csv file and result type to upload your evaluation results for mSTEB.
 Kindly format the results in the same way as provided in the sample csv files below.
 """

 EVALUATION_QUEUE_TEXT = """
 ## Submit your results
+Please provide the model name, csv file and select the appropriate result type to upload your evaluation results for mSTEB.
 Kindly format the results in the same way as provided in the sample csv files below.
 """

src/display/utils.py CHANGED Viewed

@@ -123,3 +123,16 @@ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 SPEECH_BENCHMARK_COLS = [t.value.col_name for t in SpeechTasks]

 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 SPEECH_BENCHMARK_COLS = [t.value.col_name for t in SpeechTasks]
+REGION_MAP = {
+    "All": "All",
+    "Africa": "Africa",
+    "Americas/Oceania": "Americas_Oceania",
+    "Asia (S)": "Asia_S",
+    "Asia (SE)": "Asia_SE",
+    "Asia (W, C)": "Asia_W_C",
+    "Asia (E)": "Asia_E",
+    "Europe (W, N, S)": "Europe_W_N_S",
+    "Europe (E)": "Europe_E",
+}

src/result_samples/speech.json ADDED Viewed

File without changes

src/result_samples/text.json ADDED Viewed

File without changes

src/submission/submit.py CHANGED Viewed

@@ -3,121 +3,14 @@ import os
 import pandas as pd
 from datetime import datetime, timezone
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
-from src.submission.check_validity import (
-    already_submitted_models,
-    check_model_card,
-    get_model_size,
-    is_model_on_hub,
-)
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
-# def add_new_eval(
-#     model: str,
-#     base_model: str,
-#     revision: str,
-#     precision: str,
-#     weight_type: str,
-#     model_type: str,
-# ):
-#     global REQUESTED_MODELS
-#     global USERS_TO_SUBMISSION_DATES
-#     if not REQUESTED_MODELS:
-#         REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
-#
-#     user_name = ""
-#     model_path = model
-#     if "/" in model:
-#         user_name = model.split("/")[0]
-#         model_path = model.split("/")[1]
-#
-#     precision = precision.split(" ")[0]
-#     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-#
-#     if model_type is None or model_type == "":
-#         return styled_error("Please select a model type.")
-#
-#     # Does the model actually exist?
-#     if revision == "":
-#         revision = "main"
-#
-#     # Is the model on the hub?
-#     if weight_type in ["Delta", "Adapter"]:
-#         base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
-#         if not base_model_on_hub:
-#             return styled_error(f'Base model "{base_model}" {error}')
-#
-#     if not weight_type == "Adapter":
-#         model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
-#         if not model_on_hub:
-#             return styled_error(f'Model "{model}" {error}')
-#
-#     # Is the model info correctly filled?
-#     try:
-#         model_info = API.model_info(repo_id=model, revision=revision)
-#     except Exception:
-#         return styled_error("Could not get your model information. Please fill it up properly.")
-#
-#     model_size = get_model_size(model_info=model_info, precision=precision)
-#
-#     # Were the model card and license filled?
-#     try:
-#         license = model_info.cardData["license"]
-#     except Exception:
-#         return styled_error("Please select a license for your model")
-#
-#     modelcard_OK, error_msg = check_model_card(model)
-#     if not modelcard_OK:
-#         return styled_error(error_msg)
-#
-#     # Seems good, creating the eval
-#     print("Adding new eval")
-#
-#     eval_entry = {
-#         "model": model,
-#         "base_model": base_model,
-#         "revision": revision,
-#         "precision": precision,
-#         "weight_type": weight_type,
-#         "status": "PENDING",
-#         "submitted_time": current_time,
-#         "model_type": model_type,
-#         "likes": model_info.likes,
-#         "params": model_size,
-#         "license": license,
-#         "private": False,
-#     }
-#
-#     # Check for duplicate submission
-#     if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
-#         return styled_warning("This model has been already submitted.")
-#
-#     print("Creating eval file")
-#     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-#     os.makedirs(OUT_DIR, exist_ok=True)
-#     out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
-#
-#     with open(out_path, "w") as f:
-#         f.write(json.dumps(eval_entry))
-#
-#     print("Uploading eval file")
-#     API.upload_file(
-#         path_or_fileobj=out_path,
-#         path_in_repo=out_path.split("eval-queue/")[1],
-#         repo_id=QUEUE_REPO,
-#         repo_type="dataset",
-#         commit_message=f"Add {model} to eval queue",
-#     )
-#
-#     # Remove the local file
-#     os.remove(out_path)
-#
-#     return styled_message(
-#         "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
-#     )
 def handle_csv_submission(
     model_name: str,
@@ -129,11 +22,7 @@ def handle_csv_submission(
     if csv_file is None:
         return styled_error("Please provide a CSV file with results.")
-    # print("Handling CSV submission...")
-    # print(type(csv_file))
-    # print(csv_file)
     df = pd.read_csv(csv_file)
-    # print(df)
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
@@ -142,7 +31,7 @@ def handle_csv_submission(
     os.makedirs(subdir, exist_ok=True)
     filename = f"{current_time}_{model_name}_{result_type}_results.csv"
-    remote_path = f"{result_type}_requests/{filename}"
     csv_save_path = os.path.join(subdir,filename)
     df.to_csv(csv_save_path, index=False)
@@ -156,52 +45,99 @@ def handle_csv_submission(
         commit_message=f"Add {result_type} request for {model_name} at {current_time}",
     )
-    #
-    # with open(csv_save_path, "wb") as f:
-    #     f.write(csv_file.read())
-    # Process CSV → JSON
-    # try:
-    #     df = pd.read_csv(csv_save_path)
-    # except Exception as e:
-    #     return styled_error(f"Error reading CSV: {e}")
-    # print(df)
-    return "done"
-        # Expecting columns: benchmark, metric, score
-    # if not all(col in df.columns for col in ["benchmark", "metric", "score"]):
-    #     return styled_error("CSV must contain columns: benchmark, metric, score")
-    #
-    # result_dict = {
-    #     "model": model_name,
-    #     "status": "PENDING",
-    #     "submitted_time": current_time,
-    #     "private": False,
-    #     "results": {},
-    # }
-    #
-    # for _, row in df.iterrows():
-    #     task = row["benchmark"]
-    #     metric = row["metric"]
-    #     score = row["score"]
-    #     if task not in result_dict["results"]:
-    #         result_dict["results"][task] = {}
-    #     result_dict["results"][task][metric] = score
-    #
-    # # Save JSON
-    # json_path = os.path.join(subdir, f"{model_path}_eval_request.json")
-    # with open(json_path, "w") as f:
-    #     json.dump(result_dict, f, indent=2)
-    #
-    # # Upload to HF Hub (if needed)
-    # if QUEUE_REPO:
-    #     API.upload_file(
-    #         path_or_fileobj=json_path,
-    #         path_in_repo=json_path.split("eval-queue/")[1],
-    #         repo_id=QUEUE_REPO,
-    #         repo_type="dataset",
-    #         commit_message=f"Add {model_name} results for {result_type}",
-    #     )
-    #     os.remove(json_path)
-    #
-    # return styled_message(f"Results CSV successfully submitted and processed for `{model_name}`!")

 import pandas as pd
 from datetime import datetime, timezone
+from src.about import Tasks, SpeechTasks
 from src.display.formatting import styled_error, styled_message, styled_warning
+from src.display.utils import REGION_MAP
+from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, RESULTS_REPO, EVAL_RESULTS_PATH
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 def handle_csv_submission(
     model_name: str,
     if csv_file is None:
         return styled_error("Please provide a CSV file with results.")
     df = pd.read_csv(csv_file)
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
     os.makedirs(subdir, exist_ok=True)
     filename = f"{current_time}_{model_name}_{result_type}_results.csv"
+    remote_path = f"msteb_{result_type}_requests/{filename}"
     csv_save_path = os.path.join(subdir,filename)
     df.to_csv(csv_save_path, index=False)
         commit_message=f"Add {result_type} request for {model_name} at {current_time}",
     )
+    # Remove the local file
+    os.remove(csv_save_path)
+    # this converts dataframe to json and uploads it to results
+    try:
+        convert_csv_to_json_and_upload(df, model_name, result_type)
+    except ValueError as e:
+        return styled_error(f"{str(e)}")
+    return styled_message(f"Results CSV successfully submitted for `{model_name}`!")
def find_task_by_col_name(col_name, enum_cls):
    """Return the first member of *enum_cls* whose ``value.col_name`` matches.

    Args:
        col_name: Column header to look up (compared with ``==``).
        enum_cls: Iterable of members, each exposing ``.value.col_name``
            (the callers in this module pass a task enum class).

    Returns:
        The first matching member in iteration order, or ``None`` when no
        member has that column name.
    """
    return next(
        (task for task in enum_cls if task.value.col_name == col_name),
        None,
    )
def _extract_scores(row, tasks_by_col):
    """Collect the numeric task scores found in one CSV row.

    Args:
        row: A row as yielded by ``DataFrame.iterrows()``.
        tasks_by_col: Mapping from CSV column name to the task enum member
            that owns that column.

    Returns:
        ``{benchmark: {metric: value / 100}}`` for every non-``Region`` cell
        holding a real number. Empty, NaN and non-numeric cells are skipped.
    """
    scores = {}
    for col, val in row.items():
        if col == "Region":
            continue
        # bool is a subclass of int; a stray True/False cell is not a score.
        if isinstance(val, bool) or not isinstance(val, (int, float)):
            continue
        if pd.isna(val):
            continue
        task = tasks_by_col[col]
        # Stored as value / 100 (presumably percentage -> fraction; confirm
        # against what the leaderboard reader expects).
        scores[task.value.benchmark] = {task.value.metric: val / 100}
    return scores


def convert_csv_to_json_and_upload(df: pd.DataFrame, model_name: str, result_type: str):
    """Validate a results table, convert it to JSON and upload it to RESULTS_REPO.

    The table must have a ``Region`` column plus task columns named after the
    ``col_name`` of ``Tasks`` (when ``result_type == "text"``) or
    ``SpeechTasks`` (any other ``result_type``). The ``Average (Micro)`` row
    fills the top-level ``results`` entry; every other row fills
    ``regions[REGION_MAP[<Region>]]``.

    Args:
        df: Parsed submission CSV.
        model_name: Name stored under ``config.model_name`` and used in the
            uploaded file name.
        result_type: ``"text"`` selects ``Tasks``; anything else selects
            ``SpeechTasks``. Also used in the local and remote paths.

    Returns:
        A short status string mentioning ``RESULTS_REPO`` and the timestamp.

    Raises:
        ValueError: If the ``Region`` column is missing, an unknown task
            column is present, the ``Average (Micro)`` row is missing, a
            region is not a key of ``REGION_MAP``, or the ``Average (Micro)``
            row contains no numeric value.
    """
    task_enum = Tasks if result_type == "text" else SpeechTasks
    # Build the column -> task lookup once; setdefault keeps the first member
    # for a given column name, matching a first-match linear search.
    tasks_by_col = {}
    for task in task_enum:
        tasks_by_col.setdefault(task.value.col_name, task)
    average_row = "Average (Micro)"

    # --- Validation ---
    # Raise ValueError (not KeyError) so the caller's error handling can turn
    # a malformed CSV into a user-facing message.
    if "Region" not in df.columns:
        raise ValueError("Missing 'Region' column in CSV.")

    # Validate every non-Region column regardless of where Region sits.
    extra = {col for col in df.columns if col != "Region"} - set(tasks_by_col)
    if extra:
        raise ValueError(f"Extra columns in CSV: {extra}")

    region_names = df["Region"].tolist()
    if average_row not in region_names:
        raise ValueError("Missing row for 'Average (Micro)'")

    for region in region_names:
        if region != average_row and region not in REGION_MAP:
            raise ValueError(f"Region '{region}' not found in REGION_MAP keys.")

    # --- Build JSON ---
    model_json = {
        "config": {"model_name": model_name},
        "results": {},
        "regions": {},
    }

    # Only the "Average (Micro)" row counts towards the "has results" check.
    at_least_one_number = False
    for _, row in df.iterrows():
        region_display = row["Region"]
        scores = _extract_scores(row, tasks_by_col)
        if region_display == average_row:
            model_json["results"].update(scores)
            if scores:
                at_least_one_number = True
        else:
            # A region row always gets an entry, even when it has no scores.
            model_json["regions"][REGION_MAP[region_display]] = scores

    if not at_least_one_number:
        raise ValueError("No valid numeric results found in the CSV. Please check your input.")

    # --- Save locally ---
    subdir = os.path.join(EVAL_RESULTS_PATH, result_type)
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    filename = f"{current_time}_{model_name}_{result_type}.json"
    json_save_path = os.path.join(subdir, filename)
    # model_name may contain "/" (e.g. "org/model"), which nests the file one
    # directory deeper than subdir; create whatever parent is actually needed.
    os.makedirs(os.path.dirname(json_save_path), exist_ok=True)

    remote_path = f"msteb_leaderboard/msteb_{result_type}_results/{filename}"
    try:
        with open(json_save_path, "w") as f:
            json.dump(model_json, f, indent=2)

        # --- Upload to HF Hub ---
        API.upload_file(
            path_or_fileobj=json_save_path,
            path_in_repo=remote_path,
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            commit_message=f"Upload results for {model_name} ({result_type}) at {current_time}",
        )
    finally:
        # Never leave the temporary JSON behind, even if the upload fails.
        if os.path.exists(json_save_path):
            os.remove(json_save_path)

    print(f"Uploaded to {RESULTS_REPO}/{current_time}")
    return f"Uploaded to {RESULTS_REPO}/{current_time}"

src/submission_samples/model_name_speech.csv CHANGED Viewed

@@ -3,7 +3,7 @@ Africa,,,,,
 Americas/Oceania,,,,,
 Asia (S),,,,,
 Asia (SE),,,,,
-"Asia (W,C)",,,,,
 Asia (E),,,,,
 "Europe (W, N, S)",,,,,
 Europe (E),,,,,

 Americas/Oceania,,,,,
 Asia (S),,,,,
 Asia (SE),,,,,
+"Asia (W, C)",,,,,
 Asia (E),,,,,
 "Europe (W, N, S)",,,,,
 Europe (E),,,,,

src/submission_samples/model_name_text.csv CHANGED Viewed

@@ -3,7 +3,7 @@ Africa,,,,,,
 Americas/Oceania,,,,,,
 Asia (S),,,,,,
 Asia (SE),,,,,,
-"Asia (W,C)",,,,,,
 Asia (E),,,,,,
 "Europe (W, N, S)",,,,,,
 Europe (E),,,,,,

 Americas/Oceania,,,,,,
 Asia (S),,,,,,
 Asia (SE),,,,,,
+"Asia (W, C)",,,,,,
 Asia (E),,,,,,
 "Europe (W, N, S)",,,,,,
 Europe (E),,,,,,