Spaces: Running on CPU Upgrade
eval_agent #51
by huangshouxi - opened

app.py CHANGED
@@ -35,7 +35,7 @@ ref_level_len = {"validation": {1: 53, 2: 86, 3: 26}, "test": {1: 93, 2: 159, 3:
 os.makedirs("scored", exist_ok=True)

 # Should be False on spaces and True outside
-LOCAL_DEBUG = False #os.environ.get("system"
+LOCAL_DEBUG = False #not (os.environ.get("system") == "spaces")

 # Display the results
 eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
@@ -84,168 +84,164 @@ def add_new_eval(
     mail: str,
     profile: gr.OAuthProfile,
 ):
-    [5 deleted lines not rendered in the extracted diff]
-        return format_error("This account is not authorized to submit on GAIA.")
-
-
-    contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
-    user_submission_dates = sorted(row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username)
-    if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
-        return format_error("You already submitted once today, please try again tomorrow.")
-
-
-    is_validation = val_or_test == "validation"
-    # Very basic email parsing
-    _, parsed_mail = parseaddr(mail)
-    if not "@" in parsed_mail:
-        return format_warning("Please provide a valid email adress.")
-
-    print("Adding new eval")
-
-    # Check if the combination model/org already exists and prints a warning message if yes
-    if model.lower() in set([m.lower() for m in eval_results[val_or_test]["model"]]) and organisation.lower() in set([o.lower() for o in eval_results[val_or_test]["organisation"]]):
-        return format_warning("This model has been already submitted.")
-
-    if path_to_file is None:
-        return format_warning("Please attach a file.")
-
-    [11 deleted lines not rendered in the extracted diff]
+    # Was the profile created less than 2 month ago?
+    user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
+    creation_date = json.loads(user_data.content)["createdAt"]
+    if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=60):
+        return format_error("This account is not authorized to submit on GAIA.")
+
+
+    contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
+    user_submission_dates = sorted(row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username)
+    if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
+        return format_error("You already submitted once today, please try again tomorrow.")
+
+
+    is_validation = val_or_test == "validation"
+    # Very basic email parsing
+    _, parsed_mail = parseaddr(mail)
+    if not "@" in parsed_mail:
+        return format_warning("Please provide a valid email adress.")
+
+    print("Adding new eval")
+
+    # Check if the combination model/org already exists and prints a warning message if yes
+    if model.lower() in set([m.lower() for m in eval_results[val_or_test]["model"]]) and organisation.lower() in set([o.lower() for o in eval_results[val_or_test]["organisation"]]):
+        return format_warning("This model has been already submitted.")
+
+    if path_to_file is None:
+        return format_warning("Please attach a file.")
+
+    # SAVE UNSCORED SUBMISSION
+    if LOCAL_DEBUG:
+        print("mock uploaded submission")
+    else:
+        api.upload_file(
+            repo_id=SUBMISSION_DATASET,
+            path_or_fileobj=path_to_file.name,
+            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
+            repo_type="dataset",
+            token=TOKEN
+        )

-    [66 deleted lines not rendered in the extracted diff]
+    # SAVE CONTACT
+    contact_info = {
+        "model": model,
+        "model_family": model_family,
+        "url": url,
+        "organisation": organisation,
+        "username": profile.username,
+        "mail": mail,
+        "date": datetime.datetime.today().strftime('%Y-%m-%d')
+    }
+    contact_infos[val_or_test]= contact_infos[val_or_test].add_item(contact_info)
+    if LOCAL_DEBUG:
+        print("mock uploaded contact info")
+    else:
+        contact_infos.push_to_hub(CONTACT_DATASET, config_name = YEAR_VERSION, token=TOKEN)
+
+    # SCORE SUBMISSION
+    file_path = path_to_file.name
+    scores = {"all": 0, 1: 0, 2: 0, 3: 0}
+    num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
+    task_ids = []
+    with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
+        with open(file_path, 'r') as f:
+            for ix, line in enumerate(f):
+                try:
+                    task = json.loads(line)
+                except Exception:
+                    return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")
+
+                if "model_answer" not in task:
+                    return format_error(f"Line {ix} contains no model_answer key. Please fix it and resubmit your file.")
+                answer = task["model_answer"]
+                task_id = task["task_id"]
+                try:
+                    level = int(gold_results[val_or_test][task_id]["Level"])
+                except KeyError:
+                    return format_error(f"{task_id} not found in split {val_or_test}. Are you sure you submitted the correct file?")
+
+                score = question_scorer(task['model_answer'], gold_results[val_or_test][task_id]["Final answer"])
+
+                scored_file.write(
+                    json.dumps({
+                        "id": task_id,
+                        "model_answer": answer,
+                        "score": score,
+                        "level": level
+                    }) + "\n"
+                )
+                task_ids.append(task_id)
+
+                scores["all"] += score
+                scores[level] += score
+                num_questions["all"] += 1
+                num_questions[level] += 1
+
+    # Check if there's any duplicate in the submission
+    if len(task_ids) != len(set(task_ids)):
+        return format_error("There are duplicates in your submission. Please check your file and resubmit it.")
+
+    if any([num_questions[level] != ref_level_len[val_or_test][level] for level in [1, 2, 3]]):
+        return format_error(f"Your submission has {num_questions[1]} questions for level 1, {num_questions[2]} for level 2, and {num_questions[3]} for level 3, but it should have {ref_level_len[val_or_test][1]}, {ref_level_len[val_or_test][2]}, and {ref_level_len[val_or_test][3]} respectively. Please check your submission.")
+
+    # SAVE SCORED SUBMISSION
+    if LOCAL_DEBUG:
+        print("mock uploaded scored submission")
+    else:
+        api.upload_file(
+            repo_id=SUBMISSION_DATASET,
+            path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
+            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
+            repo_type="dataset",
+            token=TOKEN
+        )
+
+    # Save scored file
+    if is_validation:
         api.upload_file(
-            repo_id=
+            repo_id=SUBMISSION_DATASET_PUBLIC,
             path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
             path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
             repo_type="dataset",
             token=TOKEN
         )

-    [35 deleted lines not rendered in the extracted diff]
-    eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
-    print(eval_results)
-    if LOCAL_DEBUG:
-        print("mock uploaded results to lb")
-    else:
-        eval_results.push_to_hub(RESULTS_DATASET, config_name = YEAR_VERSION, token=TOKEN)
-
-
-    return format_log(f"Model {model} submitted by {organisation} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")
-    except Exception as e:
-        print(e)
-        return format_error(f"An error occurred, please open a discussion and indicate at what time you encountered the error.\n")
+    # SAVE TO LEADERBOARD DATA
+    eval_entry = {
+        "model": model,
+        "model_family": model_family,
+        "system_prompt": system_prompt,
+        "url": url,
+        "organisation": organisation,
+        "score": scores["all"]/ref_scores_len[val_or_test],
+        "score_level1": scores[1]/num_questions[1],
+        "score_level2": scores[2]/num_questions[2],
+        "score_level3": scores[3]/num_questions[3],
+        "date": datetime.datetime.today().strftime('%Y-%m-%d')
+    }
+    if num_questions[1] + num_questions[2] + num_questions[3] != ref_scores_len[val_or_test]:
+        return format_error(f"Your submission has {len(scores['all'])} questions for the {val_or_test} set, but it should have {ref_scores_len[val_or_test]}. Please check your submission.")
+    # Catching spam submissions of 100%
+    if all((eval_entry[k] == 1 for k in ["score_level1", "score_level2", "score_level3"])):
+        return format_error(f"There was a problem with your submission. Please open a discussion.")
+
+    # Testing for duplicates - to see if we want to add something like it as it would allow people to try to see the content of other submissions
+    #eval_entry_no_date = {k: v for k, v in eval_entry if k != "date"}
+    #columns_no_date = [c for c in eval_results[val_or_test].column_names if c != "date"]
+    #if eval_entry_no_date in eval_results[val_or_test].select_columns(columns_no_date):
+    #    return format_error(f"Your submission is an exact duplicate from an existing submission.")
+
+    eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
+    print(eval_results)
+    if LOCAL_DEBUG:
+        print("mock uploaded results to lb")
+    else:
+        eval_results.push_to_hub(RESULTS_DATASET, config_name = YEAR_VERSION, token=TOKEN)
+
+
+    return format_log(f"Model {model} submitted by {organisation} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")


 def refresh():
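Review note: the account-age gate added at new lines 87-91 can be exercised on its own. The sketch below is a minimal standalone version, assuming the public https://huggingface.co/api/users/{username}/overview endpoint returns JSON with an ISO-8601 "createdAt" field, which is what the added code relies on; the helper name account_is_too_new is illustrative, not part of the Space.

import datetime
import json

import requests

def account_is_too_new(username: str, min_age_days: int = 60) -> bool:
    # Query the same public endpoint the Space uses.
    user_data = requests.get(f"https://huggingface.co/api/users/{username}/overview")
    # createdAt is assumed to look like "2023-11-07T10:10:07.000Z".
    creation_date = json.loads(user_data.content)["createdAt"]
    created = datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ')
    # Mirror the diff: accounts younger than min_age_days are rejected.
    return datetime.datetime.now() - created < datetime.timedelta(days=min_age_days)

One caveat worth flagging: datetime.datetime.now() is naive local time while createdAt is UTC, so the 60-day cutoff can drift by the server's UTC offset.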
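On the scoring section: the leaderboard entry divides the per-level score sums by the per-level question counts accumulated in the loop, and the earlier check against ref_level_len forces those counts to match the reference split sizes before the divisions run. A self-contained sketch of that bookkeeping, with made-up (level, score) pairs standing in for question_scorer outputs:

# Accumulate binary scores and question counts per level, as the loop does.
scores = {"all": 0, 1: 0, 2: 0, 3: 0}
num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}

results = [(1, 1), (1, 0), (2, 1), (2, 1), (3, 0)]  # hypothetical sample data
for level, score in results:
    scores["all"] += score
    scores[level] += score
    num_questions["all"] += 1
    num_questions[level] += 1

for level in (1, 2, 3):
    print(f"level {level}: {scores[level] / num_questions[level]:.2f}")
# level 1: 0.50 / level 2: 1.00 / level 3: 0.00

One nit visible in the added lines: the error message at new line 225 interpolates {len(scores['all'])}, but scores['all'] is the running score sum (a number), so len() would raise a TypeError if that branch were ever hit; {num_questions['all']} looks like what was intended.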