Files changed (1)
  1. app.py +149 -153
app.py CHANGED
@@ -35,7 +35,7 @@ ref_level_len = {"validation": {1: 53, 2: 86, 3: 26}, "test": {1: 93, 2: 159, 3:
  os.makedirs("scored", exist_ok=True)

  # Should be False on spaces and True outside
- LOCAL_DEBUG = False #os.environ.get("system", "") != "spaces"

  # Display the results
  eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
@@ -84,168 +84,164 @@ def add_new_eval(
  mail: str,
  profile: gr.OAuthProfile,
  ):
- try:
- # Was the profile created less than 2 months ago?
- user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
- creation_date = json.loads(user_data.content)["createdAt"]
- if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=60):
- return format_error("This account is not authorized to submit on GAIA.")
-
- contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
- user_submission_dates = sorted(row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username)
- if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
- return format_error("You already submitted once today; please try again tomorrow.")
-
- is_validation = val_or_test == "validation"
- # Very basic email parsing
- _, parsed_mail = parseaddr(mail)
- if "@" not in parsed_mail:
- return format_warning("Please provide a valid email address.")
-
- print("Adding new eval")
-
- # Check if the model/org combination already exists and warn if so
- if model.lower() in set([m.lower() for m in eval_results[val_or_test]["model"]]) and organisation.lower() in set([o.lower() for o in eval_results[val_or_test]["organisation"]]):
- return format_warning("This model has already been submitted.")

- if path_to_file is None:
- return format_warning("Please attach a file.")

- # SAVE UNSCORED SUBMISSION
- if LOCAL_DEBUG:
- print("mock uploaded submission")
- else:
- api.upload_file(
- repo_id=SUBMISSION_DATASET,
- path_or_fileobj=path_to_file.name,
- path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
- repo_type="dataset",
- token=TOKEN
- )

- # SAVE CONTACT
- contact_info = {
- "model": model,
- "model_family": model_family,
- "url": url,
- "organisation": organisation,
- "username": profile.username,
- "mail": mail,
- "date": datetime.datetime.today().strftime('%Y-%m-%d')
- }
- contact_infos[val_or_test] = contact_infos[val_or_test].add_item(contact_info)
- if LOCAL_DEBUG:
- print("mock uploaded contact info")
- else:
- contact_infos.push_to_hub(CONTACT_DATASET, config_name=YEAR_VERSION, token=TOKEN)
-
- # SCORE SUBMISSION
- file_path = path_to_file.name
- scores = {"all": 0, 1: 0, 2: 0, 3: 0}
- num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
- task_ids = []
- with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
- with open(file_path, 'r') as f:
- for ix, line in enumerate(f):
- try:
- task = json.loads(line)
- except Exception:
- return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")
-
- if "model_answer" not in task:
- return format_error(f"Line {ix} contains no model_answer key. Please fix it and resubmit your file.")
- answer = task["model_answer"]
- task_id = task["task_id"]
- try:
- level = int(gold_results[val_or_test][task_id]["Level"])
- except KeyError:
- return format_error(f"{task_id} not found in split {val_or_test}. Are you sure you submitted the correct file?")
-
- score = question_scorer(answer, gold_results[val_or_test][task_id]["Final answer"])
-
- scored_file.write(
- json.dumps({
- "id": task_id,
- "model_answer": answer,
- "score": score,
- "level": level
- }) + "\n"
- )
- task_ids.append(task_id)
-
- scores["all"] += score
- scores[level] += score
- num_questions["all"] += 1
- num_questions[level] += 1
-
- # Check if there's any duplicate in the submission
- if len(task_ids) != len(set(task_ids)):
- return format_error("There are duplicates in your submission. Please check your file and resubmit it.")
-
- if any(num_questions[level] != ref_level_len[val_or_test][level] for level in [1, 2, 3]):
- return format_error(f"Your submission has {num_questions[1]} questions for level 1, {num_questions[2]} for level 2, and {num_questions[3]} for level 3, but it should have {ref_level_len[val_or_test][1]}, {ref_level_len[val_or_test][2]}, and {ref_level_len[val_or_test][3]} respectively. Please check your submission.")
-
- # SAVE SCORED SUBMISSION
- if LOCAL_DEBUG:
- print("mock uploaded scored submission")
- else:
  api.upload_file(
- repo_id=SUBMISSION_DATASET,
  path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
  path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
  repo_type="dataset",
  token=TOKEN
  )

- # Mirror validation scores to the public dataset
- if is_validation:
- api.upload_file(
- repo_id=SUBMISSION_DATASET_PUBLIC,
- path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
- path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
- repo_type="dataset",
- token=TOKEN
- )
-
- # SAVE TO LEADERBOARD DATA
- eval_entry = {
- "model": model,
- "model_family": model_family,
- "system_prompt": system_prompt,
- "url": url,
- "organisation": organisation,
- "score": scores["all"] / ref_scores_len[val_or_test],
- "score_level1": scores[1] / num_questions[1],
- "score_level2": scores[2] / num_questions[2],
- "score_level3": scores[3] / num_questions[3],
- "date": datetime.datetime.today().strftime('%Y-%m-%d')
- }
- if num_questions[1] + num_questions[2] + num_questions[3] != ref_scores_len[val_or_test]:
- return format_error(f"Your submission has {num_questions['all']} questions for the {val_or_test} set, but it should have {ref_scores_len[val_or_test]}. Please check your submission.")
- # Catch spam submissions of 100%
- if all(eval_entry[k] == 1 for k in ["score_level1", "score_level2", "score_level3"]):
- return format_error("There was a problem with your submission. Please open a discussion.")
-
- # Duplicate testing - disabled for now, as it would let people probe the content of other submissions
- #eval_entry_no_date = {k: v for k, v in eval_entry.items() if k != "date"}
- #columns_no_date = [c for c in eval_results[val_or_test].column_names if c != "date"]
- #if eval_entry_no_date in eval_results[val_or_test].select_columns(columns_no_date):
- # return format_error("Your submission is an exact duplicate of an existing submission.")
-
- eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
- print(eval_results)
- if LOCAL_DEBUG:
- print("mock uploaded results to lb")
- else:
- eval_results.push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, token=TOKEN)
-
- return format_log(f"Model {model} submitted by {organisation} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")
- except Exception as e:
- print(e)
- return format_error("An error occurred; please open a discussion and indicate at what time you encountered the error.\n")


  def refresh():
 
  os.makedirs("scored", exist_ok=True)

  # Should be False on spaces and True outside
+ LOCAL_DEBUG = False #not (os.environ.get("system") == "spaces")

  # Display the results
  eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
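The commented-out expression shows the intended environment toggle; hard-coding `False` keeps uploads live everywhere. A minimal sketch of that toggle, assuming (as the inline comment does) that a `system` environment variable is set to `"spaces"` on Spaces:

```python
import os

# Flip to debug mode automatically when not running on Spaces.
# Assumption from the inline comment: Spaces set os.environ["system"] == "spaces".
LOCAL_DEBUG = not (os.environ.get("system") == "spaces")

if LOCAL_DEBUG:
    print("running locally: uploads will be mocked")
```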
 
  mail: str,
  profile: gr.OAuthProfile,
  ):
+ # Was the profile created less than 2 months ago?
+ user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
+ creation_date = json.loads(user_data.content)["createdAt"]
+ if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=60):
+ return format_error("This account is not authorized to submit on GAIA.")
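Pulled out of the handler, the age gate amounts to the following standalone sketch (the helper name is illustrative; the endpoint, date format, and 60-day threshold are the ones used above):

```python
import datetime
import json
import requests

def account_is_too_new(username: str, min_age_days: int = 60) -> bool:
    """Return True if the HF account is younger than min_age_days."""
    resp = requests.get(f"https://huggingface.co/api/users/{username}/overview")
    created_at = json.loads(resp.content)["createdAt"]  # e.g. "2023-11-05T14:48:00.000Z"
    created = datetime.datetime.strptime(created_at, "%Y-%m-%dT%H:%M:%S.%fZ")
    return datetime.datetime.now() - created < datetime.timedelta(days=min_age_days)
```

Like the handler, this compares a naive `now()` against the UTC timestamp, which is harmless at a 60-day granularity.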
+ contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
+ user_submission_dates = sorted(row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username)
+ if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
+ return format_error("You already submitted once today; please try again tomorrow.")
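The once-per-day rule reduces to checking the user's latest recorded date. A minimal sketch over the contact rows (the function name is illustrative; `rows` stands for `contact_infos[val_or_test]`):

```python
import datetime

def already_submitted_today(rows, username: str) -> bool:
    """True if this user's most recent recorded submission was made today."""
    dates = sorted(row["date"] for row in rows if row["username"] == username)
    today = datetime.datetime.today().strftime("%Y-%m-%d")
    return bool(dates) and dates[-1] == today
```

Because the dates are ISO `YYYY-MM-DD` strings, lexicographic order matches chronological order, so the last element of the sorted list is the most recent submission.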
+
+ is_validation = val_or_test == "validation"
+ # Very basic email parsing
+ _, parsed_mail = parseaddr(mail)
+ if "@" not in parsed_mail:
+ return format_warning("Please provide a valid email address.")
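`parseaddr` (from the stdlib `email.utils`) splits a display-name/address pair; only the address part is kept and checked for an `@`. For example (addresses are made up):

```python
from email.utils import parseaddr

_, addr = parseaddr("Jane Doe <jane@example.org>")
print(addr)          # jane@example.org
print("@" in addr)   # True -> accepted

_, addr = parseaddr("not-an-email")
print("@" in addr)   # False -> rejected with a warning
```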
+
+ print("Adding new eval")
+
+ # Check if the model/org combination already exists and warn if so
+ if model.lower() in set([m.lower() for m in eval_results[val_or_test]["model"]]) and organisation.lower() in set([o.lower() for o in eval_results[val_or_test]["organisation"]]):
+ return format_warning("This model has already been submitted.")
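Note that this test consults the model and organisation columns independently, so it can fire whenever the model name and the organisation each appear somewhere on the board, even if never together. A stricter pairwise variant (a sketch of an alternative, not what the app does; `split` stands for `eval_results[val_or_test]`):

```python
def is_resubmission(split, model: str, organisation: str) -> bool:
    """Flag only an exact (organisation, model) pair already on the leaderboard."""
    seen = {
        (o.lower(), m.lower())
        for o, m in zip(split["organisation"], split["model"])
    }
    return (organisation.lower(), model.lower()) in seen
```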
+
+ if path_to_file is None:
+ return format_warning("Please attach a file.")
+
+ # SAVE UNSCORED SUBMISSION
+ if LOCAL_DEBUG:
+ print("mock uploaded submission")
+ else:
+ api.upload_file(
+ repo_id=SUBMISSION_DATASET,
+ path_or_fileobj=path_to_file.name,
+ path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
+ repo_type="dataset",
+ token=TOKEN
+ )
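The raw file is archived verbatim with `huggingface_hub`'s `HfApi.upload_file` before any scoring happens. A minimal sketch with placeholder values (the repo id, paths, and token are hypothetical; the app derives them from its constants and the form fields):

```python
from huggingface_hub import HfApi

api = HfApi()
api.upload_file(
    repo_id="my-org/submissions-archive",  # placeholder for SUBMISSION_DATASET
    path_or_fileobj="answers.jsonl",       # local file to upload
    path_in_repo="MyOrg/my-model/2023_validation_raw_2024-01-01.jsonl",
    repo_type="dataset",
    token="hf_xxx",                        # placeholder for TOKEN
)
```

In the real path, `datetime.datetime.today()` includes the time of day, so repeated uploads land at distinct paths rather than overwriting each other.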

+ # SAVE CONTACT
+ contact_info = {
+ "model": model,
+ "model_family": model_family,
+ "url": url,
+ "organisation": organisation,
+ "username": profile.username,
+ "mail": mail,
+ "date": datetime.datetime.today().strftime('%Y-%m-%d')
+ }
+ contact_infos[val_or_test] = contact_infos[val_or_test].add_item(contact_info)
+ if LOCAL_DEBUG:
+ print("mock uploaded contact info")
+ else:
+ contact_infos.push_to_hub(CONTACT_DATASET, config_name=YEAR_VERSION, token=TOKEN)
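`datasets` objects are immutable, so `add_item` returns a new `Dataset` that must be assigned back into the `DatasetDict` before the whole dict is pushed. A sketch of the pattern (repo and config names are placeholders for `CONTACT_DATASET` / `YEAR_VERSION`, and the row values are made up):

```python
from datasets import load_dataset

contact_infos = load_dataset("my-org/contact-info", "2023")  # a DatasetDict
row = {
    "model": "my-model",
    "model_family": "my-family",
    "url": "https://example.org",
    "organisation": "MyOrg",
    "username": "jane",
    "mail": "jane@example.org",
    "date": "2024-01-01",
}
contact_infos["validation"] = contact_infos["validation"].add_item(row)
contact_infos.push_to_hub("my-org/contact-info", config_name="2023")
```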
+
+ # SCORE SUBMISSION
+ file_path = path_to_file.name
+ scores = {"all": 0, 1: 0, 2: 0, 3: 0}
+ num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
+ task_ids = []
+ with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
+ with open(file_path, 'r') as f:
+ for ix, line in enumerate(f):
+ try:
+ task = json.loads(line)
+ except Exception:
+ return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")
+
+ if "model_answer" not in task:
+ return format_error(f"Line {ix} contains no model_answer key. Please fix it and resubmit your file.")
+ answer = task["model_answer"]
+ task_id = task["task_id"]
+ try:
+ level = int(gold_results[val_or_test][task_id]["Level"])
+ except KeyError:
+ return format_error(f"{task_id} not found in split {val_or_test}. Are you sure you submitted the correct file?")
+
+ score = question_scorer(answer, gold_results[val_or_test][task_id]["Final answer"])
+
+ scored_file.write(
+ json.dumps({
+ "id": task_id,
+ "model_answer": answer,
+ "score": score,
+ "level": level
+ }) + "\n"
+ )
+ task_ids.append(task_id)
+
+ scores["all"] += score
+ scores[level] += score
+ num_questions["all"] += 1
+ num_questions[level] += 1
+
+ # Check if there's any duplicate in the submission
+ if len(task_ids) != len(set(task_ids)):
+ return format_error("There are duplicates in your submission. Please check your file and resubmit it.")
+
+ if any(num_questions[level] != ref_level_len[val_or_test][level] for level in [1, 2, 3]):
+ return format_error(f"Your submission has {num_questions[1]} questions for level 1, {num_questions[2]} for level 2, and {num_questions[3]} for level 3, but it should have {ref_level_len[val_or_test][1]}, {ref_level_len[val_or_test][2]}, and {ref_level_len[val_or_test][3]} respectively. Please check your submission.")
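Condensed, the scoring pass is: parse each JSONL line, look up the gold entry, score, and tally per level. A self-contained sketch (exact match stands in for the app's `question_scorer`, and the dictionary shapes are illustrative):

```python
import json

def score_submission(path: str, gold: dict, expected: dict):
    """Tally scores per level; gold maps task_id -> {"Level", "Final answer"}."""
    scores = {"all": 0, 1: 0, 2: 0, 3: 0}
    counts = {"all": 0, 1: 0, 2: 0, 3: 0}
    task_ids = []
    with open(path) as f:
        for line in f:
            task = json.loads(line)
            entry = gold[task["task_id"]]
            level = int(entry["Level"])
            # Stand-in scorer: the real app calls question_scorer() here.
            score = int(task["model_answer"] == entry["Final answer"])
            task_ids.append(task["task_id"])
            for key in ("all", level):
                scores[key] += score
                counts[key] += 1
    if len(task_ids) != len(set(task_ids)):
        raise ValueError("duplicate task_ids in submission")
    if any(counts[l] != expected[l] for l in (1, 2, 3)):
        raise ValueError("submission does not cover the full split")
    return scores, counts
```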
+
+ # SAVE SCORED SUBMISSION
+ if LOCAL_DEBUG:
+ print("mock uploaded scored submission")
+ else:
+ api.upload_file(
+ repo_id=SUBMISSION_DATASET,
+ path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
+ path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
+ repo_type="dataset",
+ token=TOKEN
+ )
+
+ # Mirror validation scores to the public dataset
+ if is_validation:
  api.upload_file(
+ repo_id=SUBMISSION_DATASET_PUBLIC,
  path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
  path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
  repo_type="dataset",
  token=TOKEN
  )

+ # SAVE TO LEADERBOARD DATA
+ eval_entry = {
+ "model": model,
+ "model_family": model_family,
+ "system_prompt": system_prompt,
+ "url": url,
+ "organisation": organisation,
+ "score": scores["all"] / ref_scores_len[val_or_test],
+ "score_level1": scores[1] / num_questions[1],
+ "score_level2": scores[2] / num_questions[2],
+ "score_level3": scores[3] / num_questions[3],
+ "date": datetime.datetime.today().strftime('%Y-%m-%d')
+ }
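The overall score divides by the reference split size (`ref_scores_len`), while the per-level scores divide by the observed counts; the earlier count checks guarantee those denominators match the reference, so the two normalizations are consistent. A worked example with hypothetical tallies on the validation split (53 + 86 + 26 = 165 questions, per `ref_level_len` above):

```python
scores = {"all": 99, 1: 40, 2: 45, 3: 14}   # hypothetical per-level totals
num_questions = {1: 53, 2: 86, 3: 26}       # validation split sizes

overall = scores["all"] / 165               # 0.60
per_level = {l: scores[l] / num_questions[l] for l in (1, 2, 3)}
# {1: 0.7547..., 2: 0.5232..., 3: 0.5384...}
```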
+ if num_questions[1] + num_questions[2] + num_questions[3] != ref_scores_len[val_or_test]:
+ return format_error(f"Your submission has {num_questions['all']} questions for the {val_or_test} set, but it should have {ref_scores_len[val_or_test]}. Please check your submission.")
+ # Catch spam submissions of 100%
+ if all(eval_entry[k] == 1 for k in ["score_level1", "score_level2", "score_level3"]):
+ return format_error("There was a problem with your submission. Please open a discussion.")
+
+ # Duplicate testing - disabled for now, as it would let people probe the content of other submissions
+ #eval_entry_no_date = {k: v for k, v in eval_entry.items() if k != "date"}
+ #columns_no_date = [c for c in eval_results[val_or_test].column_names if c != "date"]
+ #if eval_entry_no_date in eval_results[val_or_test].select_columns(columns_no_date):
+ # return format_error("Your submission is an exact duplicate of an existing submission.")
+
+ eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
+ print(eval_results)
+ if LOCAL_DEBUG:
+ print("mock uploaded results to lb")
+ else:
+ eval_results.push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, token=TOKEN)
+
+ return format_log(f"Model {model} submitted by {organisation} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")

  def refresh():