Commit
·
688f116
1
Parent(s):
211c032
Some changes to text; going to try changing the upload to two files
Browse files
- README.md +1 -1
- about.py +7 -6
- app.py +19 -17
- constants.py +2 -1
- data/example-predictions-cv.csv +1 -1
- data/example-predictions.csv +1 -1
- evaluation.py +3 -9
- requirements.txt +1 -1
- submit.py +0 -3
- utils.py +170 -31
- validation.py +5 -4
README.md
CHANGED
|
@@ -35,4 +35,4 @@ uv run python app.py
|
|
| 35 |
Run tests
|
| 36 |
```
|
| 37 |
uv run pytest
|
| 38 |
-
```
|
|
|
|
| 35 |
Run tests
|
| 36 |
```
|
| 37 |
uv run pytest
|
| 38 |
+
```
|
about.py
CHANGED
|
@@ -4,6 +4,8 @@ from constants import (
|
|
| 4 |
SUBMIT_TAB_NAME,
|
| 5 |
TERMS_URL,
|
| 6 |
FAQ_TAB_NAME,
|
|
|
|
|
|
|
| 7 |
)
|
| 8 |
|
| 9 |
ABOUT_INTRO = f"""
|
|
@@ -21,7 +23,7 @@ Here we invite the community to submit and develop better predictors, which will
|
|
| 21 |
|
| 22 |
For each of the 5 properties in the competition, there is a prize for the model with the highest performance for that property on the private test set.
|
| 23 |
There is also an 'open-source' prize for the best model trained on the GDPa1 dataset of monoclonal antibodies (reporting cross-validation results) and assessed on the private test set where authors provide all training code and data.
|
| 24 |
-
For each of these 6 prizes, participants have the choice between **$10k in data generation credits** with [Ginkgo Datapoints](https://datapoints.ginkgo.bio/) or a **cash prize** with a value of
|
| 25 |
|
| 26 |
See the "{FAQ_TAB_NAME}" tab above (you are currently on the "{ABOUT_TAB_NAME}" tab) or the [competition terms]({TERMS_URL}) for more details.
|
| 27 |
"""
|
|
@@ -42,7 +44,7 @@ Submissions close on **1 November 2025**.
|
|
| 42 |
|
| 43 |
#### Acknowledgements
|
| 44 |
|
| 45 |
-
We gratefully acknowledge [Tamarind Bio](https://www.tamarind.bio/)'s help in running the following models:
|
| 46 |
- TAP (Therapeutic Antibody Profiler)
|
| 47 |
- SaProt
|
| 48 |
- DeepViscosity
|
|
@@ -54,12 +56,11 @@ We're working on getting more public models added, so that participants have mor
|
|
| 54 |
#### How to contribute?
|
| 55 |
|
| 56 |
We'd like to add some more existing developability models to the leaderboard. Some examples of models we'd like to add:
|
| 57 |
-
- ESM embeddings + ridge regression
|
| 58 |
- Absolute folding stability models (for Thermostability)
|
| 59 |
- PROPERMAB
|
| 60 |
- AbMelt (requires GROMACS for MD simulations)
|
| 61 |
|
| 62 |
-
If you would like to
|
| 63 |
"""
|
| 64 |
|
| 65 |
# Note(Lood): Significance: Add another note of "many models are trained on different datasets, and differing train/test splits, so this is a consistent way of comparing for a heldout set"
|
|
@@ -155,8 +156,8 @@ We may release private test set results at intermediate points during the compet
|
|
| 155 |
## Cross-validation
|
| 156 |
|
| 157 |
For the cross-validation metrics (if training only on the GDPa1 dataset), use the `"hierarchical_cluster_IgG_isotype_stratified_fold"` column to split the dataset into folds and make predictions for each of the folds.
|
| 158 |
-
Submit a CSV file in the same format but also containing the `"hierarchical_cluster_IgG_isotype_stratified_fold"` column.
|
| 159 |
-
|
| 160 |
|
| 161 |
Submissions close on **1 November 2025**.
|
| 162 |
"""
|
|
|
|
| 4 |
SUBMIT_TAB_NAME,
|
| 5 |
TERMS_URL,
|
| 6 |
FAQ_TAB_NAME,
|
| 7 |
+
SLACK_URL,
|
| 8 |
+
TUTORIAL_URL,
|
| 9 |
)
|
| 10 |
|
| 11 |
ABOUT_INTRO = f"""
|
|
|
|
| 23 |
|
| 24 |
For each of the 5 properties in the competition, there is a prize for the model with the highest performance for that property on the private test set.
|
| 25 |
There is also an 'open-source' prize for the best model trained on the GDPa1 dataset of monoclonal antibodies (reporting cross-validation results) and assessed on the private test set where authors provide all training code and data.
|
| 26 |
+
For each of these 6 prizes, participants have the choice between **$10k in data generation credits** with [Ginkgo Datapoints](https://datapoints.ginkgo.bio/) or a **cash prize** with a value of **$2000**.
|
| 27 |
|
| 28 |
See the "{FAQ_TAB_NAME}" tab above (you are currently on the "{ABOUT_TAB_NAME}" tab) or the [competition terms]({TERMS_URL}) for more details.
|
| 29 |
"""
|
|
|
|
| 44 |
|
| 45 |
#### Acknowledgements
|
| 46 |
|
| 47 |
+
We gratefully acknowledge [Tamarind Bio](https://www.tamarind.bio/)'s help in running the following models which are on the leaderboard:
|
| 48 |
- TAP (Therapeutic Antibody Profiler)
|
| 49 |
- SaProt
|
| 50 |
- DeepViscosity
|
|
|
|
| 56 |
#### How to contribute?
|
| 57 |
|
| 58 |
We'd like to add some more existing developability models to the leaderboard. Some examples of models we'd like to add:
|
|
|
|
| 59 |
- Absolute folding stability models (for Thermostability)
|
| 60 |
- PROPERMAB
|
| 61 |
- AbMelt (requires GROMACS for MD simulations)
|
| 62 |
|
| 63 |
+
If you would like to form a team or discuss ideas, join the [Slack community]({SLACK_URL}) co-hosted by Bits in Bio.
|
| 64 |
"""
|
| 65 |
|
| 66 |
# Note(Lood): Significance: Add another note of "many models are trained on different datasets, and differing train/test splits, so this is a consistent way of comparing for a heldout set"
|
|
|
|
| 156 |
## Cross-validation
|
| 157 |
|
| 158 |
For the cross-validation metrics (if training only on the GDPa1 dataset), use the `"hierarchical_cluster_IgG_isotype_stratified_fold"` column to split the dataset into folds and make predictions for each of the folds.
|
| 159 |
+
Submit a CSV file in the same format but also containing the `"hierarchical_cluster_IgG_isotype_stratified_fold"` column.
|
| 160 |
+
Check out our tutorial on making an antibody developability prediction model [here]({TUTORIAL_URL}).
|
| 161 |
|
| 162 |
Submissions close on **1 November 2025**.
|
| 163 |
"""
|
app.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
-
import hashlib
|
| 2 |
import pandas as pd
|
| 3 |
import gradio as gr
|
| 4 |
from gradio.themes.utils import sizes
|
| 5 |
from gradio_leaderboard import Leaderboard
|
| 6 |
from dotenv import load_dotenv
|
| 7 |
import contextlib
|
|
|
|
| 8 |
load_dotenv() # Load environment variables from .env file
|
| 9 |
|
| 10 |
from about import ABOUT_INTRO, ABOUT_TEXT, FAQS, SUBMIT_INTRUCTIONS
|
|
@@ -18,9 +18,10 @@ from constants import (
|
|
| 18 |
LEADERBOARD_COLUMNS_RENAME,
|
| 19 |
LEADERBOARD_COLUMNS_RENAME_LIST,
|
| 20 |
SUBMIT_TAB_NAME,
|
|
|
|
| 21 |
)
|
| 22 |
from submit import make_submission
|
| 23 |
-
from utils import fetch_hf_results, show_output_box
|
| 24 |
|
| 25 |
|
| 26 |
def format_leaderboard_table(df_results: pd.DataFrame, assay: str | None = None):
|
|
@@ -71,6 +72,7 @@ def get_leaderboard_object(assay: str | None = None):
|
|
| 71 |
fetch_hf_results()
|
| 72 |
current_dataframe = pd.read_csv("debug-current-results.csv")
|
| 73 |
|
|
|
|
| 74 |
def refresh_overall_leaderboard():
|
| 75 |
current_dataframe = pd.read_csv("debug-current-results.csv")
|
| 76 |
return format_leaderboard_table(df_results=current_dataframe)
|
|
@@ -78,6 +80,7 @@ def refresh_overall_leaderboard():
|
|
| 78 |
|
| 79 |
def fetch_latest_data(stop_event):
|
| 80 |
import time
|
|
|
|
| 81 |
while not stop_event.is_set():
|
| 82 |
try:
|
| 83 |
fetch_hf_results()
|
|
@@ -90,6 +93,7 @@ def fetch_latest_data(stop_event):
|
|
| 90 |
@contextlib.asynccontextmanager
|
| 91 |
async def periodic_data_fetch(app):
|
| 92 |
import threading
|
|
|
|
| 93 |
event = threading.Event()
|
| 94 |
t = threading.Thread(target=fetch_latest_data, args=(event,), daemon=True)
|
| 95 |
t.start()
|
|
@@ -98,7 +102,7 @@ async def periodic_data_fetch(app):
|
|
| 98 |
t.join(3)
|
| 99 |
|
| 100 |
|
| 101 |
-
# Lood: Two problems currently:
|
| 102 |
# 1. The data_version state value isn't being incremented, it seems (even though it's triggering the dataframe change correctly)
|
| 103 |
# 2. The global current_dataframe is being shared across all sessions
|
| 104 |
|
|
@@ -165,7 +169,7 @@ with gr.Blocks(theme=gr.themes.Default(text_size=sizes.text_lg)) as demo:
|
|
| 165 |
"""
|
| 166 |
# Overall Leaderboard (filter below by property)
|
| 167 |
Each property has its own prize, and participants can submit models for any combination of properties.
|
| 168 |
-
|
| 169 |
**Note**: It is trivial to overfit the public GDPa1 dataset, which results in very high Spearman correlations.
|
| 170 |
We would suggest training using cross-validation a limited number of times to give a better indication of the model's performance on the eventual private test set.
|
| 171 |
"""
|
|
@@ -182,7 +186,9 @@ with gr.Blocks(theme=gr.themes.Default(text_size=sizes.text_lg)) as demo:
|
|
| 182 |
with gr.TabItem(SUBMIT_TAB_NAME, elem_id="boundary-benchmark-tab-table"):
|
| 183 |
gr.Markdown(SUBMIT_INTRUCTIONS)
|
| 184 |
submission_type_state = gr.State(value="GDPa1_cross_validation")
|
| 185 |
-
download_file_state = gr.State(
|
|
|
|
|
|
|
| 186 |
|
| 187 |
with gr.Row():
|
| 188 |
with gr.Column():
|
|
@@ -215,13 +221,11 @@ with gr.Blocks(theme=gr.themes.Default(text_size=sizes.text_lg)) as demo:
|
|
| 215 |
placeholder="Enter your registration code",
|
| 216 |
info="If you did not receive a registration code, please sign up on the <a href='https://datapoints.ginkgo.bio/ai-competitions/2025-abdev-competition'>Competition Registration page</a> or email <a href='mailto:[email protected]'>[email protected]</a>.",
|
| 217 |
)
|
| 218 |
-
|
| 219 |
# Extra validation / warning
|
| 220 |
# Add the conditional warning checkbox
|
| 221 |
high_corr_warning = gr.Markdown(
|
| 222 |
-
value="",
|
| 223 |
-
visible=False,
|
| 224 |
-
elem_classes=["warning-box"]
|
| 225 |
)
|
| 226 |
high_corr_checkbox = gr.Checkbox(
|
| 227 |
label="I understand this may be overfitting",
|
|
@@ -229,7 +233,7 @@ with gr.Blocks(theme=gr.themes.Default(text_size=sizes.text_lg)) as demo:
|
|
| 229 |
visible=False,
|
| 230 |
info="This checkbox will appear if your submission shows suspiciously high correlations (>0.9).",
|
| 231 |
)
|
| 232 |
-
|
| 233 |
with gr.Column():
|
| 234 |
submission_type_dropdown = gr.Dropdown(
|
| 235 |
choices=["GDPa1", "GDPa1_cross_validation", "Heldout Test Set"],
|
|
@@ -275,10 +279,6 @@ with gr.Blocks(theme=gr.themes.Default(text_size=sizes.text_lg)) as demo:
|
|
| 275 |
|
| 276 |
submit_btn = gr.Button("Evaluate")
|
| 277 |
message = gr.Textbox(label="Status", lines=1, visible=False)
|
| 278 |
-
# help message
|
| 279 |
-
gr.Markdown(
|
| 280 |
-
"If you have issues with submission or using the leaderboard, please start a discussion in the Community tab of this Space."
|
| 281 |
-
)
|
| 282 |
|
| 283 |
submit_btn.click(
|
| 284 |
make_submission,
|
|
@@ -309,12 +309,14 @@ with gr.Blocks(theme=gr.themes.Default(text_size=sizes.text_lg)) as demo:
|
|
| 309 |
gr.Markdown(
|
| 310 |
f"""
|
| 311 |
<div style="text-align: center; font-size: 14px; color: gray; margin-top: 2em;">
|
| 312 |
-
📬 For questions or feedback, contact <a href="mailto:[email protected]">[email protected]</a> or
|
| 313 |
-
Visit the <a href="https://datapoints.ginkgo.bio/ai-competitions/2025-abdev-competition">Competition Registration page</a> to sign up for updates and to register
|
| 314 |
</div>
|
| 315 |
""",
|
| 316 |
elem_id="contact-footer",
|
| 317 |
)
|
| 318 |
|
| 319 |
if __name__ == "__main__":
|
| 320 |
-
demo.launch(
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
import gradio as gr
|
| 3 |
from gradio.themes.utils import sizes
|
| 4 |
from gradio_leaderboard import Leaderboard
|
| 5 |
from dotenv import load_dotenv
|
| 6 |
import contextlib
|
| 7 |
+
|
| 8 |
load_dotenv() # Load environment variables from .env file
|
| 9 |
|
| 10 |
from about import ABOUT_INTRO, ABOUT_TEXT, FAQS, SUBMIT_INTRUCTIONS
|
|
|
|
| 18 |
LEADERBOARD_COLUMNS_RENAME,
|
| 19 |
LEADERBOARD_COLUMNS_RENAME_LIST,
|
| 20 |
SUBMIT_TAB_NAME,
|
| 21 |
+
SLACK_URL,
|
| 22 |
)
|
| 23 |
from submit import make_submission
|
| 24 |
+
from utils import fetch_hf_results, show_output_box
|
| 25 |
|
| 26 |
|
| 27 |
def format_leaderboard_table(df_results: pd.DataFrame, assay: str | None = None):
|
|
|
|
| 72 |
fetch_hf_results()
|
| 73 |
current_dataframe = pd.read_csv("debug-current-results.csv")
|
| 74 |
|
| 75 |
+
|
| 76 |
def refresh_overall_leaderboard():
|
| 77 |
current_dataframe = pd.read_csv("debug-current-results.csv")
|
| 78 |
return format_leaderboard_table(df_results=current_dataframe)
|
|
|
|
| 80 |
|
| 81 |
def fetch_latest_data(stop_event):
|
| 82 |
import time
|
| 83 |
+
|
| 84 |
while not stop_event.is_set():
|
| 85 |
try:
|
| 86 |
fetch_hf_results()
|
|
|
|
| 93 |
@contextlib.asynccontextmanager
|
| 94 |
async def periodic_data_fetch(app):
|
| 95 |
import threading
|
| 96 |
+
|
| 97 |
event = threading.Event()
|
| 98 |
t = threading.Thread(target=fetch_latest_data, args=(event,), daemon=True)
|
| 99 |
t.start()
|
|
|
|
| 102 |
t.join(3)
|
| 103 |
|
| 104 |
|
| 105 |
+
# Lood: Two problems currently:
|
| 106 |
# 1. The data_version state value isn't being incremented, it seems (even though it's triggering the dataframe change correctly)
|
| 107 |
# 2. The global current_dataframe is being shared across all sessions
|
| 108 |
|
|
|
|
| 169 |
"""
|
| 170 |
# Overall Leaderboard (filter below by property)
|
| 171 |
Each property has its own prize, and participants can submit models for any combination of properties.
|
| 172 |
+
|
| 173 |
**Note**: It is trivial to overfit the public GDPa1 dataset, which results in very high Spearman correlations.
|
| 174 |
We would suggest training using cross-validation a limited number of times to give a better indication of the model's performance on the eventual private test set.
|
| 175 |
"""
|
|
|
|
| 186 |
with gr.TabItem(SUBMIT_TAB_NAME, elem_id="boundary-benchmark-tab-table"):
|
| 187 |
gr.Markdown(SUBMIT_INTRUCTIONS)
|
| 188 |
submission_type_state = gr.State(value="GDPa1_cross_validation")
|
| 189 |
+
download_file_state = gr.State(
|
| 190 |
+
value=EXAMPLE_FILE_DICT["GDPa1_cross_validation"]
|
| 191 |
+
)
|
| 192 |
|
| 193 |
with gr.Row():
|
| 194 |
with gr.Column():
|
|
|
|
| 221 |
placeholder="Enter your registration code",
|
| 222 |
info="If you did not receive a registration code, please sign up on the <a href='https://datapoints.ginkgo.bio/ai-competitions/2025-abdev-competition'>Competition Registration page</a> or email <a href='mailto:[email protected]'>[email protected]</a>.",
|
| 223 |
)
|
| 224 |
+
|
| 225 |
# Extra validation / warning
|
| 226 |
# Add the conditional warning checkbox
|
| 227 |
high_corr_warning = gr.Markdown(
|
| 228 |
+
value="", visible=False, elem_classes=["warning-box"]
|
|
|
|
|
|
|
| 229 |
)
|
| 230 |
high_corr_checkbox = gr.Checkbox(
|
| 231 |
label="I understand this may be overfitting",
|
|
|
|
| 233 |
visible=False,
|
| 234 |
info="This checkbox will appear if your submission shows suspiciously high correlations (>0.9).",
|
| 235 |
)
|
| 236 |
+
|
| 237 |
with gr.Column():
|
| 238 |
submission_type_dropdown = gr.Dropdown(
|
| 239 |
choices=["GDPa1", "GDPa1_cross_validation", "Heldout Test Set"],
|
|
|
|
| 279 |
|
| 280 |
submit_btn = gr.Button("Evaluate")
|
| 281 |
message = gr.Textbox(label="Status", lines=1, visible=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
|
| 283 |
submit_btn.click(
|
| 284 |
make_submission,
|
|
|
|
| 309 |
gr.Markdown(
|
| 310 |
f"""
|
| 311 |
<div style="text-align: center; font-size: 14px; color: gray; margin-top: 2em;">
|
| 312 |
+
📬 For questions or feedback, contact <a href="mailto:[email protected]">[email protected]</a> or discuss on the <a href="{SLACK_URL}">Slack community</a> co-hosted by Bits in Bio.<br>
|
| 313 |
+
Visit the <a href="https://datapoints.ginkgo.bio/ai-competitions/2025-abdev-competition">Competition Registration page</a> to sign up for updates and to register, and see Terms <a href="{TERMS_URL}">here</a>.
|
| 314 |
</div>
|
| 315 |
""",
|
| 316 |
elem_id="contact-footer",
|
| 317 |
)
|
| 318 |
|
| 319 |
if __name__ == "__main__":
|
| 320 |
+
demo.launch(
|
| 321 |
+
ssr_mode=False, share=True, app_kwargs={"lifespan": periodic_data_fetch}
|
| 322 |
+
)
|
constants.py
CHANGED
|
@@ -4,7 +4,6 @@ Constants for the Antibody Developability Benchmark
|
|
| 4 |
|
| 5 |
import os
|
| 6 |
from huggingface_hub import HfApi
|
| 7 |
-
import pandas as pd
|
| 8 |
|
| 9 |
ASSAY_LIST = ["AC-SINS_pH7.4", "PR_CHO", "HIC", "Tm2", "Titer"]
|
| 10 |
ASSAY_RENAME = {
|
|
@@ -42,6 +41,8 @@ SUBMIT_TAB_NAME = "✉️ Submit"
|
|
| 42 |
|
| 43 |
REGISTRATION_CODE = os.environ.get("REGISTRATION_CODE")
|
| 44 |
TERMS_URL = "https://euphsfcyogalqiqsawbo.supabase.co/storage/v1/object/public/gdpweb/pdfs/2025%20Ginkgo%20Antibody%20Developability%20Prediction%20Competition%202025-08-28-v2.pdf"
|
|
|
|
|
|
|
| 45 |
|
| 46 |
# Input CSV file requirements
|
| 47 |
REQUIRED_COLUMNS: list[str] = [
|
|
|
|
| 4 |
|
| 5 |
import os
|
| 6 |
from huggingface_hub import HfApi
|
|
|
|
| 7 |
|
| 8 |
ASSAY_LIST = ["AC-SINS_pH7.4", "PR_CHO", "HIC", "Tm2", "Titer"]
|
| 9 |
ASSAY_RENAME = {
|
|
|
|
| 41 |
|
| 42 |
REGISTRATION_CODE = os.environ.get("REGISTRATION_CODE")
|
| 43 |
TERMS_URL = "https://euphsfcyogalqiqsawbo.supabase.co/storage/v1/object/public/gdpweb/pdfs/2025%20Ginkgo%20Antibody%20Developability%20Prediction%20Competition%202025-08-28-v2.pdf"
|
| 44 |
+
SLACK_URL = "https://join.slack.com/t/bitsinbio/shared_invite/zt-3dqigle2b-e0dEkfPPzzWL055j_8N_eQ"
|
| 45 |
+
TUTORIAL_URL = "https://huggingface.co/blog/ginkgo-datapoints/making-antibody-embeddings-and-predictions"
|
| 46 |
|
| 47 |
# Input CSV file requirements
|
| 48 |
REQUIRED_COLUMNS: list[str] = [
|
data/example-predictions-cv.csv
CHANGED
|
@@ -244,4 +244,4 @@ visilizumab,QVQLVQSGAEVKKPGASVKVSCKASGYTFISYTMHWVRQAPGQGLEWMGYINPRSGYTHYNQKLKDKA
|
|
| 244 |
xentuzumab,QVELVESGGGLVQPGGSLRLSCAASGFTFTSYWMSWVRQAPGKGLELVSSITSYGSFTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARNMYTHFDSWGQGTLVTVSS,DIVLTQPPSVSGAPGQRVTISCSGSSSNIGSNSVSWYQQLPGTAPKLLIYDNSKRPSGVPDRFSGSKSGTSASLAITGLQSEDEADYYCQSRDTYGYYWVFGGGTKLTVL,4,-0.6543
|
| 245 |
zalutumumab,QVQLVESGGGVVQPGRSLRLSCAASGFTFSTYGMHWVRQAPGKGLEWVAVIWDDGSYKYYGDSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDGITMVRGVMKDYFDYWGQGTLVTVSS,AIQLTQSPSSLSASVGDRVTITCRASQDISSALVWYQQKPGKAPKLLIYDASSLESGVPSRFSGSESGTDFTLTISSLQPEDFATYYCQQFNSYPLTFGGGTKVEIK,0,-0.5345
|
| 246 |
zanolimumab,QVQLQQWGAGLLKPSETLSLTCAVYGGSFSGYYWSWIRQPPGKGLEWIGEINHSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARVINWFDPWGQGTLVTVSS,DIQMTQSPSSVSASVGDRVTITCRASQDISSWLAWYQHKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQANSFPYTFGQGTKLEIK,3,-0.2446
|
| 247 |
-
zolbetuximab,QVQLQQPGAELVRPGASVKLSCKASGYTFTSYWINWVKQRPGQGLEWIGNIYPSDSYTNYNQKFKDKATLTVDKSSSTAYMQLSSPTSEDSAVYYCTRSWRGNSFDYWGQGTTLTVSS,DIVMTQSPSSLTVTAGEKVTMSCKSSQSLLNSGNQKNYLTWYQQKPGQPPKLLIYWASTRESGVPDRFTGSGSGTDFTLTISSVQAEDLAVYYCQNDYSYPFTFGSGTKLEIK,4,-0.3497
|
|
|
|
| 244 |
xentuzumab,QVELVESGGGLVQPGGSLRLSCAASGFTFTSYWMSWVRQAPGKGLELVSSITSYGSFTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARNMYTHFDSWGQGTLVTVSS,DIVLTQPPSVSGAPGQRVTISCSGSSSNIGSNSVSWYQQLPGTAPKLLIYDNSKRPSGVPDRFSGSKSGTSASLAITGLQSEDEADYYCQSRDTYGYYWVFGGGTKLTVL,4,-0.6543
|
| 245 |
zalutumumab,QVQLVESGGGVVQPGRSLRLSCAASGFTFSTYGMHWVRQAPGKGLEWVAVIWDDGSYKYYGDSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDGITMVRGVMKDYFDYWGQGTLVTVSS,AIQLTQSPSSLSASVGDRVTITCRASQDISSALVWYQQKPGKAPKLLIYDASSLESGVPSRFSGSESGTDFTLTISSLQPEDFATYYCQQFNSYPLTFGGGTKVEIK,0,-0.5345
|
| 246 |
zanolimumab,QVQLQQWGAGLLKPSETLSLTCAVYGGSFSGYYWSWIRQPPGKGLEWIGEINHSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARVINWFDPWGQGTLVTVSS,DIQMTQSPSSVSASVGDRVTITCRASQDISSWLAWYQHKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQANSFPYTFGQGTKLEIK,3,-0.2446
|
| 247 |
+
zolbetuximab,QVQLQQPGAELVRPGASVKLSCKASGYTFTSYWINWVKQRPGQGLEWIGNIYPSDSYTNYNQKFKDKATLTVDKSSSTAYMQLSSPTSEDSAVYYCTRSWRGNSFDYWGQGTTLTVSS,DIVMTQSPSSLTVTAGEKVTMSCKSSQSLLNSGNQKNYLTWYQQKPGQPPKLLIYWASTRESGVPDRFTGSGSGTDFTLTISSVQAEDLAVYYCQNDYSYPFTFGSGTKLEIK,4,-0.3497
|
data/example-predictions.csv
CHANGED
|
@@ -244,4 +244,4 @@ visilizumab,QVQLVQSGAEVKKPGASVKVSCKASGYTFISYTMHWVRQAPGQGLEWMGYINPRSGYTHYNQKLKDKA
|
|
| 244 |
xentuzumab,QVELVESGGGLVQPGGSLRLSCAASGFTFTSYWMSWVRQAPGKGLELVSSITSYGSFTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARNMYTHFDSWGQGTLVTVSS,DIVLTQPPSVSGAPGQRVTISCSGSSSNIGSNSVSWYQQLPGTAPKLLIYDNSKRPSGVPDRFSGSKSGTSASLAITGLQSEDEADYYCQSRDTYGYYWVFGGGTKLTVL,-0.6543
|
| 245 |
zalutumumab,QVQLVESGGGVVQPGRSLRLSCAASGFTFSTYGMHWVRQAPGKGLEWVAVIWDDGSYKYYGDSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDGITMVRGVMKDYFDYWGQGTLVTVSS,AIQLTQSPSSLSASVGDRVTITCRASQDISSALVWYQQKPGKAPKLLIYDASSLESGVPSRFSGSESGTDFTLTISSLQPEDFATYYCQQFNSYPLTFGGGTKVEIK,-0.5345
|
| 246 |
zanolimumab,QVQLQQWGAGLLKPSETLSLTCAVYGGSFSGYYWSWIRQPPGKGLEWIGEINHSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARVINWFDPWGQGTLVTVSS,DIQMTQSPSSVSASVGDRVTITCRASQDISSWLAWYQHKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQANSFPYTFGQGTKLEIK,-0.2446
|
| 247 |
-
zolbetuximab,QVQLQQPGAELVRPGASVKLSCKASGYTFTSYWINWVKQRPGQGLEWIGNIYPSDSYTNYNQKFKDKATLTVDKSSSTAYMQLSSPTSEDSAVYYCTRSWRGNSFDYWGQGTTLTVSS,DIVMTQSPSSLTVTAGEKVTMSCKSSQSLLNSGNQKNYLTWYQQKPGQPPKLLIYWASTRESGVPDRFTGSGSGTDFTLTISSVQAEDLAVYYCQNDYSYPFTFGSGTKLEIK,-0.3497
|
|
|
|
| 244 |
xentuzumab,QVELVESGGGLVQPGGSLRLSCAASGFTFTSYWMSWVRQAPGKGLELVSSITSYGSFTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARNMYTHFDSWGQGTLVTVSS,DIVLTQPPSVSGAPGQRVTISCSGSSSNIGSNSVSWYQQLPGTAPKLLIYDNSKRPSGVPDRFSGSKSGTSASLAITGLQSEDEADYYCQSRDTYGYYWVFGGGTKLTVL,-0.6543
|
| 245 |
zalutumumab,QVQLVESGGGVVQPGRSLRLSCAASGFTFSTYGMHWVRQAPGKGLEWVAVIWDDGSYKYYGDSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDGITMVRGVMKDYFDYWGQGTLVTVSS,AIQLTQSPSSLSASVGDRVTITCRASQDISSALVWYQQKPGKAPKLLIYDASSLESGVPSRFSGSESGTDFTLTISSLQPEDFATYYCQQFNSYPLTFGGGTKVEIK,-0.5345
|
| 246 |
zanolimumab,QVQLQQWGAGLLKPSETLSLTCAVYGGSFSGYYWSWIRQPPGKGLEWIGEINHSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARVINWFDPWGQGTLVTVSS,DIQMTQSPSSVSASVGDRVTITCRASQDISSWLAWYQHKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQANSFPYTFGQGTKLEIK,-0.2446
|
| 247 |
+
zolbetuximab,QVQLQQPGAELVRPGASVKLSCKASGYTFTSYWINWVKQRPGQGLEWIGNIYPSDSYTNYNQKFKDKATLTVDKSSSTAYMQLSSPTSEDSAVYYCTRSWRGNSFDYWGQGTTLTVSS,DIVMTQSPSSLTVTAGEKVTMSCKSSQSLLNSGNQKNYLTWYQQKPGQPPKLLIYWASTRESGVPDRFTGSGSGTDFTLTISSVQAEDLAVYYCQNDYSYPFTFGSGTKLEIK,-0.3497
|
evaluation.py
CHANGED
|
@@ -123,9 +123,7 @@ def evaluate(predictions_df, target_df, dataset_name="GDPa1"):
|
|
| 123 |
eg. my_model.csv has columns antibody_name, HIC, Tm2
|
| 124 |
Lood: Copied from Github repo, which I should move over here
|
| 125 |
"""
|
| 126 |
-
properties_in_preds = [
|
| 127 |
-
col for col in predictions_df.columns if col in ASSAY_LIST
|
| 128 |
-
]
|
| 129 |
df_merged = pd.merge(
|
| 130 |
target_df[["antibody_name", FOLD_COL] + ASSAY_LIST],
|
| 131 |
predictions_df[["antibody_name"] + properties_in_preds],
|
|
@@ -137,15 +135,11 @@ def evaluate(predictions_df, target_df, dataset_name="GDPa1"):
|
|
| 137 |
# Process each property one by one for better error handling
|
| 138 |
for assay_col in properties_in_preds:
|
| 139 |
try:
|
| 140 |
-
results = _get_result_for_assay(
|
| 141 |
-
df_merged, assay_col, dataset_name
|
| 142 |
-
)
|
| 143 |
results_list.append(results)
|
| 144 |
|
| 145 |
except Exception as e:
|
| 146 |
-
error_result = _get_error_result(
|
| 147 |
-
assay_col, dataset_name, e
|
| 148 |
-
)
|
| 149 |
results_list.append(error_result)
|
| 150 |
|
| 151 |
results_df = pd.DataFrame(results_list)
|
|
|
|
| 123 |
eg. my_model.csv has columns antibody_name, HIC, Tm2
|
| 124 |
Lood: Copied from Github repo, which I should move over here
|
| 125 |
"""
|
| 126 |
+
properties_in_preds = [col for col in predictions_df.columns if col in ASSAY_LIST]
|
|
|
|
|
|
|
| 127 |
df_merged = pd.merge(
|
| 128 |
target_df[["antibody_name", FOLD_COL] + ASSAY_LIST],
|
| 129 |
predictions_df[["antibody_name"] + properties_in_preds],
|
|
|
|
| 135 |
# Process each property one by one for better error handling
|
| 136 |
for assay_col in properties_in_preds:
|
| 137 |
try:
|
| 138 |
+
results = _get_result_for_assay(df_merged, assay_col, dataset_name)
|
|
|
|
|
|
|
| 139 |
results_list.append(results)
|
| 140 |
|
| 141 |
except Exception as e:
|
| 142 |
+
error_result = _get_error_result(assay_col, dataset_name, e)
|
|
|
|
|
|
|
| 143 |
results_list.append(error_result)
|
| 144 |
|
| 145 |
results_df = pd.DataFrame(results_list)
|
requirements.txt
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
gradio
|
| 2 |
datasets
|
|
|
|
| 3 |
huggingface_hub
|
| 4 |
gradio-leaderboard
|
| 5 |
gradio[oauth]
|
|
|
|
|
|
|
| 1 |
datasets
|
| 2 |
+
dotenv
|
| 3 |
huggingface_hub
|
| 4 |
gradio-leaderboard
|
| 5 |
gradio[oauth]
|
submit.py
CHANGED
|
@@ -68,7 +68,6 @@ def make_submission(
|
|
| 68 |
registration_code: str = "",
|
| 69 |
# profile: gr.OAuthProfile | None = None,
|
| 70 |
):
|
| 71 |
-
|
| 72 |
# if profile:
|
| 73 |
# user_state = profile.name
|
| 74 |
# user_state = user_state
|
|
@@ -98,8 +97,6 @@ def make_submission(
|
|
| 98 |
if path_obj.suffix.lower() != ".csv":
|
| 99 |
raise gr.Error("File must be a CSV file. Please upload a .csv file.")
|
| 100 |
|
| 101 |
-
|
| 102 |
-
|
| 103 |
upload_submission(
|
| 104 |
file_path=path_obj,
|
| 105 |
user_state=user_state,
|
|
|
|
| 68 |
registration_code: str = "",
|
| 69 |
# profile: gr.OAuthProfile | None = None,
|
| 70 |
):
|
|
|
|
| 71 |
# if profile:
|
| 72 |
# user_state = profile.name
|
| 73 |
# user_state = user_state
|
|
|
|
| 97 |
if path_obj.suffix.lower() != ".csv":
|
| 98 |
raise gr.Error("File must be a CSV file. Please upload a .csv file.")
|
| 99 |
|
|
|
|
|
|
|
| 100 |
upload_submission(
|
| 101 |
file_path=path_obj,
|
| 102 |
user_state=user_state,
|
utils.py
CHANGED
|
@@ -1,10 +1,18 @@
|
|
| 1 |
from datetime import datetime, timezone, timedelta
|
| 2 |
-
import pandas as pd
|
| 3 |
-
from datasets import load_dataset
|
| 4 |
-
import gradio as gr
|
| 5 |
import hashlib
|
|
|
|
| 6 |
from typing import Iterable, Union
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
pd.set_option("display.max_columns", None)
|
| 10 |
|
|
@@ -15,7 +23,11 @@ def get_time(tz_name="EST") -> str:
|
|
| 15 |
print("Invalid timezone, using EST")
|
| 16 |
tz_name = "EST"
|
| 17 |
offset = offsets[tz_name]
|
| 18 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
def show_output_box(message):
|
| 21 |
return gr.update(value=message, visible=True)
|
|
@@ -32,56 +44,183 @@ def fetch_hf_results():
|
|
| 32 |
RESULTS_REPO,
|
| 33 |
data_files="auto_submissions/metrics_all.csv",
|
| 34 |
)["train"].to_pandas()
|
| 35 |
-
print("fetched results from HF", df.shape)
|
| 36 |
assert all(
|
| 37 |
col in df.columns for col in LEADERBOARD_RESULTS_COLUMNS
|
| 38 |
), f"Expected columns {LEADERBOARD_RESULTS_COLUMNS} not found in {df.columns}. Missing columns: {set(LEADERBOARD_RESULTS_COLUMNS) - set(df.columns)}"
|
| 39 |
-
|
| 40 |
df_baseline = df[df["user"].isin(BASELINE_USERNAMES)]
|
| 41 |
df_non_baseline = df[~df["user"].isin(BASELINE_USERNAMES)]
|
| 42 |
# Show latest submission only
|
| 43 |
# For baselines: Keep unique model names
|
| 44 |
-
df_baseline = df_baseline.sort_values(
|
| 45 |
-
|
| 46 |
-
)
|
| 47 |
# For users: Just show latest submission
|
| 48 |
-
df_non_baseline = df_non_baseline.sort_values(
|
| 49 |
-
|
| 50 |
-
)
|
| 51 |
df = pd.concat([df_baseline, df_non_baseline], ignore_index=True)
|
| 52 |
df["property"] = df["assay"].map(ASSAY_RENAME)
|
| 53 |
-
|
| 54 |
# Rename baseline username to just "Baseline"
|
| 55 |
df.loc[df["user"].isin(BASELINE_USERNAMES), "user"] = "Baseline"
|
| 56 |
# Note: Could optionally add a column "is_baseline" to the dataframe to indicate whether the model is a baseline model or not. If things get crowded.
|
| 57 |
# Anonymize the user column at this point (so note: users can submit anonymous / non-anonymous and we'll show their latest submission regardless)
|
| 58 |
-
df.loc[df["anonymous"] != False, "user"] = "anon-" + df.loc[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
df.to_csv("debug-current-results.csv", index=False)
|
| 60 |
|
| 61 |
|
| 62 |
# Readable hashing function similar to coolname or codenamize
|
| 63 |
ADJECTIVES = [
|
| 64 |
-
"ancient",
|
| 65 |
-
"
|
| 66 |
-
"
|
| 67 |
-
"
|
| 68 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
]
|
| 70 |
|
| 71 |
ANIMALS = [
|
| 72 |
-
"ant",
|
| 73 |
-
"
|
| 74 |
-
"
|
| 75 |
-
"
|
| 76 |
-
"
|
| 77 |
-
"
|
| 78 |
-
"
|
| 79 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
]
|
| 81 |
NOUNS = [
|
| 82 |
-
"rock",
|
| 83 |
-
"
|
| 84 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
]
|
| 86 |
|
| 87 |
|
|
|
|
| 1 |
from datetime import datetime, timezone, timedelta
|
|
|
|
|
|
|
|
|
|
| 2 |
import hashlib
|
| 3 |
+
import os
|
| 4 |
from typing import Iterable, Union
|
| 5 |
+
|
| 6 |
+
from datasets import load_dataset
|
| 7 |
+
import gradio as gr
|
| 8 |
+
import pandas as pd
|
| 9 |
+
|
| 10 |
+
from constants import (
|
| 11 |
+
RESULTS_REPO,
|
| 12 |
+
ASSAY_RENAME,
|
| 13 |
+
LEADERBOARD_RESULTS_COLUMNS,
|
| 14 |
+
BASELINE_USERNAMES,
|
| 15 |
+
)
|
| 16 |
|
| 17 |
pd.set_option("display.max_columns", None)
|
| 18 |
|
|
|
|
| 23 |
print("Invalid timezone, using EST")
|
| 24 |
tz_name = "EST"
|
| 25 |
offset = offsets[tz_name]
|
| 26 |
+
return (
|
| 27 |
+
datetime.now(timezone(timedelta(hours=offset))).strftime("%Y-%m-%d %H:%M:%S")
|
| 28 |
+
+ f" ({tz_name})"
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
|
| 32 |
def show_output_box(message):
|
| 33 |
return gr.update(value=message, visible=True)
|
|
|
|
| 44 |
RESULTS_REPO,
|
| 45 |
data_files="auto_submissions/metrics_all.csv",
|
| 46 |
)["train"].to_pandas()
|
|
|
|
| 47 |
assert all(
|
| 48 |
col in df.columns for col in LEADERBOARD_RESULTS_COLUMNS
|
| 49 |
), f"Expected columns {LEADERBOARD_RESULTS_COLUMNS} not found in {df.columns}. Missing columns: {set(LEADERBOARD_RESULTS_COLUMNS) - set(df.columns)}"
|
| 50 |
+
|
| 51 |
df_baseline = df[df["user"].isin(BASELINE_USERNAMES)]
|
| 52 |
df_non_baseline = df[~df["user"].isin(BASELINE_USERNAMES)]
|
| 53 |
# Show latest submission only
|
| 54 |
# For baselines: Keep unique model names
|
| 55 |
+
df_baseline = df_baseline.sort_values(
|
| 56 |
+
"submission_time", ascending=False
|
| 57 |
+
).drop_duplicates(subset=["model", "assay", "dataset", "user"], keep="first")
|
| 58 |
# For users: Just show latest submission
|
| 59 |
+
df_non_baseline = df_non_baseline.sort_values(
|
| 60 |
+
"submission_time", ascending=False
|
| 61 |
+
).drop_duplicates(subset=["assay", "dataset", "user"], keep="first")
|
| 62 |
df = pd.concat([df_baseline, df_non_baseline], ignore_index=True)
|
| 63 |
df["property"] = df["assay"].map(ASSAY_RENAME)
|
| 64 |
+
|
| 65 |
# Rename baseline username to just "Baseline"
|
| 66 |
df.loc[df["user"].isin(BASELINE_USERNAMES), "user"] = "Baseline"
|
| 67 |
# Note: Could optionally add a column "is_baseline" to the dataframe to indicate whether the model is a baseline model or not. If things get crowded.
|
| 68 |
# Anonymize the user column at this point (so note: users can submit anonymous / non-anonymous and we'll show their latest submission regardless)
|
| 69 |
+
df.loc[df["anonymous"] != False, "user"] = "anon-" + df.loc[
|
| 70 |
+
df["anonymous"] != False, "user"
|
| 71 |
+
].apply(readable_hash)
|
| 72 |
+
|
| 73 |
+
# Compare to previous dataframe
|
| 74 |
+
if os.path.exists("debug-current-results.csv"):
|
| 75 |
+
old_df = pd.read_csv("debug-current-results.csv")
|
| 76 |
+
else:
|
| 77 |
+
old_df = df
|
| 78 |
+
if len(df) != len(old_df):
|
| 79 |
+
print(f"New results: Length {len(old_df)} -> {len(df)} ({get_time()})")
|
| 80 |
+
|
| 81 |
df.to_csv("debug-current-results.csv", index=False)
|
| 82 |
|
| 83 |
|
| 84 |
# Readable hashing function similar to coolname or codenamize
|
| 85 |
ADJECTIVES = [
|
| 86 |
+
"ancient",
|
| 87 |
+
"brave",
|
| 88 |
+
"calm",
|
| 89 |
+
"clever",
|
| 90 |
+
"crimson",
|
| 91 |
+
"curious",
|
| 92 |
+
"dapper",
|
| 93 |
+
"eager",
|
| 94 |
+
"fuzzy",
|
| 95 |
+
"gentle",
|
| 96 |
+
"glowing",
|
| 97 |
+
"golden",
|
| 98 |
+
"happy",
|
| 99 |
+
"icy",
|
| 100 |
+
"jolly",
|
| 101 |
+
"lucky",
|
| 102 |
+
"magical",
|
| 103 |
+
"mellow",
|
| 104 |
+
"nimble",
|
| 105 |
+
"peachy",
|
| 106 |
+
"quick",
|
| 107 |
+
"royal",
|
| 108 |
+
"shiny",
|
| 109 |
+
"silent",
|
| 110 |
+
"sly",
|
| 111 |
+
"sparkly",
|
| 112 |
+
"spicy",
|
| 113 |
+
"spry",
|
| 114 |
+
"sturdy",
|
| 115 |
+
"sunny",
|
| 116 |
+
"swift",
|
| 117 |
+
"tiny",
|
| 118 |
+
"vivid",
|
| 119 |
+
"witty",
|
| 120 |
]
|
| 121 |
|
| 122 |
ANIMALS = [
|
| 123 |
+
"ant",
|
| 124 |
+
"bat",
|
| 125 |
+
"bear",
|
| 126 |
+
"bee",
|
| 127 |
+
"bison",
|
| 128 |
+
"boar",
|
| 129 |
+
"bug",
|
| 130 |
+
"cat",
|
| 131 |
+
"crab",
|
| 132 |
+
"crow",
|
| 133 |
+
"deer",
|
| 134 |
+
"dog",
|
| 135 |
+
"duck",
|
| 136 |
+
"eel",
|
| 137 |
+
"elk",
|
| 138 |
+
"fox",
|
| 139 |
+
"frog",
|
| 140 |
+
"goat",
|
| 141 |
+
"gull",
|
| 142 |
+
"hare",
|
| 143 |
+
"hawk",
|
| 144 |
+
"hen",
|
| 145 |
+
"horse",
|
| 146 |
+
"ibis",
|
| 147 |
+
"kid",
|
| 148 |
+
"kiwi",
|
| 149 |
+
"koala",
|
| 150 |
+
"lamb",
|
| 151 |
+
"lark",
|
| 152 |
+
"lemur",
|
| 153 |
+
"lion",
|
| 154 |
+
"llama",
|
| 155 |
+
"loon",
|
| 156 |
+
"lynx",
|
| 157 |
+
"mole",
|
| 158 |
+
"moose",
|
| 159 |
+
"mouse",
|
| 160 |
+
"newt",
|
| 161 |
+
"otter",
|
| 162 |
+
"owl",
|
| 163 |
+
"ox",
|
| 164 |
+
"panda",
|
| 165 |
+
"pig",
|
| 166 |
+
"prawn",
|
| 167 |
+
"puma",
|
| 168 |
+
"quail",
|
| 169 |
+
"quokka",
|
| 170 |
+
"rabbit",
|
| 171 |
+
"rat",
|
| 172 |
+
"ray",
|
| 173 |
+
"robin",
|
| 174 |
+
"seal",
|
| 175 |
+
"shark",
|
| 176 |
+
"sheep",
|
| 177 |
+
"shrew",
|
| 178 |
+
"skunk",
|
| 179 |
+
"slug",
|
| 180 |
+
"snail",
|
| 181 |
+
"snake",
|
| 182 |
+
"swan",
|
| 183 |
+
"toad",
|
| 184 |
+
"trout",
|
| 185 |
+
"turtle",
|
| 186 |
+
"vole",
|
| 187 |
+
"walrus",
|
| 188 |
+
"wasp",
|
| 189 |
+
"whale",
|
| 190 |
+
"wolf",
|
| 191 |
+
"worm",
|
| 192 |
+
"yak",
|
| 193 |
+
"zebra",
|
| 194 |
]
|
| 195 |
NOUNS = [
|
| 196 |
+
"rock",
|
| 197 |
+
"sand",
|
| 198 |
+
"star",
|
| 199 |
+
"tree",
|
| 200 |
+
"leaf",
|
| 201 |
+
"seed",
|
| 202 |
+
"stone",
|
| 203 |
+
"cloud",
|
| 204 |
+
"rain",
|
| 205 |
+
"snow",
|
| 206 |
+
"wind",
|
| 207 |
+
"fire",
|
| 208 |
+
"ash",
|
| 209 |
+
"dirt",
|
| 210 |
+
"mud",
|
| 211 |
+
"ice",
|
| 212 |
+
"wave",
|
| 213 |
+
"shell",
|
| 214 |
+
"dust",
|
| 215 |
+
"sun",
|
| 216 |
+
"moon",
|
| 217 |
+
"hill",
|
| 218 |
+
"lake",
|
| 219 |
+
"pond",
|
| 220 |
+
"reef",
|
| 221 |
+
"root",
|
| 222 |
+
"twig",
|
| 223 |
+
"wood",
|
| 224 |
]
|
| 225 |
|
| 226 |
|
validation.py
CHANGED
|
@@ -138,7 +138,6 @@ def validate_cv_submission(
|
|
| 138 |
raise gr.Error(
|
| 139 |
f"❌ Fold assignments don't match canonical CV folds: {'; '.join(examples)}"
|
| 140 |
)
|
| 141 |
-
|
| 142 |
|
| 143 |
|
| 144 |
def validate_full_dataset_submission(df: pd.DataFrame) -> None:
|
|
@@ -204,7 +203,7 @@ def validate_dataframe(df: pd.DataFrame, submission_type: str = "GDPa1") -> None
|
|
| 204 |
raise gr.Error(
|
| 205 |
f"❌ CSV should have only one row per antibody. Found {n_duplicates} duplicates."
|
| 206 |
)
|
| 207 |
-
|
| 208 |
example_df = pd.read_csv(EXAMPLE_FILE_DICT[submission_type])
|
| 209 |
# All antibody names should be recognizable
|
| 210 |
unrecognized_antibodies = set(df["antibody_name"]) - set(
|
|
@@ -229,11 +228,13 @@ def validate_dataframe(df: pd.DataFrame, submission_type: str = "GDPa1") -> None
|
|
| 229 |
validate_cv_submission(df, submission_type)
|
| 230 |
else: # full_dataset
|
| 231 |
validate_full_dataset_submission(df)
|
| 232 |
-
|
| 233 |
# Check Spearman correlations on public set
|
| 234 |
df_gdpa1 = pd.read_csv(GDPa1_path)
|
| 235 |
if submission_type in ["GDPa1", "GDPa1_cross_validation"]:
|
| 236 |
-
results_df = evaluate(
|
|
|
|
|
|
|
| 237 |
# Check that the Spearman correlations are not too high
|
| 238 |
if results_df["spearman"].max() > 0.9:
|
| 239 |
raise gr.Error(
|
|
|
|
| 138 |
raise gr.Error(
|
| 139 |
f"❌ Fold assignments don't match canonical CV folds: {'; '.join(examples)}"
|
| 140 |
)
|
|
|
|
| 141 |
|
| 142 |
|
| 143 |
def validate_full_dataset_submission(df: pd.DataFrame) -> None:
|
|
|
|
| 203 |
raise gr.Error(
|
| 204 |
f"❌ CSV should have only one row per antibody. Found {n_duplicates} duplicates."
|
| 205 |
)
|
| 206 |
+
|
| 207 |
example_df = pd.read_csv(EXAMPLE_FILE_DICT[submission_type])
|
| 208 |
# All antibody names should be recognizable
|
| 209 |
unrecognized_antibodies = set(df["antibody_name"]) - set(
|
|
|
|
| 228 |
validate_cv_submission(df, submission_type)
|
| 229 |
else: # full_dataset
|
| 230 |
validate_full_dataset_submission(df)
|
| 231 |
+
|
| 232 |
# Check Spearman correlations on public set
|
| 233 |
df_gdpa1 = pd.read_csv(GDPa1_path)
|
| 234 |
if submission_type in ["GDPa1", "GDPa1_cross_validation"]:
|
| 235 |
+
results_df = evaluate(
|
| 236 |
+
predictions_df=df, target_df=df_gdpa1, dataset_name=submission_type
|
| 237 |
+
)
|
| 238 |
# Check that the Spearman correlations are not too high
|
| 239 |
if results_df["spearman"].max() > 0.9:
|
| 240 |
raise gr.Error(
|