from pathlib import Path
from datetime import datetime, timezone
import json
import tempfile
from typing import BinaryIO

import pandas as pd
import gradio as gr
from gradio_leaderboard import Leaderboard
from datasets import load_dataset

from about import ASSAY_LIST, ASSAY_RENAME, ASSAY_EMOJIS, submissions_repo, API, results_repo


def make_submission(
submitted_file: BinaryIO,
user_state):
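    """Store a user's uploaded submission in the submissions dataset on the Hub.

    The raw file content and metadata are wrapped in a JSON record with
    `evaluated=False`; a scoring backend is expected to pick the record up later.
    """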
if user_state is None:
raise gr.Error("You must submit your username to submit a file.")
file_path = submitted_file.name
if not file_path:
raise gr.Error("Uploaded file object does not have a valid file path.")
path_obj = Path(file_path)
    timestamp = datetime.now(timezone.utc).isoformat()
    with path_obj.open("rb") as f_in:
        file_content = f_in.read().decode("utf-8")
# write to dataset
filename = f"{user_state}/{timestamp.replace(':', '-')}_{user_state}.json"
record = {
"submission_filename": filename,
"submission_time": timestamp,
"csv_content": file_content,
"evaluated": False,
"user": user_state,
}
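    # Serialize the record to a temporary file so it can be uploaded by path;
    # delete=False keeps the file alive past the context manager until the upload finishes.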
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp:
json.dump(record, tmp, indent=2)
tmp.flush()
tmp_name = tmp.name
API.upload_file(
path_or_fileobj=tmp_name,
path_in_repo=filename,
repo_id=submissions_repo,
repo_type="dataset",
commit_message=f"Add submission for {user_state} at {timestamp}"
)
Path(tmp_name).unlink()
return "✅ Your submission has been received! Sit tight and your scores will appear on the leaderboard shortly."
def get_leaderboard_table(df_results: pd.DataFrame, assay: str | None = None):
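    """Build the leaderboard dataframe, optionally filtered to a single assay."""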
# ds = load_dataset(results_repo, split='train', download_mode="force_redownload")
# full_df = pd.DataFrame(ds)
# full_df['full results'] = full_df['result_filename'].apply(lambda x: make_boundary_clickable(x)).astype(str)
# full_df.rename(columns={'submission_time': 'submission time', 'problem_type': 'problem type'}, inplace=True)
# to_show = full_df.copy(deep=True)
# to_show = to_show[to_show['user'] != 'test']
# to_show = to_show[['submission time', 'problem type', 'user', 'score', 'full results']]
# to_show['user'] = to_show['user'].apply(lambda x: make_user_clickable(x)).astype(str)
# Previously hosted on HF hub, local for now (Can also pull directly from github backend)
    column_order = ["model", "property", "spearman", "spearman_abs"]
    # Keep only assays that have a display name defined.
    df = df_results[df_results["assay"].isin(ASSAY_RENAME)].copy()
    if assay is not None:
        df = df[df["assay"] == assay]
    df = df[column_order]
    return df.sort_values(by="spearman_abs", ascending=False)
def get_leaderboard_object(df_results: pd.DataFrame, assay: str | None = None):
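    """Render a Leaderboard component for the results, per-assay or overall."""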
df = get_leaderboard_table(df_results=df_results, assay=assay)
filter_columns = ["model"]
if assay is None:
filter_columns.append("property")
# TODO how to sort filter columns alphabetically?
Leaderboard(
value=df,
        datatype=["str", "str", "number", "number"],
select_columns=["model", "property", "spearman"],
search_columns=["model"],
filter_columns=filter_columns,
every=60,
render=True
)
def show_output_box(message):
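    """Reveal the hidden status textbox and fill it with `message`."""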
return gr.update(value=message, visible=True)
def fetch_hf_results():
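    """Download the latest results from the Hub (bypassing the local cache)
    and keep one row per (model, assay) pair."""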
ds = load_dataset(results_repo, split='no_low_spearman', download_mode="force_redownload")
df = pd.DataFrame(ds).drop_duplicates(subset=["model", "assay"])
df["property"] = df["assay"].map(ASSAY_RENAME)
return df
with gr.Blocks() as demo:
gr.Markdown("""
## Welcome to the Ginkgo Antibody Developability Benchmark Leaderboard!
Participants can submit their model to the leaderboard by
""")
df = fetch_hf_results()
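    # Results are loaded once at app startup (see the refresh TODO on the Overall tab).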
with gr.Tabs(elem_classes="tab-buttons"):
        # Procedurally create one leaderboard tab per assay
for assay in ASSAY_LIST:
with gr.TabItem(f"{ASSAY_EMOJIS[assay]} {ASSAY_RENAME[assay]}", elem_id=f"abdev-benchmark-tab-table"):
gr.Markdown(f"# {ASSAY_RENAME[assay]} (measured by {assay})")
get_leaderboard_object(df_results=df, assay=assay)
with gr.TabItem("🚀 Overall", elem_id="abdev-benchmark-tab-table"):
gr.Markdown("# Antibody Developability Benchmark Leaderboard over all properties")
get_leaderboard_object(df_results=df)
# TODO: this is not going to update well, need to fix
with gr.TabItem("❔About", elem_id="abdev-benchmark-tab-table"):
gr.Markdown(
"""
            ## About this challenge
            We're inviting the ML/bio community to predict developability properties for 244 antibodies from the [GDPa1 dataset](https://huggingface.co/datasets/ginkgo-datapoints/GDPa1).
            **What is antibody developability?**
            Antibodies have to be manufacturable, stable at high concentrations, and low in off-target effects.
            Shortcomings in properties like these often hinder an antibody's progression to the clinic; collectively, these properties are referred to as 'developability'.
            Here we show 5 of these properties and invite the community to develop and submit better predictors, which will be evaluated on a held-out private set to assess model generalization.
**How to submit?**
TODO
**How to evaluate?**
TODO
"""
)
with gr.TabItem("✉️ Submit", elem_id="boundary-benchmark-tab-table"):
gr.Markdown(
"""
# Antibody Developability Submission
Upload a CSV to get a score!
"""
)
            user_state = gr.State(value=None)
# gr.LoginButton()
with gr.Row():
with gr.Column():
username_input = gr.Textbox(
label="Username",
placeholder="Enter your Hugging Face username",
info="This will be displayed on the leaderboard."
)
with gr.Column():
                    submission_file = gr.File(label="Submission CSV")
            username_input.change(
                fn=lambda x: (x or "").strip() or None,
                inputs=username_input,
                outputs=user_state
            )
submit_btn = gr.Button("Evaluate")
message = gr.Textbox(label="Status", lines=1, visible=False)
# help message
gr.Markdown("If you have issues with submission or using the leaderboard, please start a discussion in the Community tab of this Space.")
            submit_btn.click(
                make_submission,
                inputs=[submission_file, user_state],
                outputs=[message],
).then(
fn=show_output_box,
inputs=[message],
outputs=[message],
)
if __name__ == "__main__":
demo.launch()