# abdev-leaderboard / validation.py
# loodvanniekerkginkgo's picture
# Double button working (it seems). Just need to adjust language on the
# 672339b
# raw
# history blame
# 9.35 kB
import pandas as pd
import io
import gradio as gr
import requests
from constants import (
REQUIRED_COLUMNS,
ASSAY_LIST,
CV_COLUMN,
SEQUENCES_FILE_DICT,
GDPa1_path,
)
from evaluation import evaluate
def validate_username(username: str) -> bool:
    """
    Validate that the username corresponds to a real Hugging Face profile.

    Performs an HTTP GET on https://huggingface.co/<username> and applies a
    lightweight content heuristic to distinguish user profile pages from
    other pages that happen to resolve.

    Parameters
    ----------
    username: str
        The username to validate (leading/trailing whitespace is ignored).

    Returns
    -------
    bool
        True if the username is valid and the profile exists. Note: this
        function never returns False -- every failure path raises gr.Error.

    Raises
    ------
    gr.Error: If the username is empty, the profile doesn't exist, the page
        doesn't look like a user profile, or the HTTP request fails.
    """
    username = username.strip()
    if not username:
        raise gr.Error("❌ Please provide a username.")

    # Check if the Hugging Face profile exists. Keep the try block narrow:
    # only the network call can raise a requests exception, and gr.Error
    # raises below must not be routed through these handlers.
    profile_url = f"https://huggingface.co/{username}"
    try:
        response = requests.get(profile_url, timeout=10)
    except requests.exceptions.Timeout:
        raise gr.Error("❌ Timeout while checking username. Please try again.")
    except requests.exceptions.ConnectionError:
        raise gr.Error(
            "❌ Unable to connect to Hugging Face. Please check your internet connection."
        )
    except requests.exceptions.RequestException as e:
        raise gr.Error(f"❌ Error validating username: {str(e)}")

    if response.status_code == 200:
        # Additional check: make sure it's actually a user profile page
        # and not some other page that happens to exist
        if "profile" in response.text.lower() or "models" in response.text.lower():
            return True
        raise gr.Error(
            f"❌ '{username}' does not appear to be a valid Hugging Face user profile"
        )
    if response.status_code == 404:
        raise gr.Error(
            f"❌ Hugging Face user '{username}' does not exist. Please check the username or create an account at https://huggingface.co. This is used to track unique submissions."
        )
    raise gr.Error(
        f"❌ Unable to verify username '{username}'. Please try again later."
    )
def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
    """
    Validate that the CSV file can be read and parsed.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.

    Returns
    -------
    pd.DataFrame
        The parsed DataFrame if successful.

    Raises
    ------
    gr.Error: If the CSV is empty, malformed, badly encoded, or otherwise
        unreadable.
    """
    try:
        # Parse the raw upload content in-memory.
        df = pd.read_csv(io.StringIO(file_content))
    except pd.errors.EmptyDataError:
        raise gr.Error("❌ CSV file is empty or contains no valid data")
    except pd.errors.ParserError as e:
        raise gr.Error(f"❌ Invalid CSV format<br><br>Error: {str(e)}")
    except UnicodeDecodeError:
        raise gr.Error(
            "❌ File encoding error<br><br>"
            "Your file appears to have an unsupported encoding.<br>"
            "Please save your CSV file with UTF-8 encoding and try again."
        )
    except Exception as e:
        raise gr.Error(f"❌ Unexpected error reading CSV file: {str(e)}")
    # Success path lives outside the try so unrelated errors can never be
    # masked by the broad Exception handler above.
    return df
def validate_cv_submission(
    df: pd.DataFrame, submission_type: str = "GDPa1_cross_validation"
) -> None:
    """Validate a cross-validation submission against the canonical folds."""
    # A CV submission is required to declare its fold assignments.
    if CV_COLUMN not in df.columns:
        raise gr.Error(f"❌ CV submissions must include a '{CV_COLUMN}' column")

    # Canonical antibody -> fold mapping for this submission type.
    canonical = pd.read_csv(SEQUENCES_FILE_DICT[submission_type])[
        ["antibody_name", CV_COLUMN]
    ]
    merged = canonical.merge(
        df[["antibody_name", CV_COLUMN]],
        on="antibody_name",
        how="left",
        suffixes=("_expected", "_submitted"),
    )

    # Every submitted fold must agree with the canonical assignment.
    expected_col = f"{CV_COLUMN}_expected"
    submitted_col = f"{CV_COLUMN}_submitted"
    mismatched = merged[merged[expected_col] != merged[submitted_col]]
    if not mismatched.empty:
        # Show at most three offending rows to keep the error readable.
        examples = [
            f"{row['antibody_name']} (expected fold {row[expected_col]}, got {row[submitted_col]})"
            for _, row in mismatched.head(3).iterrows()
        ]
        raise gr.Error(
            f"❌ Fold assignments don't match canonical CV folds: {'; '.join(examples)}"
        )
def validate_full_dataset_submission(df: pd.DataFrame) -> None:
    """Validate full dataset submission"""
    # A fold column is a strong signal the user meant to pick the CV track.
    if CV_COLUMN not in df.columns:
        return
    raise gr.Error(
        f"❌ Your submission contains a '{CV_COLUMN}' column. "
        "Please select 'Cross-Validation Predictions' if you want to submit CV results."
    )
def get_assay_columns(df: pd.DataFrame) -> list[str]:
    """Return the DataFrame's columns that are recognized assay names."""
    known_assays = set(ASSAY_LIST)
    return [column for column in df.columns if column in known_assays]
def validate_dataframe(df: pd.DataFrame, submission_type: str = "GDPa1") -> None:
    """
    Validate the DataFrame content and structure.

    Checks, in order: required columns, presence of at least one assay
    column, non-empty data, missing values, duplicate/unknown/missing
    antibody names, submission-type specific rules, and finally an
    anti-leakage check on public-set correlations.

    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame to validate.
    submission_type: str
        Type of submission: "GDPa1" or "GDPa1_cross_validation"

    Raises
    ------
    ValueError: If submission_type is not a known key (programming error,
        not a user-facing one).
    gr.Error: If validation fails.
    """
    if submission_type not in SEQUENCES_FILE_DICT:
        raise ValueError(f"Invalid submission type: {submission_type}")
    # Required columns should be present
    missing_columns = set(REQUIRED_COLUMNS) - set(df.columns)
    if missing_columns:
        raise gr.Error(f"❌ Missing required columns: {', '.join(missing_columns)}")
    # Should include at least 1 assay column
    assay_columns = get_assay_columns(df)
    if len(assay_columns) < 1:
        raise gr.Error(
            f"❌ CSV should include at least one of the following assay columns: {', '.join(ASSAY_LIST)}. Found columns: {', '.join(df.columns)}"
        )
    # Submission columns are name, sequence, and at least one assay column
    submission_columns = REQUIRED_COLUMNS + assay_columns
    # Data should not be empty
    if df.empty:
        raise gr.Error("❌ CSV file is empty")
    # No missing values in submission columns
    for col in submission_columns:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            raise gr.Error(f"❌ Column '{col}' contains {missing_count} missing values")
    # All names should be unique
    n_duplicates = df["antibody_name"].duplicated().sum()
    if n_duplicates > 0:
        raise gr.Error(
            f"❌ CSV should have only one row per antibody. Found {n_duplicates} duplicates."
        )
    # Compare submitted antibody names against the canonical set for this
    # submission type.
    example_df = pd.read_csv(SEQUENCES_FILE_DICT[submission_type])
    # All antibody names should be recognizable
    unrecognized_antibodies = set(df["antibody_name"]) - set(
        example_df["antibody_name"].tolist()
    )
    if unrecognized_antibodies:
        raise gr.Error(
            f"❌ Found unrecognized antibody names: {', '.join(unrecognized_antibodies)}"
        )
    # All antibody names should be present
    # Note(Lood): Technically we could check that the antibodies are present just for the property that needs to be predicted
    missing_antibodies = set(example_df["antibody_name"].tolist()) - set(
        df["antibody_name"]
    )
    if missing_antibodies:
        raise gr.Error(
            f"❌ Missing predictions for {len(missing_antibodies)} antibodies: {', '.join(missing_antibodies)}"
        )
    # Submission-type specific validation
    if submission_type.endswith("_cross_validation"):
        validate_cv_submission(df, submission_type)
    else:  # full_dataset
        validate_full_dataset_submission(df)
    # Check Spearman correlations on public set
    df_gdpa1 = pd.read_csv(GDPa1_path)
    if submission_type in ["GDPa1", "GDPa1_cross_validation"]:
        results_df = evaluate(
            predictions_df=df, target_df=df_gdpa1, dataset_name=submission_type
        )
        # Check that the Spearman correlations are not too high: values
        # above 0.9 on the public set suggest leakage/overfitting.
        if results_df["spearman"].max() > 0.9:
            raise gr.Error(
                message="⚠️ Your submission shows abnormally high correlations (>0.9) on the public set. "
                "Please check that you're not overfitting/don't have data leakage on the public set and are using cross-validation if training a new model.\n"
                "This will result in a better model for eventually submitting to the heldout test set.\n"
                "If you think this is a mistake, please contact [email protected].",
                duration=30,
                title="Data Leakage Warning",
            )
def validate_csv_file(file_content: str, submission_type: str = "GDPa1") -> None:
    """
    Validate the uploaded CSV file.

    First parses the raw content into a DataFrame, then runs the full
    structural and content validation on it.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.
    submission_type: str
        Type of submission: "GDPa1" or "GDPa1_cross_validation"

    Raises
    ------
    gr.Error: If validation fails
    """
    parsed = validate_csv_can_be_read(file_content)
    validate_dataframe(parsed, submission_type)