|
|
import pandas as pd |
|
|
import io |
|
|
import gradio as gr |
|
|
from constants import ( |
|
|
REQUIRED_COLUMNS, |
|
|
MINIMAL_NUMBER_OF_ROWS, |
|
|
ANTIBODY_NAMES, |
|
|
ASSAY_LIST, |
|
|
) |
|
|
|
|
|
|
|
|
def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
    """
    Validate that the CSV file can be read and parsed.

    The text is wrapped in an in-memory buffer and handed to
    ``pd.read_csv``. Any failure is converted into a ``gr.Error`` carrying a
    user-facing message; the original exception is attached as the cause so
    it remains visible in tracebacks and logs.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.

    Returns
    -------
    pd.DataFrame
        The parsed DataFrame if successful.

    Raises
    ------
    gr.Error: If CSV cannot be read or parsed
    """
    try:
        return pd.read_csv(io.StringIO(file_content))
    except pd.errors.EmptyDataError as e:
        raise gr.Error("β CSV file is empty or contains no valid data") from e
    except pd.errors.ParserError as e:
        raise gr.Error(f"β Invalid CSV format<br><br>Error: {e}") from e
    except UnicodeDecodeError as e:
        raise gr.Error(
            "β File encoding error<br><br>"
            "Your file appears to have an unsupported encoding.<br>"
            "Please save your CSV file with UTF-8 encoding and try again."
        ) from e
    except Exception as e:
        # Last-resort handler: anything else is still reported as a
        # validation error instead of escaping as a raw exception.
        raise gr.Error(f"β Unexpected error reading CSV file: {e}") from e
|
|
|
|
|
|
|
|
def validate_dataframe(df: pd.DataFrame) -> None:
    """
    Validate the DataFrame content and structure.

    Checks run in the order below and stop at the first failure:

    1. Every column in ``REQUIRED_COLUMNS`` is present.
    2. At least one column named in ``ASSAY_LIST`` is present.
    3. The DataFrame is not empty.
    4. No required or assay column contains missing values.
    5. There are at least ``MINIMAL_NUMBER_OF_ROWS`` rows.
    6. No value in ``antibody_name`` is duplicated.
    7. Every ``antibody_name`` value appears in ``ANTIBODY_NAMES``.

    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame to validate.

    Raises
    ------
    gr.Error: If validation fails
    """
    missing_columns = set(REQUIRED_COLUMNS) - set(df.columns)
    if missing_columns:
        # Sort (as strings) so the message is deterministic across runs.
        missing_text = ", ".join(sorted(map(str, missing_columns)))
        raise gr.Error(f"β Missing required columns: {missing_text}")

    assay_columns = [col for col in df.columns if col in ASSAY_LIST]
    if not assay_columns:
        raise gr.Error(
            "β CSV should include at least one of the following assay columns: "
            + ", ".join(ASSAY_LIST)
        )

    # Unpack instead of using `+` so this works whether REQUIRED_COLUMNS is a
    # list or a tuple.
    submission_columns = [*REQUIRED_COLUMNS, *assay_columns]

    if df.empty:
        raise gr.Error("β CSV file is empty")

    for col in submission_columns:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            raise gr.Error(f"β Column '{col}' contains {missing_count} missing values")

    if len(df) < MINIMAL_NUMBER_OF_ROWS:
        raise gr.Error(f"β CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows")

    n_duplicates = df["antibody_name"].duplicated().sum()
    if n_duplicates > 0:
        raise gr.Error(
            f"β CSV should have only one row per antibody. Found {n_duplicates} duplicates."
        )

    unrecognized_antibodies = set(df["antibody_name"]) - set(ANTIBODY_NAMES)
    if unrecognized_antibodies:
        # Names may have been parsed as non-string values (e.g. a purely
        # numeric column); convert to str so join() cannot raise TypeError,
        # and sort for a stable message.
        unrecognized_text = ", ".join(sorted(map(str, unrecognized_antibodies)))
        raise gr.Error(f"β Found unrecognized antibody names: {unrecognized_text}")
|
|
|
|
|
|
|
|
def validate_csv_file(file_content: str) -> None:
    """
    Run the full validation pipeline on an uploaded CSV file.

    The content is first parsed with ``validate_csv_can_be_read`` and the
    resulting DataFrame is then passed to ``validate_dataframe``.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.

    Raises
    ------
    gr.Error: If validation fails
    """
    validate_dataframe(validate_csv_can_be_read(file_content))
|
|
|