# abdev-leaderboard / validation.py
# pquintero's picture
# validate dataframe with tests
# eb50e2e
# raw
# history blame
# 2.73 kB
import io
from typing import Union

import gradio as gr
import pandas as pd

from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS
def validate_csv_can_be_read(file_content: Union[str, bytes]) -> pd.DataFrame:
    """
    Validate that the CSV file can be read and parsed.

    Parameters
    ----------
    file_content: str or bytes
        The content of the uploaded CSV file. Raw bytes are decoded as
        UTF-8 (a leading byte-order mark is dropped) before parsing.

    Returns
    -------
    pd.DataFrame
        The parsed DataFrame if successful.

    Raises
    ------
    gr.Error: If CSV cannot be read or parsed
    """
    try:
        if isinstance(file_content, bytes):
            # Text input never triggers a decode, so this is the only place
            # the UnicodeDecodeError handler below can actually fire.
            file_content = file_content.decode("utf-8-sig")
        df = pd.read_csv(io.StringIO(file_content))
    except pd.errors.EmptyDataError as e:
        raise gr.Error(
            "❌ CSV file is empty or contains no valid data"
        ) from e
    except pd.errors.ParserError as e:
        # "<br>" is kept as-is: the message is written as HTML for the UI.
        raise gr.Error(
            "❌ Invalid CSV format<br><br>"
            f"Error: {e}"
        ) from e
    except UnicodeDecodeError as e:
        raise gr.Error(
            "❌ File encoding error<br><br>"
            "Your file appears to have an unsupported encoding.<br>"
            "Please save your CSV file with UTF-8 encoding and try again."
        ) from e
    return df
def validate_dataframe(df: pd.DataFrame) -> None:
    """
    Validate the DataFrame content and structure.

    Checks run in this order and stop at the first failure: required
    columns present, DataFrame not empty, no missing values in any
    required column, and at least MINIMAL_NUMBER_OF_ROWS rows. On success
    a summary line is printed.

    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame to validate.

    Raises
    ------
    gr.Error: If validation fails
    """
    # Required columns should be present. The names are sorted because a set
    # difference has no stable order, which made the message vary per run.
    missing_columns = sorted(
        str(col) for col in set(REQUIRED_COLUMNS) - set(df.columns)
    )
    if missing_columns:
        raise gr.Error(
            f"❌ Missing required columns: {', '.join(missing_columns)}"
        )

    # Data should not be empty
    if df.empty:
        raise gr.Error(
            "❌ CSV file is empty"
        )

    # Check for missing values in required columns
    for col in REQUIRED_COLUMNS:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            raise gr.Error(
                f"❌ Column '{col}' contains {missing_count} missing values"
            )

    # Check for reasonable number of rows
    if len(df) < MINIMAL_NUMBER_OF_ROWS:
        raise gr.Error(
            f"❌ CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows"
        )

    # Column labels are stringified so a non-string label cannot make the
    # join raise after every check has already passed.
    column_list = ", ".join(map(str, df.columns))
    print(f"✅ CSV validation passed! Found {len(df)} rows with columns: {column_list}")
def validate_csv_file(file_content: str) -> None:
    """
    Run the full validation pipeline on an uploaded CSV file.

    The content is parsed with validate_csv_can_be_read and the resulting
    DataFrame is then handed to validate_dataframe.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.

    Raises
    ------
    gr.Error: If validation fails
    """
    # Parsing is evaluated first; if it raises, the content checks never run.
    validate_dataframe(validate_csv_can_be_read(file_content))