"""Validation helpers for the content of an uploaded CSV file."""

import io

import gradio as gr
import pandas as pd

from constants import MINIMAL_NUMBER_OF_ROWS, REQUIRED_COLUMNS


def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
    """
    Validate that the CSV file can be read and parsed.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.

    Returns
    -------
    pd.DataFrame
        The parsed DataFrame if successful.

    Raises
    ------
    gr.Error:
        If the CSV cannot be read or parsed. The underlying pandas/decoding
        exception is attached as ``__cause__``.
    """
    try:
        # Wrap the text in a file-like object so pandas can parse it in memory.
        return pd.read_csv(io.StringIO(file_content))
    except pd.errors.EmptyDataError as e:
        raise gr.Error(
            "❌ CSV file is empty or contains no valid data"
        ) from e
    except pd.errors.ParserError as e:
        raise gr.Error(
            f"❌ Invalid CSV format\n\n"
            f"Error: {e}"
        ) from e
    except UnicodeDecodeError as e:
        # NOTE(review): the input is already a ``str`` read through StringIO,
        # so this handler is presumably purely defensive; kept for
        # backward compatibility.
        raise gr.Error(
            "❌ File encoding error\n\n"
            "Your file appears to have an unsupported encoding.\n"
            "Please save your CSV file with UTF-8 encoding and try again."
        ) from e


def validate_dataframe(df: pd.DataFrame) -> None:
    """
    Validate the DataFrame content and structure.

    The checks run in this order and the first failure is reported:
    required columns present, DataFrame not empty, no missing values in the
    required columns, at least ``MINIMAL_NUMBER_OF_ROWS`` rows.

    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame to validate.

    Raises
    ------
    gr.Error:
        If validation fails.
    """
    # Required columns should be present. Iterate REQUIRED_COLUMNS (deduplicated,
    # order preserved) rather than a set difference so the error message lists
    # the missing columns in a stable, predictable order.
    present_columns = set(df.columns)
    missing_columns = [
        col for col in dict.fromkeys(REQUIRED_COLUMNS)
        if col not in present_columns
    ]
    if missing_columns:
        raise gr.Error(
            f"❌ Missing required columns: {', '.join(map(str, missing_columns))}"
        )

    # Data should not be empty
    if df.empty:
        raise gr.Error(
            "❌ CSV file is empty"
        )

    # Check for missing values in required columns
    for col in REQUIRED_COLUMNS:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            raise gr.Error(
                f"❌ Column '{col}' contains {missing_count} missing values"
            )

    # Check for reasonable number of rows
    if len(df) < MINIMAL_NUMBER_OF_ROWS:
        raise gr.Error(
            f"❌ CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows"
        )

    # Column labels are converted with str() so non-string labels cannot make
    # the join raise TypeError after validation has already succeeded.
    print(
        f"✅ CSV validation passed! Found {len(df)} rows with columns: "
        f"{', '.join(map(str, df.columns))}"
    )


def validate_csv_file(file_content: str) -> None:
    """
    Validate the uploaded CSV file.

    Parses the content with ``validate_csv_can_be_read`` and then checks the
    resulting DataFrame with ``validate_dataframe``.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.

    Raises
    ------
    gr.Error:
        If validation fails.
    """
    df = validate_csv_can_be_read(file_content)
    validate_dataframe(df)