File size: 2,725 Bytes
eb50e2e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import pandas as pd
import io
import gradio as gr
from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS

def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
    """
    Validate that the CSV file can be read and parsed.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file. Raw ``bytes``/``bytearray``
        are accepted as well and are decoded as UTF-8 (a leading BOM is
        dropped) before parsing.

    Returns
    -------
    pd.DataFrame
        The parsed DataFrame if successful.

    Raises
    ------
    gr.Error: If CSV cannot be read or parsed
    """
    try:
        # Decode inside the ``try`` so a badly encoded binary upload is
        # reported through the encoding message below instead of escaping
        # as a raw TypeError from ``io.StringIO``.
        if isinstance(file_content, (bytes, bytearray)):
            file_content = file_content.decode("utf-8-sig")
        return pd.read_csv(io.StringIO(file_content))
    except pd.errors.EmptyDataError as e:
        # Raised by pandas when there is nothing to parse (no header, no rows).
        raise gr.Error(
            "❌ CSV file is empty or contains no valid data"
        ) from e
    except pd.errors.ParserError as e:
        # Surface the pandas message so the user can locate the bad line.
        raise gr.Error(
            f"❌ Invalid CSV format<br><br>"
            f"Error: {str(e)}"
        ) from e
    except UnicodeDecodeError as e:
        raise gr.Error(
            "❌ File encoding error<br><br>"
            "Your file appears to have an unsupported encoding.<br>"
            "Please save your CSV file with UTF-8 encoding and try again."
        ) from e

def validate_dataframe(df: pd.DataFrame) -> None:
    """
    Validate the DataFrame content and structure.

    The checks run in the order below and stop at the first failure:
    required columns are present, the DataFrame is not empty, required
    columns hold no missing values, and there are at least
    MINIMAL_NUMBER_OF_ROWS rows. On success a one-line summary is printed.

    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame to validate.

    Raises
    ------
    gr.Error: If validation fails
    """
    # Required columns should be present. Sort the difference so the error
    # message is stable between runs (plain set iteration order is not).
    missing_columns = sorted(set(REQUIRED_COLUMNS) - set(df.columns))
    if missing_columns:
        raise gr.Error(
            f"❌ Missing required columns: {', '.join(missing_columns)}"
        )

    # Data should not be empty
    if df.empty:
        raise gr.Error(
            "❌ CSV file is empty"
        )

    # Check for missing values in required columns
    for col in REQUIRED_COLUMNS:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            raise gr.Error(
                f"❌ Column '{col}' contains {missing_count} missing values"
            )

    # Check for reasonable number of rows
    if len(df) < MINIMAL_NUMBER_OF_ROWS:
        raise gr.Error(
            f"❌ CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows"
        )

    # Column labels are not guaranteed to be strings, so convert before joining.
    print(f"✅ CSV validation passed! Found {len(df)} rows with columns: {', '.join(map(str, df.columns))}")

def validate_csv_file(file_content: str) -> None:
    """
    Run the full validation pipeline on an uploaded CSV.

    The content is first parsed into a DataFrame, and the resulting
    DataFrame is then checked for structure and content.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.

    Raises
    ------
    gr.Error: If parsing or any validation check fails
    """
    # Parse first; the parsed frame feeds straight into the content checks.
    validate_dataframe(validate_csv_can_be_read(file_content))