File size: 3,517 Bytes
eb50e2e
 
 
0f3e1b5
 
 
 
 
 
eb50e2e
8f9985e
eb50e2e
 
 
8f9985e
eb50e2e
 
 
 
8f9985e
eb50e2e
 
 
 
8f9985e
eb50e2e
 
 
 
 
 
 
 
8f9985e
eb50e2e
8f9985e
eb50e2e
8f9985e
eb50e2e
 
 
 
 
 
845443f
 
eb50e2e
8f9985e
eb50e2e
 
 
8f9985e
eb50e2e
 
 
 
8f9985e
eb50e2e
 
 
 
 
 
 
8f9985e
 
0f3e1b5
 
 
 
 
 
 
 
 
 
eb50e2e
 
8f9985e
 
0f3e1b5
 
eb50e2e
 
8f9985e
 
4d9df8e
eb50e2e
8f9985e
 
4d9df8e
 
 
 
 
 
 
 
 
 
 
 
 
8f9985e
eb50e2e
 
 
 
8f9985e
eb50e2e
 
 
 
8f9985e
eb50e2e
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import pandas as pd
import io
import gradio as gr
from constants import (
    REQUIRED_COLUMNS,
    MINIMAL_NUMBER_OF_ROWS,
    ANTIBODY_NAMES,
    ASSAY_LIST,
)


def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
    """
    Validate that the CSV file can be read and parsed.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file. Raw ``bytes``/``bytearray``
        are tolerated as well: they are decoded as UTF-8 (a leading BOM is
        dropped) before parsing.

    Returns
    -------
    pd.DataFrame
        The parsed DataFrame if successful.

    Raises
    ------
    gr.Error: If CSV cannot be read or parsed (empty input, malformed CSV,
        bytes that are not valid UTF-8, or any other failure while reading)
    """
    try:
        # Decoding happens inside the ``try`` so that a non-UTF-8 upload is
        # reported through the dedicated encoding message below rather than
        # through the generic catch-all.
        if isinstance(file_content, (bytes, bytearray)):
            file_content = file_content.decode("utf-8-sig")

        return pd.read_csv(io.StringIO(file_content))

    except pd.errors.EmptyDataError as e:
        raise gr.Error("❌ CSV file is empty or contains no valid data") from e
    except pd.errors.ParserError as e:
        raise gr.Error(f"❌ Invalid CSV format<br><br>Error: {e}") from e
    except UnicodeDecodeError as e:
        raise gr.Error(
            "❌ File encoding error<br><br>"
            "Your file appears to have an unsupported encoding.<br>"
            "Please save your CSV file with UTF-8 encoding and try again."
        ) from e
    except Exception as e:
        # Boundary handler: convert anything unforeseen into a user-facing
        # error while keeping the original exception as the cause.
        raise gr.Error(f"❌ Unexpected error reading CSV file: {e}") from e


def validate_dataframe(df: pd.DataFrame) -> None:
    """
    Validate the DataFrame content and structure.

    The checks run in a fixed order and the first failing one raises:

    1. every column in ``REQUIRED_COLUMNS`` is present;
    2. at least one column from ``ASSAY_LIST`` is present;
    3. the DataFrame is not empty;
    4. no missing values in the required and assay columns;
    5. at least ``MINIMAL_NUMBER_OF_ROWS`` rows;
    6. no duplicated values in ``antibody_name``;
    7. every ``antibody_name`` value is listed in ``ANTIBODY_NAMES``.

    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame to validate.

    Raises
    ------
    gr.Error: If validation fails
    """
    # Required columns should be present
    missing_columns = set(REQUIRED_COLUMNS) - set(df.columns)
    if missing_columns:
        # Sorted so the message is stable from run to run (set order is not).
        missing_text = ", ".join(sorted(map(str, missing_columns)))
        raise gr.Error(f"❌ Missing required columns: {missing_text}")

    # Should include at least 1 assay column
    assay_columns = [col for col in df.columns if col in ASSAY_LIST]
    if not assay_columns:
        raise gr.Error(
            "❌ CSV should include at least one of the following assay columns: "
            + ", ".join(ASSAY_LIST)
        )

    # A submission is the required columns plus every assay column provided.
    # Unpacking works whether REQUIRED_COLUMNS is a list or a tuple.
    submission_columns = [*REQUIRED_COLUMNS, *assay_columns]

    # Data should not be empty
    if df.empty:
        raise gr.Error("❌ CSV file is empty")

    # No missing values in submission columns
    for col in submission_columns:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            raise gr.Error(f"❌ Column '{col}' contains {missing_count} missing values")

    # Above minimal number of rows
    if len(df) < MINIMAL_NUMBER_OF_ROWS:
        raise gr.Error(f"❌ CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows")

    # All names should be unique
    # NOTE(review): assumes "antibody_name" is one of REQUIRED_COLUMNS, so its
    # presence was already checked above — confirm against constants.
    n_duplicates = df["antibody_name"].duplicated().sum()
    if n_duplicates > 0:
        raise gr.Error(
            f"❌ CSV should have only one row per antibody. Found {n_duplicates} duplicates."
        )

    # All antibody names should be recognizable
    unrecognized_antibodies = set(df["antibody_name"]) - set(ANTIBODY_NAMES)
    if unrecognized_antibodies:
        # Names are stringified before joining: a column parsed as numeric
        # would otherwise make str.join raise TypeError instead of reporting
        # the validation error. Sorting keeps the message deterministic.
        unrecognized_text = ", ".join(sorted(map(str, unrecognized_antibodies)))
        raise gr.Error(f"❌ Found unrecognized antibody names: {unrecognized_text}")


def validate_csv_file(file_content: str) -> None:
    """
    Run the complete validation of an uploaded CSV file.

    The content is first parsed with ``validate_csv_can_be_read``; the
    resulting DataFrame is then checked by ``validate_dataframe``.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.

    Raises
    ------
    gr.Error: If the content cannot be parsed or the parsed data fails validation
    """
    validate_dataframe(validate_csv_can_be_read(file_content))