import pandas as pd
import io
import gradio as gr
import requests
from constants import (
    REQUIRED_COLUMNS,
    ASSAY_LIST,
    CV_COLUMN,
    EXAMPLE_FILE_DICT,
    ANTIBODY_NAMES_DICT,
)
def validate_username(username: str) -> bool:
    """
    Validate that the username corresponds to a real Hugging Face profile.
    Just check https://huggingface.co/username exists.
    Parameters
    ----------
    username: str
        The username to validate. Surrounding whitespace is ignored.
    Returns
    -------
    bool
        True if the profile page exists and looks like a user profile.
        The function never returns False: every failure raises gr.Error.
    Raises
    ------
    gr.Error: If the username is empty, the profile doesn't exist, or the
        check could not be completed (timeout, connection or other HTTP error)
    """
    from urllib.parse import quote

    username = (username or "").strip()
    if not username:
        # An empty name would turn the URL below into the site root rather
        # than a profile page, so it must be rejected before any request.
        raise gr.Error("❌ Please provide a Hugging Face username.")
    # Percent-encode the name so characters such as '/', '?' or '#' cannot
    # redirect the check to a different page of the site.
    encoded_username = quote(username, safe="")
    profile_url = f"https://huggingface.co/{encoded_username}"
    # Only the network call can raise a requests exception, so keep the
    # try body limited to it.
    try:
        response = requests.get(profile_url, timeout=10)
    except requests.exceptions.Timeout as e:
        raise gr.Error("❌ Timeout while checking username. Please try again.") from e
    except requests.exceptions.ConnectionError as e:
        raise gr.Error(
            "❌ Unable to connect to Hugging Face. Please check your internet connection."
        ) from e
    except requests.exceptions.RequestException as e:
        raise gr.Error(f"❌ Error validating username: {str(e)}") from e
    if response.status_code == 404:
        raise gr.Error(
            f"❌ Hugging Face user '{username}' does not exist. Please check the username or create an account at https://huggingface.co. This is used to track unique submissions."
        )
    if response.status_code != 200:
        raise gr.Error(
            f"❌ Unable to verify username '{username}'. Please try again later."
        )
    # Additional check: make sure it's actually a user profile page
    # and not some other page that happens to exist.
    # NOTE(review): this is a keyword heuristic on the page body; other
    # site pages containing these words would also pass - confirm whether
    # a stricter check is needed.
    page_text = response.text.lower()
    if "profile" in page_text or "models" in page_text:
        return True
    raise gr.Error(
        f"❌ '{username}' does not appear to be a valid Hugging Face user profile"
    )
def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
    """
    Validate that the CSV file can be read and parsed.
    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.
    Returns
    -------
    pd.DataFrame
        The parsed DataFrame if successful.
    Raises
    ------
    gr.Error: If CSV cannot be read or parsed
    """
    try:
        # Parse the in-memory text as if it were a file
        return pd.read_csv(io.StringIO(file_content))
    except pd.errors.EmptyDataError as e:
        raise gr.Error("❌ CSV file is empty or contains no valid data") from e
    except pd.errors.ParserError as e:
        raise gr.Error(f"❌ Invalid CSV format\nError: {str(e)}") from e
    except UnicodeDecodeError as e:
        raise gr.Error(
            "❌ File encoding error\n"
            "Your file appears to have an unsupported encoding.\n"
            "Please save your CSV file with UTF-8 encoding and try again."
        ) from e
    except Exception as e:
        # Last-resort boundary: surface any other failure to the user as a
        # gr.Error instead of letting an arbitrary exception escape.
        raise gr.Error(f"❌ Unexpected error reading CSV file: {str(e)}") from e
def validate_cv_submission(
    df: pd.DataFrame, submission_type: str = "GDPa1_cross_validation"
) -> None:
    """
    Validate a cross-validation submission against the canonical folds.
    Parameters
    ----------
    df: pd.DataFrame
        The submitted predictions; must contain "antibody_name" and CV_COLUMN.
    submission_type: str
        Key into EXAMPLE_FILE_DICT selecting the file with the canonical folds.
    Raises
    ------
    gr.Error: If CV_COLUMN is missing or any fold assignment differs
    """
    if CV_COLUMN not in df.columns:
        raise gr.Error(f"❌ CV submissions must include a '{CV_COLUMN}' column")
    key_columns = ["antibody_name", CV_COLUMN]
    expected_col = f"{CV_COLUMN}_expected"
    submitted_col = f"{CV_COLUMN}_submitted"
    # Canonical fold assignments come from the example file for this type
    canonical_folds = pd.read_csv(EXAMPLE_FILE_DICT[submission_type])[key_columns]
    # Left merge keeps every canonical antibody; one missing from the
    # submission gets NaN as its submitted fold and so counts as a mismatch.
    merged = pd.merge(
        canonical_folds,
        df[key_columns],
        on="antibody_name",
        how="left",
        suffixes=("_expected", "_submitted"),
    )
    mismatched = merged.loc[merged[expected_col] != merged[submitted_col]]
    if mismatched.empty:
        return
    # Show at most three offending rows to keep the message short
    examples = [
        f"{row['antibody_name']} (expected fold {row[expected_col]}, got {row[submitted_col]})"
        for _, row in mismatched.head(3).iterrows()
    ]
    raise gr.Error(
        f"❌ Fold assignments don't match canonical CV folds: {'; '.join(examples)}"
    )
def validate_full_dataset_submission(df: pd.DataFrame) -> None:
    """
    Validate a full dataset submission.
    Parameters
    ----------
    df: pd.DataFrame
        The submitted predictions.
    Raises
    ------
    gr.Error: If the submission contains the CV_COLUMN column
    """
    if CV_COLUMN not in df.columns:
        return
    # A fold column means the user most likely picked the wrong submission type
    message = (
        f"❌ Your submission contains a '{CV_COLUMN}' column. "
        "Please select 'Cross-Validation Predictions' if you want to submit CV results."
    )
    raise gr.Error(message)
def get_assay_columns(df: pd.DataFrame) -> list[str]:
    """Return the columns of df that appear in ASSAY_LIST, in DataFrame order"""
    return list(filter(lambda column: column in ASSAY_LIST, df.columns))
def _format_names(names) -> str:
    """Join names into a sorted, comma-separated string, stringifying each one"""
    # str() first: pandas may parse a name column as numbers, and str.join
    # raises TypeError on non-string items. Sorting makes the message stable.
    return ", ".join(sorted(map(str, names)))
def validate_dataframe(df: pd.DataFrame, submission_type: str = "GDPa1") -> None:
    """
    Validate the DataFrame content and structure.
    Checks run in this order: known submission type, required columns,
    at least one assay column, non-empty data, no missing values, unique
    antibody names, only recognized names, all expected names present,
    then the submission-type specific checks.
    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame to validate.
    submission_type: str
        Type of submission: "GDPa1" or "GDPa1_cross_validation"
    Raises
    ------
    ValueError: If submission_type is not a key of EXAMPLE_FILE_DICT
    gr.Error: If validation fails
    """
    if submission_type not in EXAMPLE_FILE_DICT:
        raise ValueError(f"Invalid submission type: {submission_type}")
    # Required columns should be present
    missing_columns = set(REQUIRED_COLUMNS) - set(df.columns)
    if missing_columns:
        raise gr.Error(f"❌ Missing required columns: {_format_names(missing_columns)}")
    # Should include at least 1 assay column
    assay_columns = get_assay_columns(df)
    if not assay_columns:
        raise gr.Error(
            "❌ CSV should include at least one of the following assay columns: "
            + ", ".join(ASSAY_LIST)
        )
    # Submission columns are the required ones plus every assay column found
    submission_columns = [*REQUIRED_COLUMNS, *assay_columns]
    # Data should not be empty
    if df.empty:
        raise gr.Error("❌ CSV file is empty")
    # No missing values in submission columns
    for col in submission_columns:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            raise gr.Error(f"❌ Column '{col}' contains {missing_count} missing values")
    # All names should be unique
    n_duplicates = df["antibody_name"].duplicated().sum()
    if n_duplicates > 0:
        raise gr.Error(
            f"❌ CSV should have only one row per antibody. Found {n_duplicates} duplicates."
        )
    expected_names = set(ANTIBODY_NAMES_DICT[submission_type])
    submitted_names = set(df["antibody_name"])
    # All antibody names should be recognizable
    unrecognized_antibodies = submitted_names - expected_names
    if unrecognized_antibodies:
        raise gr.Error(
            f"❌ Found unrecognized antibody names: {_format_names(unrecognized_antibodies)}"
        )
    # All antibody names should be present
    missing_antibodies = expected_names - submitted_names
    if missing_antibodies:
        raise gr.Error(
            f"❌ Missing predictions for {len(missing_antibodies)} antibodies: {_format_names(missing_antibodies)}"
        )
    # Submission-type specific validation
    if submission_type.endswith("_cross_validation"):
        validate_cv_submission(df, submission_type)
    else:  # full_dataset
        validate_full_dataset_submission(df)
def validate_csv_file(file_content: str, submission_type: str = "GDPa1") -> None:
    """
    Parse the uploaded CSV text and validate the resulting DataFrame.
    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.
    submission_type: str
        Type of submission: "GDPa1" or "GDPa1_cross_validation"
    Raises
    ------
    gr.Error: If the CSV cannot be parsed or validation fails
    """
    # Parsing comes first; structural checks only run on a readable CSV
    validate_dataframe(
        validate_csv_can_be_read(file_content),
        submission_type,
    )