Spaces:

alidenewade
/

drug-discovery-app

Sleeping

App Files Files Community

alidenewade committed on Jun 9

Commit

0732634

verified ·

1 Parent(s): 28d310c

Update app.py

Browse files

Files changed (1) hide show

app.py +412 -893

app.py CHANGED Viewed

@@ -1,69 +1,70 @@
 # --- IMPORTS ---
 # Core and Data Handling
-import gradio as gr
-import pandas as pd
-import numpy as np
-import os
-import glob
-import time
-import warnings
 # Chemistry and Cheminformatics
-from rdkit import Chem
-from rdkit.Chem import Descriptors, Lipinski
-from chembl_webresource_client.new_client import new_client
-from padelpy import padeldescriptor
-# import mols2grid # This line will be removed
 # Plotting and Visualization
-import matplotlib.pyplot as plt
-import seaborn as sns
-from scipy import stats
-from scipy.stats import mannwhitneyu
 # Machine Learning Models and Metrics
-from sklearn.model_selection import train_test_split
-from sklearn.feature_selection import VarianceThreshold
-from sklearn.linear_model import (
-    LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge,
-    HuberRegressor, PassiveAggressiveRegressor, OrthogonalMatchingPursuit,
-    LassoLars
 )
-from sklearn.tree import DecisionTreeRegressor
-from sklearn.ensemble import (
-    RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor,
-    AdaBoostRegressor
 )
-from sklearn.neighbors import KNeighborsRegressor
-from sklearn.dummy import DummyRegressor
-from sklearn.metrics import (
-    mean_absolute_error, mean_squared_error, r2_score
 )
 # A placeholder class to store all results from a modeling run
class ModelRunResult:
    """Plain container bundling the outputs of one modelling run.

    The constructor stores its four arguments unchanged (no copying, no
    validation) under attributes of the same name:

    * ``dataframe`` - per-model metrics table.
    * ``plotter`` - optional plotting helper (may be ``None``).
    * ``models`` - mapping of model name to fitted estimator.
    * ``selected_features`` - feature names kept for training.
    """

    def __init__(self, dataframe, plotter, models, selected_features):
        # Attribute names mirror the parameter names one-to-one.
        self.dataframe, self.plotter = dataframe, plotter
        self.models, self.selected_features = models, selected_features
 # Optional Advanced Models
-try:
-    import xgboost as xgb
-    import lightgbm as lgb
-    import catboost as cb
-    _has_extra_libs = True
-except ImportError:
-    _has_extra_libs = False
-    warnings.warn("Optional libraries (xgboost, lightgbm, catboost) not found. Some models will be unavailable.")
 # --- GLOBAL CONFIGURATION & SETUP ---
-warnings.filterwarnings("ignore")
-sns.set_theme(style='whitegrid')
 # --- FINGERPRINT CONFIGURATION ---
 DESCRIPTOR_DIR = "padel_descriptors"
 # Check if the descriptor directory exists and contains files
 if not os.path.isdir(DESCRIPTOR_DIR):
     warnings.warn(
@@ -73,893 +74,411 @@ if not os.path.isdir(DESCRIPTOR_DIR):
     xml_files = []
 else:
     xml_files = sorted(glob.glob(os.path.join(DESCRIPTOR_DIR, '*.xml')))
-    if not xml_files:
-        warnings.warn(
-            f"No descriptor .xml files found in the '{DESCRIPTOR_DIR}' directory. "
-            "Fingerprint calculation will not be possible."
-        )
 # The key is the filename without extension; the value is the full path to the file
 fp_config = {os.path.splitext(os.path.basename(file))[0]: file for file in xml_files}
 FP_list = sorted(list(fp_config.keys()))
-# --- NEW MOLECULE DISPLAY FUNCTIONS ---
def create_molecule_grid_html(df, smiles_col='canonical_smiles', additional_cols=None, max_mols=50):
    """
    Render up to ``max_mols`` molecules as a responsive HTML card grid.

    Each card contains an RDKit drawing embedded as a base64 PNG, the SMILES
    string (truncated to 50 characters) and any requested extra columns.
    Every piece of text taken from ``df`` is HTML-escaped before it is placed
    in the markup, because SMILES and column values may come from uploaded
    files.

    Args:
        df: DataFrame with one molecule per row.
        smiles_col: Name of the column holding SMILES strings.
        additional_cols: Optional iterable of column names printed under each
            drawing. Missing columns and null values are skipped.
        max_mols: Maximum number of leading rows to render.

    Returns:
        An HTML string: the grid, or an ``<h3>`` notice when the frame is
        empty or no row could be drawn.
    """
    if df.empty:
        return "<h3>No molecules to display.</h3>"

    # Imported only once there is something to draw, so the empty case does
    # not depend on the drawing stack.
    import base64
    from html import escape
    from io import BytesIO

    from rdkit import Chem
    from rdkit.Chem import Draw

    extra_cols = list(additional_cols) if additional_cols is not None else []
    cards = []
    for idx, row in df.head(max_mols).iterrows():
        try:
            smiles = row[smiles_col]
            # Missing values (NaN/None) cannot be parsed or sliced.
            if not isinstance(smiles, str):
                continue
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                continue

            buffer = BytesIO()
            Draw.MolToImage(mol, size=(250, 200)).save(buffer, format='PNG')
            img_str = base64.b64encode(buffer.getvalue()).decode()

            shown = smiles[:50] + ('...' if len(smiles) > 50 else '')
            details = [f"<strong>SMILES:</strong> {escape(shown)}<br/>"]
            for col in extra_cols:
                if col in row and pd.notna(row[col]):
                    value = row[col]
                    if isinstance(value, float):
                        value = f"{value:.2f}"
                    details.append(
                        f"<strong>{escape(str(col))}:</strong> {escape(str(value))}<br/>"
                    )

            cards.append(
                '<div style="border: 1px solid #ddd; border-radius: 8px; padding: 10px; background: white;">'
                f'<img src="data:image/png;base64,{img_str}" style="width: 100%; height: auto;" alt="Molecule"/>'
                '<div style="margin-top: 8px; font-size: 12px;">'
                + "".join(details)
                + '</div></div>'
            )
        except Exception as e:
            # Best effort: one molecule that fails to render must not hide the rest.
            print(f"Error processing molecule {idx}: {e}")
            continue

    if not cards:
        return "<h3>No valid molecules to display.</h3>"

    grid_open = (
        '<div style="display: grid; grid-template-columns: '
        'repeat(auto-fill, minmax(300px, 1fr)); gap: 15px; padding: 10px;">'
    )
    return grid_open + "".join(cards) + "</div>"
def create_simple_molecule_table(df, smiles_col='canonical_smiles', additional_cols=None, max_mols=20):
    """
    Render up to ``max_mols`` molecules as a plain HTML table (fallback view).

    Columns are: structure drawing (base64 PNG), SMILES (truncated to 100
    characters) and one column per entry of ``additional_cols``. All text
    taken from ``df`` is HTML-escaped before it is placed in the markup.

    Args:
        df: DataFrame with one molecule per row.
        smiles_col: Name of the column holding SMILES strings.
        additional_cols: Optional iterable of extra column names. A missing
            column or null value is shown as ``N/A``.
        max_mols: Maximum number of leading rows to render.

    Returns:
        An HTML string: the table, or an ``<h3>`` notice for an empty frame.
        Rows whose SMILES cannot be parsed are left out, so the table may
        contain only its header.
    """
    if df.empty:
        return "<h3>No molecules to display.</h3>"

    # Imported only once there is something to draw.
    import base64
    from html import escape
    from io import BytesIO

    from rdkit import Chem
    from rdkit.Chem import Draw

    extra_cols = list(additional_cols) if additional_cols is not None else []
    cell = 'border: 1px solid #ddd; padding: 8px;'

    header_cells = [
        f'<th style="{cell}">Structure</th>',
        f'<th style="{cell}">SMILES</th>',
    ]
    header_cells.extend(f'<th style="{cell}">{escape(str(col))}</th>' for col in extra_cols)

    body_rows = []
    for idx, row in df.head(max_mols).iterrows():
        try:
            smiles = row[smiles_col]
            # Missing values (NaN/None) cannot be parsed or sliced.
            if not isinstance(smiles, str):
                continue
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                continue

            buffer = BytesIO()
            Draw.MolToImage(mol, size=(200, 150)).save(buffer, format='PNG')
            img_str = base64.b64encode(buffer.getvalue()).decode()

            shown = smiles[:100] + ('...' if len(smiles) > 100 else '')
            cells = [
                f'<td style="{cell} text-align: center;">'
                f'<img src="data:image/png;base64,{img_str}" style="max-width: 200px; height: auto;" alt="Molecule"/>'
                '</td>',
                f'<td style="{cell} font-family: monospace; font-size: 11px;">{escape(shown)}</td>',
            ]
            for col in extra_cols:
                value = row[col] if col in row and pd.notna(row[col]) else "N/A"
                if isinstance(value, float):
                    value = f"{value:.2f}"
                cells.append(f'<td style="{cell}">{escape(str(value))}</td>')
            body_rows.append("<tr>" + "".join(cells) + "</tr>")
        except Exception as e:
            # Best effort: skip the molecule that failed and keep the others.
            print(f"Error processing molecule {idx}: {e}")
            continue

    return (
        '<table style="border-collapse: collapse; width: 100%;">'
        '<thead><tr style="background-color: #f2f2f2;">'
        + "".join(header_cells)
        + '</tr></thead><tbody>'
        + "".join(body_rows)
        + "</tbody></table>"
    )
 # ==============================================================================
 # === STEP 1: CORE DATA COLLECTION & EDA FUNCTIONS ===
 # ==============================================================================
def get_target_chembl_id(query):
    """Search ChEMBL targets matching ``query``.

    Returns a 3-tuple ``(table, dropdown, status)``: a DataFrame with the
    ``target_chembl_id``, ``pref_name`` and ``organism`` columns, a
    ``gr.Dropdown`` whose choices are the matching target IDs, and a status
    message. When nothing matches, the table is empty and the dropdown has no
    choices.

    Raises:
        gr.Error: if the search or the result handling fails for any reason.
    """
    try:
        hits = new_client.target.search(query)
        if hits:
            hits_df = pd.DataFrame(hits)
            summary = hits_df[["target_chembl_id", "pref_name", "organism"]]
            selector = gr.Dropdown(choices=hits_df["target_chembl_id"].tolist())
            return summary, selector, f"Found {len(hits_df)} targets."
        return pd.DataFrame(), gr.Dropdown(choices=[], value=None), "No targets found for your query."
    except Exception as e:
        raise gr.Error(f"ChEMBL search failed: {e}")
def get_bioactivity_data(target_id):
    """Fetch IC50 activity records for one ChEMBL target.

    Returns a 2-tuple ``(table, status)``. The table holds every record
    returned by the activity endpoint filtered on ``target_id`` and
    ``standard_type="IC50"``; it is empty when there are no records.

    Raises:
        gr.Error: if the request or the conversion to a DataFrame fails.
    """
    try:
        by_target = new_client.activity.filter(target_chembl_id=target_id)
        ic50_records = by_target.filter(standard_type="IC50")
        if ic50_records:
            records_df = pd.DataFrame(ic50_records)
            return records_df, f"Fetched {len(records_df)} data points."
        return pd.DataFrame(), "No IC50 bioactivity data found for this target."
    except Exception as e:
        raise gr.Error(f"Failed to fetch bioactivity data: {e}")
def pIC50_calc(input_df):
    """Convert ``standard_value`` into ``pIC50`` and a bioactivity class.

    ``standard_value`` is treated as a concentration in nM (it is multiplied
    by 1e-9 before the logarithm). Steps, all performed on a copy of the
    input:

    1. Coerce ``standard_value`` to numeric and drop rows that are not numbers.
    2. Cap values at 100,000,000 so the logarithm stays bounded.
    3. ``pIC50 = -log10(value * 1e-9)`` for positive values, ``NaN`` otherwise.
    4. ``bioactivity_class``: ``"inactive"`` for values >= 10000,
       ``"active"`` for values <= 1000, ``"intermediate"`` in between.

    Args:
        input_df: DataFrame with a ``standard_value`` column.

    Returns:
        A new DataFrame without ``standard_value`` and with the added
        ``pIC50`` and ``bioactivity_class`` columns. ``input_df`` is not
        modified.
    """
    df_copy = input_df.copy()
    df_copy['standard_value'] = pd.to_numeric(df_copy['standard_value'], errors='coerce')
    df_copy.dropna(subset=['standard_value'], inplace=True)

    # Work on a plain float array: one vectorised pass instead of a Python
    # loop, and no index alignment surprises when the index has duplicates.
    capped = df_copy['standard_value'].clip(upper=100000000).to_numpy(dtype=float)

    pic50 = np.full(capped.shape, np.nan)
    positive = capped > 0  # log10 is undefined for zero/negative values
    pic50[positive] = -np.log10(capped[positive] * 1e-9)
    df_copy['pIC50'] = pic50

    df_copy['bioactivity_class'] = np.select(
        [capped >= 10000, capped <= 1000],
        ["inactive", "active"],
        default="intermediate",
    )

    # 'standard_value_norm' was a scratch column in earlier versions; drop it
    # too if the caller's frame happens to carry one.
    return df_copy.drop(columns=['standard_value', 'standard_value_norm'], errors='ignore')
def lipinski_descriptors(smiles_series):
    """Compute four Lipinski-style descriptors for every parseable SMILES.

    Entries that are empty, not strings, or that RDKit cannot parse are
    skipped.

    Returns:
        ``(descriptors, valid_smiles)`` where ``descriptors`` is a DataFrame
        with columns ``MW``, ``LogP``, ``NumHDonors`` and ``NumHAcceptors``
        (one row per kept SMILES, in input order) and ``valid_smiles`` is the
        list of SMILES that were kept. With nothing kept, the frame is empty
        and the list is ``[]``.
    """
    column_names = ["MW", "LogP", "NumHDonors", "NumHAcceptors"]

    # First pass: keep only entries RDKit can turn into a molecule.
    parsed = []
    for smi in smiles_series:
        if not (smi and isinstance(smi, str)):
            continue
        molecule = Chem.MolFromSmiles(smi)
        if molecule:
            parsed.append((smi, molecule))

    if not parsed:
        return pd.DataFrame(columns=column_names), []

    # Second pass: one descriptor row per parsed molecule.
    table = [
        [
            Descriptors.MolWt(molecule),
            Descriptors.MolLogP(molecule),
            Lipinski.NumHDonors(molecule),
            Lipinski.NumHAcceptors(molecule),
        ]
        for _, molecule in parsed
    ]
    kept = [smi for smi, _ in parsed]
    return pd.DataFrame(data=np.array(table), columns=column_names), kept
def clean_and_process_data(df):
    """Clean raw bioactivity records and attach pIC50 and Lipinski descriptors.

    Pipeline:

    1. If the frame has no usable ``canonical_smiles`` column, look the
       structures up in ChEMBL by ``molecule_chembl_id``.
    2. Drop rows without ``standard_value`` or SMILES, de-duplicate on SMILES
       (first occurrence wins), and drop values that are not numeric.
    3. Compute ``pIC50``/``bioactivity_class`` via ``pIC50_calc`` and keep
       rows with a defined pIC50.
    4. Compute Lipinski descriptors and join them column-wise to the rows
       whose SMILES could be parsed.

    Args:
        df: Raw activity DataFrame. It is copied, never modified.

    Returns:
        ``(processed_frame, status_message)``. The frame is empty when no
        compound survives the cleaning.

    Raises:
        gr.Error: when there is no input data or the ChEMBL lookup fails.
    """
    if df is None or df.empty:
        raise gr.Error("No data to process. Please fetch data first.")

    # Private copy: the caller's frame (typically UI state) stays untouched and
    # the column assignments below never operate on a slice of it.
    df = df.copy()

    if "canonical_smiles" not in df.columns or df["canonical_smiles"].isnull().all():
        try:
            molecules = new_client.molecule.get(list(df["molecule_chembl_id"]))
            # Map by ID rather than by position: the service is not guaranteed
            # to answer in request order or to repeat duplicated IDs.
            # 'molecule_structures' can be null for some records.
            smiles_by_id = {
                mol.get("molecule_chembl_id"): (mol.get("molecule_structures") or {}).get("canonical_smiles")
                for mol in molecules
            }
            df["canonical_smiles"] = df["molecule_chembl_id"].map(smiles_by_id)
        except Exception as e:
            raise gr.Error(f"Could not fetch SMILES from ChEMBL: {e}") from e

    df = df[df["standard_value"].notna() & df["canonical_smiles"].notna()]
    df = df.drop_duplicates(["canonical_smiles"]).copy()
    df["standard_value"] = pd.to_numeric(df["standard_value"], errors='coerce')
    df = df.dropna(subset=['standard_value'])

    df_processed = pIC50_calc(df)
    df_processed = df_processed[df_processed.pIC50.notna()]
    if df_processed.empty:
        return pd.DataFrame(), "No compounds remaining after pIC50 calculation."

    df_lipinski, valid_smiles = lipinski_descriptors(df_processed['canonical_smiles'])
    if not valid_smiles:
        return pd.DataFrame(), "No valid SMILES could be processed for Lipinski descriptors."

    # SMILES are unique at this point, so filtering by membership keeps the
    # rows in the same order as the descriptor rows; both sides get a fresh
    # 0..n-1 index so the column-wise concat lines up.
    df_processed = df_processed[df_processed['canonical_smiles'].isin(valid_smiles)].reset_index(drop=True)
    df_lipinski = df_lipinski.reset_index(drop=True)
    df_final = pd.concat([df_processed, df_lipinski], axis=1)
    return df_final, f"Processing complete. {len(df_final)} compounds remain after cleaning."
def run_eda_analysis(df, selected_classes):
    """Build every EDA plot and statistics table for the chosen classes.

    Returns a 13-tuple laid out as: frequency plot, scatter plot, then for
    each of ``pIC50``, ``MW``, ``LogP``, ``NumHDonors``, ``NumHAcceptors`` a
    ``(boxplot, Mann-Whitney table)`` pair, and finally a status message.
    When no row belongs to ``selected_classes`` the plots are ``None`` and
    the tables are empty DataFrames.

    Raises:
        gr.Error: when ``df`` is ``None`` or empty.
    """
    if df is None or df.empty:
        raise gr.Error("No data available for analysis.")

    descriptors = ['pIC50', 'MW', 'LogP', 'NumHDonors', 'NumHAcceptors']
    subset = df[df.bioactivity_class.isin(selected_classes)].copy()

    if subset.empty:
        outputs = [None, None]
        for _ in descriptors:
            outputs.extend([None, pd.DataFrame()])
        outputs.append("No data for selected classes.")
        return tuple(outputs)

    outputs = [create_frequency_plot(subset), create_scatter_plot(subset)]
    for name in descriptors:
        # Keep the plot/table interleaving the UI outputs expect.
        outputs.append(create_boxplot(subset, name))
        outputs.append(mannwhitney_test(subset, name))

    # The figure objects are already captured in `outputs`; release pyplot's
    # global registry so figures do not pile up between runs.
    plt.close('all')
    outputs.append(f"EDA complete for {len(subset)} compounds.")
    return tuple(outputs)
def create_frequency_plot(df):
    """Return a bar chart figure counting the rows in each bioactivity class."""
    class_colors = {'active': '#1f77b4', 'inactive': '#ff7f0e', 'intermediate': '#2ca02c'}
    plt.figure(figsize=(5.5, 5.5))
    counts = df['bioactivity_class'].value_counts()
    sns.barplot(x=counts.index, y=counts.values, palette=class_colors)
    plt.xlabel('Bioactivity Class', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    plt.title('Frequency of Bioactivity Classes', fontsize=14)
    return plt.gcf()
def create_scatter_plot(df):
    """Return a MW-vs-LogP scatter figure, coloured by class and sized by pIC50."""
    class_colors = {'active': '#1f77b4', 'inactive': '#ff7f0e', 'intermediate': '#2ca02c'}
    plt.figure(figsize=(5.5, 5.5))
    sns.scatterplot(
        data=df,
        x='MW',
        y='LogP',
        hue='bioactivity_class',
        size='pIC50',
        palette=class_colors,
        sizes=(20, 200),
        alpha=0.7,
    )
    plt.xlabel('Molecular Weight (MW)', fontsize=12)
    plt.ylabel('LogP', fontsize=12)
    plt.title('Chemical Space: MW vs. LogP', fontsize=14)
    plt.legend(title='Bioactivity Class')
    return plt.gcf()
def create_boxplot(df, descriptor):
    """Return a box-plot figure of ``descriptor`` grouped by bioactivity class."""
    class_colors = {'active': '#1f77b4', 'inactive': '#ff7f0e', 'intermediate': '#2ca02c'}
    plt.figure(figsize=(5.5, 5.5))
    sns.boxplot(x='bioactivity_class', y=descriptor, data=df, palette=class_colors)
    plt.xlabel('Bioactivity Class', fontsize=12)
    plt.ylabel(descriptor, fontsize=12)
    plt.title(f'{descriptor} by Bioactivity Class', fontsize=14)
    return plt.gcf()
def mannwhitney_test(df, descriptor):
    """Run pairwise Mann-Whitney U tests on ``descriptor`` between classes.

    The three bioactivity classes (``active``, ``inactive``,
    ``intermediate``) are compared pair by pair. A pair is skipped when
    either class has no non-null value for ``descriptor``.

    Args:
        df: DataFrame with a ``bioactivity_class`` column and ``descriptor``.
        descriptor: Name of the numeric column to compare.

    Returns:
        DataFrame with columns ``Comparison``, ``Statistics``, ``p-value``
        and ``Interpretation``, one row per tested pair. The columns are
        present even when no pair could be tested.
    """
    columns = ['Comparison', 'Statistics', 'p-value', 'Interpretation']
    classes = ('active', 'inactive', 'intermediate')

    # Slice each class once instead of re-filtering the frame for every pair.
    samples = {
        label: df.loc[df['bioactivity_class'] == label, descriptor].dropna()
        for label in classes
    }

    results = []
    for c1, c2 in [('active', 'inactive'), ('active', 'intermediate'), ('inactive', 'intermediate')]:
        d1, d2 = samples[c1], samples[c2]
        if d1.empty or d2.empty:
            continue
        stat, p = mannwhitneyu(d1, d2)
        results.append({
            'Comparison': f'{c1.title()} vs {c2.title()}',
            'Statistics': stat,
            'p-value': p,
            'Interpretation': 'Different distribution (p < 0.05)' if p <= 0.05 else 'Same distribution (p > 0.05)',
        })
    return pd.DataFrame(results, columns=columns)
 # ==============================================================================
 # === STEP 2: FEATURE ENGINEERING FUNCTIONS ===
 # ==============================================================================
def calculate_fingerprints(current_state, fingerprint_type, progress=gr.Progress()):
    """Compute PaDEL fingerprints for the cleaned data (generator for the UI).

    Reads ``current_state['cleaned_data']``, runs PaDEL with the descriptor
    XML registered for ``fingerprint_type`` in ``fp_config``, merges the
    result with ``canonical_smiles``/``pIC50`` and stores it in
    ``current_state['fingerprint_data']`` (plus ``'fingerprint_type'``).

    Yields:
        5-tuples ``(status, table, download_update, molecule_html_update,
        state)``; intermediate yields carry only a status message.

    Raises:
        gr.Error: for missing input, an unknown fingerprint type, or any
            failure during the calculation.
    """
    # Local import: scratch files go into a private temporary directory so
    # concurrent sessions cannot overwrite each other's PaDEL input/output.
    import tempfile

    input_df = current_state.get('cleaned_data')
    if input_df is None or input_df.empty:
        raise gr.Error("No cleaned data found. Please complete Step 1.")
    if not fingerprint_type:
        raise gr.Error("Please select a fingerprint type.")

    progress(0, desc="Starting...")
    yield "🧪 Starting fingerprint calculation...", None, gr.update(visible=False), None, current_state

    try:
        descriptortypes = fp_config.get(fingerprint_type)
        if not descriptortypes:
            raise gr.Error(f"Descriptor XML for '{fingerprint_type}' not found.")

        # The directory and everything in it is removed when the block exits,
        # on success and on failure alike.
        with tempfile.TemporaryDirectory() as workdir:
            smi_file = os.path.join(workdir, 'molecule.smi')
            output_csv = os.path.join(workdir, 'fingerprints.csv')

            # Two-column .smi file: the SMILES doubles as the molecule name so
            # the PaDEL output can be merged back on 'canonical_smiles'.
            input_df[['canonical_smiles', 'canonical_smiles']].to_csv(
                smi_file, sep='\t', index=False, header=False
            )

            progress(0.3, desc="⚗️ Running PaDEL...")
            yield "⚗️ Running PaDEL...", None, gr.update(visible=False), None, current_state

            padeldescriptor(
                mol_dir=smi_file,
                d_file=output_csv,
                descriptortypes=descriptortypes,
                detectaromaticity=True,
                standardizenitro=True,
                standardizetautomers=True,
                threads=-1,
                removesalt=True,
                log=False,
                fingerprints=True
            )

            if not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0:
                raise gr.Error("PaDEL failed to produce an output file. Check molecule validity.")

            progress(0.7, desc="📊 Processing results...")
            yield "📊 Processing results...", None, gr.update(visible=False), None, current_state

            df_X = pd.read_csv(output_csv).rename(columns={'Name': 'canonical_smiles'})

        final_df = pd.merge(input_df[['canonical_smiles', 'pIC50']], df_X, on='canonical_smiles', how='inner')
        current_state['fingerprint_data'] = final_df
        current_state['fingerprint_type'] = fingerprint_type

        progress(0.9, desc="🖼️ Generating molecule grid...")
        try:
            # Preferred view: card grid.
            mols_html = create_molecule_grid_html(
                final_df,
                smiles_col='canonical_smiles',
                additional_cols=['pIC50'],
                max_mols=50
            )
        except Exception as e:
            print(f"Grid layout failed: {e}, trying table layout...")
            # Fallback view: plain table.
            mols_html = create_simple_molecule_table(
                final_df,
                smiles_col='canonical_smiles',
                additional_cols=['pIC50'],
                max_mols=20
            )

        success_msg = f"✅ Success! Generated {len(df_X.columns) -1} descriptors for {len(final_df)} molecules."
        progress(1, desc="Completed!")
        yield success_msg, final_df, gr.update(visible=True), gr.update(value=mols_html, visible=True), current_state
    except gr.Error:
        # Already a user-facing message; do not wrap it a second time.
        raise
    except Exception as e:
        raise gr.Error(f"Calculation failed: {e}") from e
 # ==============================================================================
-# === STEP 3: MODELING FUNCTIONS ===
 # ==============================================================================
-# Model definitions
# Display name -> estimator class. Classes (not instances) are stored; each
# training run instantiates them with default arguments.
regression_models = {
    # Linear models
    "Linear Regression": LinearRegression,
    "Ridge Regression": Ridge,
    "Lasso Regression": Lasso,
    "Elastic Net": ElasticNet,
    "Bayesian Ridge": BayesianRidge,
    "Huber Regressor": HuberRegressor,
    "Passive Aggressive Regressor": PassiveAggressiveRegressor,
    "Orthogonal Matching Pursuit": OrthogonalMatchingPursuit,
    "Lasso Lars": LassoLars,
    # Trees and ensembles
    "Decision Tree Regressor": DecisionTreeRegressor,
    "Random Forest Regressor": RandomForestRegressor,
    "Gradient Boosting Regressor": GradientBoostingRegressor,
    "Extra Trees Regressor": ExtraTreesRegressor,
    "AdaBoost Regressor": AdaBoostRegressor,
    # Instance-based and baseline
    "K-Neighbors Regressor": KNeighborsRegressor,
    "Dummy Regressor (Mean)": DummyRegressor,
}

# The boosting libraries are optional; register them only when all imported.
if _has_extra_libs:
    regression_models["XGBoost Regressor"] = xgb.XGBRegressor
    regression_models["LightGBM Regressor"] = lgb.LGBMRegressor
    regression_models["CatBoost Regressor"] = cb.CatBoostRegressor
def handle_model_training(current_state, progress=gr.Progress()):
    """Train every registered regressor on the fingerprint data (UI generator).

    Reads ``current_state['fingerprint_data']``, removes zero-variance
    features, makes an 80/20 train/test split (``random_state=42``) and fits
    each entry of ``regression_models`` with default arguments. Results are
    written back into ``current_state`` under ``'model_results'`` (a
    ``ModelRunResult``), ``'X_train'``, ``'y_train'``, ``'X_test'`` and
    ``'y_test'``.

    Yields:
        4-tuples ``(status, metrics_table, model_dropdown_update, state)``;
        intermediate yields carry only a status message.

    Raises:
        gr.Error: for missing fingerprint data or any failure during training.
    """
    df_fp = current_state.get('fingerprint_data')
    if df_fp is None or df_fp.empty:
        raise gr.Error("No fingerprint data found. Please complete Step 2.")

    yield "⚙️ Starting model training...", None, None, current_state
    progress(0, desc="Starting model training...")

    try:
        X = df_fp.drop(columns=['canonical_smiles', 'pIC50'])
        y = df_fp['pIC50']
        if X.empty:
            raise gr.Error("No features found for training. Check fingerprint calculation.")

        # Remove features with zero variance
        sel = VarianceThreshold(threshold=0.0)
        X_filtered = sel.fit_transform(X)
        selected_features = X.columns[sel.get_support()]
        if len(selected_features) == 0:
            raise gr.Error("No features with non-zero variance found. Cannot train models.")

        # Keep the original row labels so X and y stay aligned by index as
        # well as by position.
        X_filtered = pd.DataFrame(X_filtered, columns=selected_features, index=X.index)
        X_train, X_test, y_train, y_test = train_test_split(X_filtered, y, test_size=0.2, random_state=42)

        results = []
        trained_models = {}
        total = len(regression_models)
        for i, (name, model_class) in enumerate(regression_models.items()):
            # Report the fraction of models already finished, so the bar does
            # not show 100% while the last model is still fitting.
            progress(i / total, desc=f"Training {name}...")
            yield f"Training {name}...", None, None, current_state

            model = model_class()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            results.append({
                "Model": name,
                "MAE": mean_absolute_error(y_test, y_pred),
                "MSE": mean_squared_error(y_test, y_pred),
                "R2": r2_score(y_test, y_pred)
            })
            trained_models[name] = model

        results_df = pd.DataFrame(results)

        # Store results in the state
        current_state['model_results'] = ModelRunResult(
            dataframe=results_df,
            plotter=None,  # No plot generated here directly, but could be added
            models=trained_models,
            selected_features=selected_features
        )
        current_state['X_train'] = X_train
        current_state['y_train'] = y_train
        current_state['X_test'] = X_test
        current_state['y_test'] = y_test

        progress(1, desc="Model training complete!")
        model_names = list(trained_models.keys())
        yield "✅ Model training complete!", results_df, gr.update(choices=model_names, value=model_names[0]), current_state
    except gr.Error:
        # Already a user-facing message; do not wrap it a second time.
        raise
    except Exception as e:
        raise gr.Error(f"Model training failed: {e}") from e
def update_analysis_plots(model_name, feature_count, current_state):
    """Build the diagnostic plots for one trained model.

    Uses the models and train/test split stored in ``current_state`` by the
    training step.

    Args:
        model_name: Key of the model in ``current_state['model_results'].models``.
        feature_count: Number of top features to show. Floats (as UI sliders
            may deliver) are truncated to int; ``None`` or a non-positive
            value disables the feature plot.
        current_state: Application state dictionary.

    Returns:
        A 5-tuple ``(observed_vs_predicted_fig, residuals_fig,
        feature_importance_fig, metrics_text, status)``. The feature figure
        is ``None`` when the model exposes neither ``feature_importances_``
        nor ``coef_``. When prerequisites are missing, the first four items
        are ``None`` and ``status`` explains why.
    """
    model_run_results = current_state.get('model_results')
    X_train = current_state.get('X_train')
    y_train = current_state.get('y_train')
    X_test = current_state.get('X_test')
    y_test = current_state.get('y_test')

    # The training split is used below as well, so require all four parts.
    split_missing = any(part is None for part in (X_train, y_train, X_test, y_test))
    if not model_run_results or not model_name or split_missing:
        return None, None, None, None, "Please train models first."

    model = model_run_results.models.get(model_name)
    if model is None:
        return None, None, None, None, f"Model '{model_name}' not found."

    top_n = int(feature_count) if feature_count else 0

    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)

    # Calculate R2, MAE, MSE for test set
    r2_test = r2_score(y_test, y_pred_test)
    mae_test = mean_absolute_error(y_test, y_pred_test)
    mse_test = mean_squared_error(y_test, y_pred_test)

    # Plot Y-obs vs Y-pred
    fig_obs_pred, ax_obs_pred = plt.subplots(figsize=(6, 6))
    sns.scatterplot(x=y_test, y=y_pred_test, ax=ax_obs_pred, alpha=0.7, color='blue', label='Test Data')
    sns.scatterplot(x=y_train, y=y_pred_train, ax=ax_obs_pred, alpha=0.7, color='green', label='Train Data')
    lower = min(y_test.min(), y_train.min())
    upper = max(y_test.max(), y_train.max())
    # Diagonal y = x over the observed range: a perfect model lies on it.
    ax_obs_pred.plot([lower, upper], [lower, upper], color='red', linestyle='--', label='Ideal Prediction')
    ax_obs_pred.set_xlabel("Observed pIC50", fontsize=12)
    ax_obs_pred.set_ylabel("Predicted pIC50", fontsize=12)
    ax_obs_pred.set_title(f"{model_name}: Observed vs. Predicted pIC50", fontsize=14)
    ax_obs_pred.legend()
    # Closing detaches the figure from pyplot's registry; the object itself
    # is still returned.
    plt.close(fig_obs_pred)

    # Plot Residuals
    residuals = y_test - y_pred_test
    fig_residuals, ax_residuals = plt.subplots(figsize=(6, 6))
    sns.scatterplot(x=y_pred_test, y=residuals, ax=ax_residuals, alpha=0.7, color='purple')
    ax_residuals.axhline(y=0, color='red', linestyle='--')
    ax_residuals.set_xlabel("Predicted pIC50", fontsize=12)
    ax_residuals.set_ylabel("Residuals (Observed - Predicted)", fontsize=12)
    ax_residuals.set_title(f"{model_name}: Residuals Plot", fontsize=14)
    plt.close(fig_residuals)

    # Feature Importance Plot (if applicable)
    fig_feature_importance = None
    feature_names = model_run_results.selected_features
    bars = None
    if top_n > 0 and hasattr(model, 'feature_importances_'):
        importances = pd.Series(model.feature_importances_, index=feature_names)
        bars = importances.nlargest(top_n)
        x_label, title_kind, palette = "Importance", "Feature Importances", 'viridis'
    elif top_n > 0 and hasattr(model, 'coef_') and model_name not in ["Dummy Regressor (Mean)", "K-Neighbors Regressor"]:
        # For linear models, coefficients can be used as importance.
        # ravel() guards against estimators that report coef_ as (1, n).
        coefficients = pd.Series(np.ravel(model.coef_), index=feature_names)
        ranked = coefficients.abs().nlargest(top_n)  # rank by magnitude
        bars = coefficients[ranked.index]  # plot the signed values
        x_label, title_kind, palette = "Coefficient Value", "Feature Coefficients", 'coolwarm'

    if bars is not None:
        fig_feature_importance, ax_fi = plt.subplots(figsize=(8, 6))
        sns.barplot(x=bars.values, y=bars.index, ax=ax_fi, palette=palette)
        ax_fi.set_xlabel(x_label, fontsize=12)
        ax_fi.set_ylabel("Feature", fontsize=12)
        ax_fi.set_title(f"{model_name}: Top {top_n} {title_kind}", fontsize=14)
        # Lay out this figure explicitly instead of pyplot's "current" one.
        fig_feature_importance.tight_layout()
        plt.close(fig_feature_importance)

    return fig_obs_pred, fig_residuals, fig_feature_importance, \
           f"R2 (Test): {r2_test:.4f}, MAE (Test): {mae_test:.4f}, MSE (Test): {mse_test:.4f}", \
           "Plots updated."
-# Updated prediction function
def predict_on_upload(uploaded_file, model_name, current_state, progress=gr.Progress()):
    """Predict pIC50 for molecules in an uploaded CSV (generator for the UI).

    The CSV must contain a ``canonical_smiles`` column. Fingerprints are
    computed with the same descriptor type and the same PaDEL standardisation
    options as in the fingerprint step, aligned to the features the model was
    trained on (absent features are filled with 0), and passed to the selected
    model.

    Args:
        uploaded_file: Uploaded file object; its ``.name`` is read as a path.
        model_name: Key of a trained model in the stored ``ModelRunResult``.
        current_state: Application state holding ``'model_results'`` and
            ``'fingerprint_type'``.
        progress: Gradio progress tracker.

    Yields:
        3-tuples ``(status, results_table, molecule_html)``; intermediate
        yields carry only a status message.

    Raises:
        gr.Error: for missing inputs, a missing model or descriptor
            configuration, a CSV without ``canonical_smiles``, or a PaDEL run
            that produces no output.
    """
    # Local import: scratch files go into a private temporary directory so
    # concurrent sessions cannot overwrite each other's PaDEL input/output.
    import tempfile

    if not uploaded_file:
        raise gr.Error("Please upload a file.")
    if not model_name:
        raise gr.Error("Please select a trained model.")

    model_run_results = current_state.get('model_results')
    fingerprint_type = current_state.get('fingerprint_type')
    if not model_run_results or not fingerprint_type:
        raise gr.Error("Please run Steps 2 and 3 first.")

    model = model_run_results.models.get(model_name)
    selected_features = model_run_results.selected_features
    if model is None:
        raise gr.Error(f"Model '{model_name}' not found.")

    descriptortypes = fp_config.get(fingerprint_type)
    if not descriptortypes:
        raise gr.Error(f"Descriptor XML for '{fingerprint_type}' not found.")

    # The directory and everything in it is removed when the block exits.
    with tempfile.TemporaryDirectory() as workdir:
        smi_file = os.path.join(workdir, 'predict.smi')
        output_csv = os.path.join(workdir, 'predict_fp.csv')

        progress(0, desc="Reading & processing new molecules...")
        yield "Reading uploaded file...", None, None

        df_new = pd.read_csv(uploaded_file.name)
        if 'canonical_smiles' not in df_new.columns:
            raise gr.Error("CSV must contain a 'canonical_smiles' column.")

        # A row number serves as the molecule name so predictions can be
        # joined back to the upload regardless of PaDEL's output order.
        df_new = df_new.reset_index().rename(columns={'index': 'mol_id'})
        padel_input = pd.DataFrame({
            'smiles': df_new['canonical_smiles'],
            'name': df_new['mol_id']
        })
        padel_input.to_csv(smi_file, sep='\t', index=False, header=False)

        progress(0.3, desc="Calculating fingerprints...")
        yield "Calculating fingerprints for new molecules...", None, None

        # Same options as the training-time fingerprint run (including
        # tautomer standardisation) so the features are comparable.
        padeldescriptor(
            mol_dir=smi_file,
            d_file=output_csv,
            descriptortypes=descriptortypes,
            detectaromaticity=True,
            standardizenitro=True,
            standardizetautomers=True,
            threads=-1,
            removesalt=True,
            log=False,
            fingerprints=True
        )

        if not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0:
            raise gr.Error("PaDEL calculation failed for the uploaded molecules.")

        progress(0.7, desc="Aligning features and predicting...")
        yield "Aligning features and predicting...", None, None

        df_fp = pd.read_csv(output_csv).rename(columns={'Name': 'mol_id'})

    X_new = df_fp.set_index('mol_id')
    # Reorder to the training feature set; features absent here become 0.
    X_new_aligned = X_new.reindex(columns=selected_features, fill_value=0)
    predictions = model.predict(X_new_aligned)

    results_subset = pd.DataFrame({
        'mol_id': X_new_aligned.index,
        'predicted_pIC50': predictions
    })
    # Left join: uploaded rows without a fingerprint keep a NaN prediction.
    df_results = pd.merge(df_new, results_subset, on='mol_id', how='left')

    progress(0.9, desc="Generating visualization...")
    yield "Generating visualization...", None, None

    df_grid_view = df_results.dropna(subset=['predicted_pIC50']).copy()
    mols_html = "<h3>No molecules with successful predictions to display.</h3>"
    if not df_grid_view.empty:
        try:
            # Preferred view: card grid.
            mols_html = create_molecule_grid_html(
                df_grid_view,
                smiles_col='canonical_smiles',
                additional_cols=['predicted_pIC50'],
                max_mols=50
            )
        except Exception as e:
            print(f"Grid layout failed: {e}, trying table layout...")
            mols_html = create_simple_molecule_table(
                df_grid_view,
                smiles_col='canonical_smiles',
                additional_cols=['predicted_pIC50'],
                max_mols=20
            )

    progress(1, desc="Complete!")
    yield "✅ Prediction complete.", df_results[['canonical_smiles', 'predicted_pIC50']], mols_html
 # ==============================================================================
-# === GRADIO INTERFACE LAYOUT ===
 # ==============================================================================
-with gr.Blocks(css=".container { max-width: 1200px; margin: auto; }") as demo:
-    app_state = gr.State({}) # Store application state (e.g., fetched data, trained models)
-    gr.Markdown("# 💊 Bioactivity Prediction App")
-    gr.Markdown("---")
-    with gr.Tabs():
-        with gr.TabItem("Step 1: Data Collection & EDA"):
-            with gr.Row():
-                with gr.Column():
-                    gr.Markdown("## 1.1 ChEMBL Target Search")
-                    target_query = gr.Textbox(
-                        label="Enter target protein name (e.g., 'EGFR', 'acetylcholinesterase')",
-                        placeholder="EGFR"
-                    )
-                    search_target_btn = gr.Button("Search ChEMBL")
-                    target_output_df = gr.DataFrame(
-                        label="Search Results",
-                        headers=["target_chembl_id", "pref_name", "organism"],
-                        max_rows=5
-                    )
-                    status_step1_search = gr.Textbox(label="Status", interactive=False)
-                    gr.Markdown("## 1.2 Fetch Bioactivity Data")
-                    chembl_id_selector = gr.Dropdown(
-                        label="Select Target ChEMBL ID",
-                        choices=[], interactive=True
-                    )
-                    fetch_data_btn = gr.Button("Fetch Bioactivity Data (IC50)")
-                    bioactivity_output_df = gr.DataFrame(label="Raw Bioactivity Data", max_rows=5)
-                    status_step1_fetch = gr.Textbox(label="Status", interactive=False)
-                    gr.Markdown("## 1.3 Clean & Process Data")
-                    process_data_btn = gr.Button("Process & Calculate pIC50/Lipinski Descriptors")
-                    cleaned_data_output_df = gr.DataFrame(label="Cleaned & Processed Data", max_rows=5)
-                    status_step1_process_clean = gr.Textbox(label="Status", interactive=False)
-                    download_s1 = gr.DownloadButton("Download Cleaned Data (CSV)", visible=False, interactive=False)
-                with gr.Column():
-                    gr.Markdown("## 1.4 Exploratory Data Analysis (EDA)")
-                    bioactivity_class_selector = gr.CheckboxGroup(
-                        label="Select Bioactivity Classes for EDA",
-                        choices=["active", "inactive", "intermediate"],
-                        value=["active", "inactive", "intermediate"],
-                        interactive=True
-                    )
-                    run_eda_btn = gr.Button("Run EDA")
-                    status_step1_process = gr.Textbox(label="Status", interactive=False)
-                    gr.Markdown("### Bioactivity Class Frequency")
-                    freq_plot_output = gr.Plot(label="Bioactivity Class Frequency")
-                    gr.Markdown("### Chemical Space (MW vs LogP)")
-                    scatter_plot_output = gr.Plot(label="MW vs LogP Scatter Plot")
-                    gr.Markdown("### pIC50 Distribution")
-                    pic50_plot_output = gr.Plot(label="pIC50 Box Plot")
-                    pic50_stats_output = gr.DataFrame(label="pIC50 Mann-Whitney U Test", max_rows=5)
-                    gr.Markdown("### Molecular Weight (MW) Distribution")
-                    mw_plot_output = gr.Plot(label="MW Box Plot")
-                    mw_stats_output = gr.DataFrame(label="MW Mann-Whitney U Test", max_rows=5)
-                    gr.Markdown("### LogP Distribution")
-                    logp_plot_output = gr.Plot(label="LogP Box Plot")
-                    logp_stats_output = gr.DataFrame(label="LogP Mann-Whitney U Test", max_rows=5)
-                    gr.Markdown("### Hydrogen Donors Distribution")
-                    hdonors_plot_output = gr.Plot(label="Hydrogen Donors Box Plot")
-                    hdonors_stats_output = gr.DataFrame(label="Hydrogen Donors Mann-Whitney U Test", max_rows=5)
-                    gr.Markdown("### Hydrogen Acceptors Distribution")
-                    hacceptors_plot_output = gr.Plot(label="Hydrogen Acceptors Box Plot")
-                    hacceptors_stats_output = gr.DataFrame(label="Hydrogen Acceptors Mann-Whitney U Test", max_rows=5)
-        with gr.TabItem("Step 2: Feature Engineering (Fingerprints)"):
-            gr.Markdown("## 2.1 Calculate Molecular Fingerprints")
-            gr.Markdown(f"Available Fingerprint Types: {', '.join(FP_list)}")
-            fingerprint_dropdown = gr.Dropdown(
-                label="Select Fingerprint Type",
-                choices=FP_list,
-                interactive=True,
-                value=FP_list[0] if FP_list else None # Set default if available
-            )
-            calculate_fp_btn = gr.Button("Calculate Fingerprints")
-            status_step2 = gr.Textbox(label="Status", interactive=False)
-            output_df_s2 = gr.DataFrame(label="Fingerprint Data (First 5 rows, with pIC50)", max_rows=5)
-            download_s2 = gr.DownloadButton("Download Fingerprint Data (CSV)", visible=False, interactive=False)
-            # Using HTML component for custom molecule display
-            mols_grid_s2 = gr.HTML(label="Molecules with pIC50", visible=True)
-        with gr.TabItem("Step 3: Model Training & Evaluation"):
-            gr.Markdown("## 3.1 Train Regression Models")
-            train_models_btn = gr.Button("Train Models")
-            status_step3_train = gr.Textbox(label="Status", interactive=False)
-            model_results_df = gr.DataFrame(label="Model Performance Metrics", max_rows=10)
-            gr.Markdown("## 3.2 Model Analysis")
-            model_selector_s3 = gr.Dropdown(
-                label="Select Model for Detailed Analysis",
-                choices=[],
-                interactive=True
-            )
-            feature_count_s3 = gr.Slider(
-                minimum=0, maximum=50, step=1, value=10,
-                label="Number of Top Features to Display", interactive=True
-            )
-            model_metrics_summary = gr.Textbox(label="Selected Model Metrics (Test Set)", interactive=False)
-            obs_pred_plot = gr.Plot(label="Observed vs. Predicted pIC50")
-            residuals_plot = gr.Plot(label="Residuals Plot")
-            feature_importance_plot = gr.Plot(label="Feature Importance/Coefficients")
-        with gr.TabItem("Step 4: Prediction on New Data"):
-            gr.Markdown("## 4.1 Upload New Molecules & Predict")
-            upload_new_mols = gr.File(label="Upload CSV with 'canonical_smiles' column")
-            model_selector_s4 = gr.Dropdown(
-                label="Select Trained Model for Prediction",
-                choices=[],
-                interactive=True
-            )
-            predict_btn = gr.Button("Predict pIC50 for New Molecules")
-            status_step4 = gr.Textbox(label="Status", interactive=False)
-            predictions_output_df = gr.DataFrame(label="Predictions for New Molecules", max_rows=10)
-            download_s4 = gr.DownloadButton("Download Predictions (CSV)", visible=False, interactive=False)
-            # Using HTML component for custom molecule display
-            mols_grid_s4 = gr.HTML(label="New Molecules with Predicted pIC50", visible=True)
     # --- EVENT HANDLERS ---
-    # Step 1 Callbacks
-    search_target_btn.click(
-        fn=lambda query: (get_target_chembl_id(query), gr.update(visible=True)), # Return two values
-        inputs=target_query,
-        outputs=[target_output_df, chembl_id_selector, status_step1_search]
-    )
-    chembl_id_selector.change(
-        fn=lambda x: gr.update(value=x),
-        inputs=chembl_id_selector,
-        outputs=chembl_id_selector # This is just to ensure the value is updated
-    )
-    fetch_data_btn.click(
-        fn=lambda target_id: (get_bioactivity_data(target_id), gr.update(value=target_id)),
-        inputs=chembl_id_selector,
-        outputs=[bioactivity_output_df, status_step1_fetch]
-    ).then(
-        fn=lambda df: (df, df.copy()), # Pass the fetched df to the state
-        inputs=bioactivity_output_df,
-        outputs=[gr.State(value={}, key='raw_data'), app_state]
-    )
-    process_data_btn.click(
-        fn=lambda current_state: clean_and_process_data(current_state.get('raw_data')),
-        inputs=app_state,
-        outputs=[cleaned_data_output_df, status_step1_process_clean]
-    ).then(
-        fn=lambda df: (gr.update(visible=True), df), # Show download button and update state
-        inputs=cleaned_data_output_df,
-        outputs=[download_s1, gr.State(value={}, key='cleaned_data'), app_state]
-    )
-    @download_s1.click(inputs=app_state, outputs=download_s1, show_progress="hidden")
-    def download_handler_s1(current_state):
-        df_to_download = current_state.get('cleaned_data')
-        if df_to_download is None:
-            raise gr.Error("No data to download. Please process data first.")
-        # Create a dummy file path for Gradio to handle the download
-        file_path = "cleaned_data.csv"
-        df_to_download.to_csv(file_path, index=False)
-        return gr.File(file_path, visible=True)
-    run_eda_btn.click(
-        fn=lambda df, classes: run_eda_analysis(df, classes),
-        inputs=[cleaned_data_output_df, bioactivity_class_selector],
-        outputs=[freq_plot_output, scatter_plot_output, pic50_plot_output, pic50_stats_output,
-                 mw_plot_output, mw_stats_output, logp_plot_output, logp_stats_output,
-                 hdonors_plot_output, hdonors_stats_output, hacceptors_plot_output, hacceptors_stats_output,
-                 status_step1_process]
-    )
-    # Update EDA plots on filter change (if data is already processed)
-    bioactivity_class_selector.change(
-        fn=lambda current_state, selected_classes: run_eda_analysis(current_state.get('cleaned_data'), selected_classes),
-        inputs=[app_state, bioactivity_class_selector],
-        outputs=[freq_plot_output, scatter_plot_output, pic50_plot_output, pic50_stats_output,
-                 mw_plot_output, mw_stats_output, logp_plot_output, logp_stats_output,
-                 hdonors_plot_output, hdonors_stats_output, hacceptors_plot_output, hacceptors_stats_output,
-                 status_step1_process],
-        show_progress="minimal"
-    )
-    # Step 2 Callbacks
-    calculate_fp_btn.click(
-        fn=calculate_fingerprints,
-        inputs=[app_state, fingerprint_dropdown],
-        outputs=[status_step2, output_df_s2, download_s2, mols_grid_s2, app_state]
-    )
-    @download_s2.click(inputs=app_state, outputs=download_s2, show_progress="hidden")
-    def download_handler(current_state):
-        df_to_download = current_state.get('fingerprint_data')
-        if df_to_download is None:
-            raise gr.Error("No data to download. Please calculate fingerprints first.")
-        file_path = "fingerprint_data.csv"
-        df_to_download.to_csv(file_path, index=False)
-        return gr.File(file_path, visible=True)
-    # Step 3 Callbacks
-    train_models_btn.click(
-        fn=handle_model_training,
-        inputs=[app_state],
-        outputs=[status_step3_train, model_results_df, model_selector_s3, app_state]
-    )
-    # Update plots when model or feature count changes
-    for listener in [model_selector_s3.change, feature_count_s3.change]:
-        listener(
-            fn=update_analysis_plots,
-            inputs=[model_selector_s3, feature_count_s3, app_state],
-            outputs=[obs_pred_plot, residuals_plot, feature_importance_plot, model_metrics_summary, status_step3_train]
-        )
-    # Update model selector in Step 4 when models are trained
-    model_selector_s3.change(
-        fn=lambda choice: gr.update(choices=model_selector_s3.choices, value=choice),
-        inputs=model_selector_s3,
-        outputs=model_selector_s4
-    )
-    # Step 4 Callbacks
-    predict_btn.click(
-        fn=predict_on_upload,
-        inputs=[upload_new_mols, model_selector_s4, app_state],
-        outputs=[status_step4, predictions_output_df, mols_grid_s4]
-    ).then(
-        fn=lambda: gr.update(visible=True),
-        outputs=download_s4
-    )
-    @download_s4.click(inputs=predictions_output_df, outputs=download_s4, show_progress="hidden")
-    def download_predictions(df_predictions):
-        if df_predictions is None or df_predictions.empty:
-            raise gr.Error("No predictions to download.")
-        file_path = "predictions.csv"
-        df_predictions.to_csv(file_path, index=False)
-        return gr.File(file_path, visible=True)
-demo.launch()

 # --- IMPORTS ---
 # Core and Data Handling
+import gradio as gr #
+import pandas as pd #
+import numpy as np #
+import os #
+import glob #
+import time #
+import warnings #
 # Chemistry and Cheminformatics
+from rdkit import Chem #
+from rdkit.Chem import Descriptors, Lipinski #
+from chembl_webresource_client.new_client import new_client #
+from padelpy import padeldescriptor #
+import mols2grid #
 # Plotting and Visualization
+import matplotlib.pyplot as plt #
+import seaborn as sns #
+from scipy import stats #
+from scipy.stats import mannwhitneyu #
 # Machine Learning Models and Metrics
+from sklearn.model_selection import train_test_split #
+from sklearn.feature_selection import VarianceThreshold #
+from sklearn.linear_model import ( #
+    LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge, #
+    HuberRegressor, PassiveAggressiveRegressor, OrthogonalMatchingPursuit, #
+    LassoLars #
 )
+from sklearn.tree import DecisionTreeRegressor #
+from sklearn.ensemble import ( #
+    RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, #
+    AdaBoostRegressor #
 )
+from sklearn.neighbors import KNeighborsRegressor #
+from sklearn.dummy import DummyRegressor #
+from sklearn.metrics import ( #
+    mean_absolute_error, mean_squared_error, r2_score #
 )
 # A placeholder class to store all results from a modeling run
class ModelRunResult:
    """Bundle of everything produced by one model-training run.

    The constructor stores the four arguments as-is; nothing is copied or
    validated.
    """

    def __init__(self, dataframe, plotter, models, selected_features):
        # Metrics table for the run (one row per trained model).
        self.dataframe = dataframe
        # Plot helper bound to the trained models and the held-out test set.
        self.plotter = plotter
        # Mapping of model display name -> fitted estimator.
        self.models = models
        # Feature/column names that survived feature selection.
        self.selected_features = selected_features

    def __repr__(self):
        # Keep the repr short: the stored DataFrame/estimators can be huge.
        model_names = list(self.models) if self.models else []
        n_features = len(self.selected_features) if self.selected_features is not None else 0
        return f"{type(self).__name__}(models={model_names!r}, n_features={n_features})"
 # Optional Advanced Models
try:
    import xgboost as xgb
    import lightgbm as lgb
    import catboost as cb
    _has_extra_libs = True
except (ImportError, OSError):
    # OSError covers the case where a package is installed but its native
    # shared library cannot be loaded; that must not take the whole app down.
    _has_extra_libs = False
    warnings.warn("Optional libraries (xgboost, lightgbm, catboost) not found. Some models will be unavailable.")

# --- GLOBAL CONFIGURATION & SETUP ---
warnings.filterwarnings("ignore")
sns.set_theme(style='whitegrid')

# --- FINGERPRINT CONFIGURATION ---
DESCRIPTOR_DIR = "padel_descriptors"

# NOTE(review): the blanket "ignore" filter installed above also silences the
# two warnings below, so a missing descriptor directory is currently only
# visible through an empty FP_list. Consider logging instead.
# Check if the descriptor directory exists and contains files
if not os.path.isdir(DESCRIPTOR_DIR):
    # The original call was truncated in this revision; message reconstructed.
    warnings.warn(
        f"The descriptor directory '{DESCRIPTOR_DIR}' was not found. "
        "Fingerprint calculation will not be possible."
    )
    xml_files = []
else:
    xml_files = sorted(glob.glob(os.path.join(DESCRIPTOR_DIR, '*.xml')))

if not xml_files:
    warnings.warn(
        f"No descriptor .xml files found in the '{DESCRIPTOR_DIR}' directory. "
        "Fingerprint calculation will not be possible."
    )

# The key is the filename without extension; the value is the full path to the file
fp_config = {os.path.splitext(os.path.basename(file))[0]: file for file in xml_files}
FP_list = sorted(fp_config)
 # ==============================================================================
 # === STEP 1: CORE DATA COLLECTION & EDA FUNCTIONS ===
 # ==============================================================================
def get_target_chembl_id(query):
    """Search ChEMBL for targets matching *query*.

    Returns:
        A 3-tuple ``(results_df, dropdown, status)``: a DataFrame with the
        ``target_chembl_id``, ``pref_name`` and ``organism`` columns, a
        ``gr.Dropdown`` whose choices are the found target IDs, and a
        human-readable status message.

    Raises:
        gr.Error: if the ChEMBL request or the result handling fails.
    """
    # A blank query is a user slip, not something worth a network round-trip.
    if not query or not str(query).strip():
        return pd.DataFrame(), gr.Dropdown(choices=[], value=None), "Please enter a target name to search."
    try:
        res = new_client.target.search(str(query).strip())
        if not res:
            return pd.DataFrame(), gr.Dropdown(choices=[], value=None), "No targets found for your query."
        df = pd.DataFrame(res)
        target_ids = df["target_chembl_id"].tolist()
        return (
            df[["target_chembl_id", "pref_name", "organism"]],
            gr.Dropdown(choices=target_ids),
            f"Found {len(df)} targets.",
        )
    except Exception as e:
        # Surface the failure in the UI but keep the original traceback chained.
        raise gr.Error(f"ChEMBL search failed: {e}") from e
def get_bioactivity_data(target_id):
    """Fetch all IC50 activity records ChEMBL holds for *target_id*.

    Returns:
        A 2-tuple ``(activities_df, status)``. The DataFrame is empty when
        ChEMBL returns no IC50 records for the target.

    Raises:
        gr.Error: if no target is selected or the ChEMBL request fails.
    """
    # An unset dropdown delivers None/""; never send that as the target filter.
    if not target_id:
        raise gr.Error("Please select a target ChEMBL ID first.")
    try:
        res = new_client.activity.filter(target_chembl_id=target_id).filter(standard_type="IC50")
        if not res:
            return pd.DataFrame(), "No IC50 bioactivity data found for this target."
        df = pd.DataFrame(res)
        return df, f"Fetched {len(df)} data points."
    except Exception as e:
        raise gr.Error(f"Failed to fetch bioactivity data: {e}") from e
def pIC50_calc(input_df):
    """Derive ``pIC50`` and ``bioactivity_class`` from ``standard_value``.

    ``standard_value`` is coerced to numeric (rows that cannot be parsed are
    dropped), capped at 100,000,000 and converted with
    ``pIC50 = -log10(value * 1e-9)``.
    NOTE(review): the 1e-9 factor treats the value as nanomolar; units are not
    checked here - confirm the upstream data is in nM.

    Classes: value <= 1000 -> "active", value >= 10000 -> "inactive",
    anything in between -> "intermediate". Non-positive values have no defined
    pIC50, so both ``pIC50`` and ``bioactivity_class`` are left as NaN for
    them instead of being labelled "active".

    Returns:
        A new DataFrame (the input is not modified) without the
        ``standard_value`` column and with the two derived columns appended.
    """
    numeric = pd.to_numeric(input_df['standard_value'], errors='coerce')
    parsable = numeric.notna()
    df_copy = input_df.loc[parsable].copy()

    capped = numeric[parsable].clip(upper=100000000)
    positive = capped > 0

    # Mask non-positive values first so log10 only ever sees valid input.
    df_copy['pIC50'] = -np.log10(capped.where(positive) * 1e-9)

    labels = np.select(
        [capped >= 10000, capped <= 1000],
        ["inactive", "active"],
        default="intermediate",
    )
    df_copy['bioactivity_class'] = pd.Series(labels, index=capped.index, dtype=object).where(positive)

    return df_copy.drop(columns=['standard_value'])
def lipinski_descriptors(smiles_series):
    """Compute MW, LogP and H-bond donor/acceptor counts for each SMILES.

    Entries that are empty, not strings, or that RDKit cannot parse are
    skipped.

    Returns:
        A 2-tuple ``(descriptors_df, valid_smiles)`` where row *i* of the
        DataFrame belongs to ``valid_smiles[i]``. When nothing could be
        parsed, an empty DataFrame with the four columns and an empty list.
    """
    column_names = ["MW", "LogP", "NumHDonors", "NumHAcceptors"]

    # Parse once, keeping each accepted SMILES next to its molecule.
    parsed = []
    for smiles in smiles_series:
        if not (smiles and isinstance(smiles, str)):
            continue
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            parsed.append((smiles, mol))

    if not parsed:
        return pd.DataFrame(columns=column_names), []

    valid_smiles = [smiles for smiles, _ in parsed]
    rows = [
        [
            Descriptors.MolWt(mol),
            Descriptors.MolLogP(mol),
            Lipinski.NumHDonors(mol),
            Lipinski.NumHAcceptors(mol),
        ]
        for _, mol in parsed
    ]
    return pd.DataFrame(data=np.array(rows), columns=column_names), valid_smiles
def clean_and_process_data(df):
    """Clean raw ChEMBL activity data and append pIC50 and Lipinski columns.

    Steps: back-fill ``canonical_smiles`` from ChEMBL when the column is
    missing or entirely null, drop rows without a value or SMILES, keep one
    row per SMILES, coerce ``standard_value`` to numeric, compute
    pIC50/bioactivity class, then append the Lipinski descriptors.

    Returns:
        A 2-tuple ``(processed_df, status)``. The DataFrame is empty when no
        compound survives the cleaning.

    Raises:
        gr.Error: if there is no input data, a required column is missing, or
            the SMILES lookup against ChEMBL fails.
    """
    if df is None or df.empty:
        raise gr.Error("No data to process. Please fetch data first.")
    if "standard_value" not in df.columns:
        raise gr.Error("Input data has no 'standard_value' column.")

    # Work on a copy so the caller's frame is never modified.
    df = df.copy()

    if "canonical_smiles" not in df.columns or df["canonical_smiles"].isnull().all():
        try:
            molecule_ids = list(df["molecule_chembl_id"].dropna().unique())
            records = new_client.molecule.get(molecule_ids)
            # Map by ID rather than by position: the response is not guaranteed
            # to line up row-for-row with df, and 'molecule_structures' may be
            # present but None.
            smiles_by_id = {
                rec.get("molecule_chembl_id"): (rec.get("molecule_structures") or {}).get("canonical_smiles")
                for rec in records
            }
            df["canonical_smiles"] = df["molecule_chembl_id"].map(smiles_by_id)
        except Exception as e:
            raise gr.Error(f"Could not fetch SMILES from ChEMBL: {e}") from e

    df = df[df["standard_value"].notna() & df["canonical_smiles"].notna()]
    df = df.drop_duplicates(['canonical_smiles']).copy()
    df["standard_value"] = pd.to_numeric(df["standard_value"], errors='coerce')
    df = df.dropna(subset=['standard_value'])

    df_processed = pIC50_calc(df)
    df_processed = df_processed[df_processed["pIC50"].notna()]
    if df_processed.empty:
        return pd.DataFrame(), "No compounds remaining after pIC50 calculation."

    df_lipinski, valid_smiles = lipinski_descriptors(df_processed['canonical_smiles'])
    if not valid_smiles:
        return pd.DataFrame(), "No valid SMILES could be processed for Lipinski descriptors."

    # SMILES are unique at this point and lipinski_descriptors preserves input
    # order, so after filtering the two frames line up row-for-row.
    df_processed = df_processed[df_processed['canonical_smiles'].isin(valid_smiles)].reset_index(drop=True)
    df_lipinski = df_lipinski.reset_index(drop=True)
    df_final = pd.concat([df_processed, df_lipinski], axis=1)
    return df_final, f"Processing complete. {len(df_final)} compounds remain after cleaning."
def run_eda_analysis(df, selected_classes):
    """Build every EDA plot and statistics table for the chosen classes.

    Returns:
        A 13-tuple: frequency plot, scatter plot, then a (box plot,
        Mann-Whitney table) pair for each of pIC50, MW, LogP, NumHDonors and
        NumHAcceptors, and finally a status message. When no row matches the
        selected classes, plots are ``None`` and tables are empty DataFrames.

    Raises:
        gr.Error: if *df* is ``None`` or empty.
    """
    if df is None or df.empty:
        raise gr.Error("No data available for analysis.")

    descriptors = ('pIC50', 'MW', 'LogP', 'NumHDonors', 'NumHAcceptors')
    subset = df[df.bioactivity_class.isin(selected_classes)].copy()

    if subset.empty:
        outputs = [None, None]
        for _ in descriptors:
            outputs.extend([None, pd.DataFrame()])
        return (*outputs, "No data for selected classes.")

    outputs = [create_frequency_plot(subset), create_scatter_plot(subset)]
    for descriptor in descriptors:
        outputs.append(create_boxplot(subset, descriptor))
        outputs.append(mannwhitney_test(subset, descriptor))

    # Release pyplot's references; the Figure objects themselves are returned.
    plt.close('all')
    return (*outputs, f"EDA complete for {len(subset)} compounds.")
def create_frequency_plot(df):
    """Return a bar chart of how many compounds fall in each bioactivity class."""
    class_palette = {'active': '#1f77b4', 'inactive': '#ff7f0e', 'intermediate': '#2ca02c'}
    class_counts = df['bioactivity_class'].value_counts()

    plt.figure(figsize=(5.5, 5.5))
    sns.barplot(x=class_counts.index, y=class_counts.values, palette=class_palette)
    plt.xlabel('Bioactivity Class', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    plt.title('Frequency of Bioactivity Classes', fontsize=14)
    return plt.gcf()
def create_scatter_plot(df):
    """Return an MW-vs-LogP scatter plot coloured by class and sized by pIC50."""
    class_palette = {'active': '#1f77b4', 'inactive': '#ff7f0e', 'intermediate': '#2ca02c'}

    plt.figure(figsize=(5.5, 5.5))
    sns.scatterplot(
        data=df,
        x='MW',
        y='LogP',
        hue='bioactivity_class',
        size='pIC50',
        palette=class_palette,
        sizes=(20, 200),
        alpha=0.7,
    )
    plt.xlabel('Molecular Weight (MW)', fontsize=12)
    plt.ylabel('LogP', fontsize=12)
    plt.title('Chemical Space: MW vs. LogP', fontsize=14)
    plt.legend(title='Bioactivity Class')
    return plt.gcf()
def create_boxplot(df, descriptor):
    """Return a box plot of *descriptor* grouped by bioactivity class."""
    class_palette = {'active': '#1f77b4', 'inactive': '#ff7f0e', 'intermediate': '#2ca02c'}

    plt.figure(figsize=(5.5, 5.5))
    sns.boxplot(x='bioactivity_class', y=descriptor, data=df, palette=class_palette)
    plt.xlabel('Bioactivity Class', fontsize=12)
    plt.ylabel(descriptor, fontsize=12)
    plt.title(f'{descriptor} by Bioactivity Class', fontsize=14)
    return plt.gcf()
def mannwhitney_test(df, descriptor):
    """Run pairwise Mann-Whitney U tests on *descriptor* between classes.

    The three class pairs (active/inactive, active/intermediate,
    inactive/intermediate) are tested whenever both classes have at least one
    non-null value of *descriptor*.

    Returns:
        A DataFrame with the columns ``Comparison``, ``Statistics``,
        ``p-value`` and ``Interpretation``; one row per tested pair. The
        columns are present even when no pair could be tested.
    """
    columns = ['Comparison', 'Statistics', 'p-value', 'Interpretation']
    class_pairs = [('active', 'inactive'), ('active', 'intermediate'), ('inactive', 'intermediate')]

    # Split the data once instead of re-filtering the frame for every pair.
    samples = {
        label: group[descriptor].dropna()
        for label, group in df.groupby('bioactivity_class')
    }

    results = []
    for c1, c2 in class_pairs:
        d1, d2 = samples.get(c1), samples.get(c2)
        if d1 is None or d2 is None or d1.empty or d2.empty:
            continue
        stat, p = mannwhitneyu(d1, d2)
        results.append({
            'Comparison': f'{c1.title()} vs {c2.title()}',
            'Statistics': stat,
            'p-value': p,
            'Interpretation': 'Different distribution (p < 0.05)' if p <= 0.05 else 'Same distribution (p > 0.05)',
        })
    return pd.DataFrame(results, columns=columns)
 # ==============================================================================
 # === STEP 2: FEATURE ENGINEERING FUNCTIONS ===
 # ==============================================================================
def calculate_fingerprints(current_state, fingerprint_type, progress=gr.Progress()):
    """Compute PaDEL fingerprints for the cleaned compounds (generator).

    Reads ``current_state['cleaned_data']``, runs PaDEL with the descriptor
    XML registered for *fingerprint_type*, joins the result with pIC50 and
    stores it under ``current_state['fingerprint_data']`` (plus
    ``current_state['fingerprint_type']``).

    Yields:
        5-tuples ``(status, dataframe, download_button_update, grid_html,
        state)``; intermediate yields carry only a status message.

    Raises:
        gr.Error: on missing prerequisites or any failure during calculation.
    """
    import tempfile  # local import keeps this block self-contained

    input_df = current_state.get('cleaned_data')
    if input_df is None or input_df.empty:
        raise gr.Error("No cleaned data found. Please complete Step 1.")
    if not fingerprint_type:
        raise gr.Error("Please select a fingerprint type.")
    descriptortypes = fp_config.get(fingerprint_type)
    if not descriptortypes:
        raise gr.Error(f"Descriptor XML for '{fingerprint_type}' not found.")

    progress(0, desc="Starting...")
    yield f"🧪 Starting fingerprint calculation...", None, gr.update(visible=False), None, current_state
    try:
        # A private scratch directory per call: fixed filenames in the working
        # directory would be overwritten by a concurrent session, and the
        # directory is removed automatically on exit.
        with tempfile.TemporaryDirectory() as work_dir:
            smi_file = os.path.join(work_dir, 'molecule.smi')
            output_csv = os.path.join(work_dir, 'fingerprints.csv')
            # The SMILES doubles as the molecule name so the PaDEL output can
            # be joined back on it below.
            input_df[['canonical_smiles', 'canonical_smiles']].to_csv(smi_file, sep='\t', index=False, header=False)

            progress(0.3, desc="⚗️ Running PaDEL...")
            yield f"⚗️ Running PaDEL...", None, gr.update(visible=False), None, current_state
            padeldescriptor(
                mol_dir=smi_file,
                d_file=output_csv,
                descriptortypes=descriptortypes,
                detectaromaticity=True,
                standardizenitro=True,
                standardizetautomers=True,
                threads=-1,
                removesalt=True,
                log=False,
                fingerprints=True,
            )
            if not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0:
                raise gr.Error("PaDEL failed to produce an output file. Check molecule validity.")

            progress(0.7, desc="📊 Processing results...")
            yield "📊 Processing results...", None, gr.update(visible=False), None, current_state
            df_X = pd.read_csv(output_csv).rename(columns={'Name': 'canonical_smiles'})

        final_df = pd.merge(input_df[['canonical_smiles', 'pIC50']], df_X, on='canonical_smiles', how='inner')
        current_state['fingerprint_data'] = final_df
        current_state['fingerprint_type'] = fingerprint_type

        progress(0.9, desc="🖼️ Generating molecule grid...")
        mols_html = mols2grid.display(
            final_df,
            smiles_col='canonical_smiles',
            subset=['img', 'pIC50'],
            rename={"pIC50": "pIC50"},
            transform={"pIC50": lambda x: f"{x:.2f}"},
        )._repr_html_()
        success_msg = f"✅ Success! Generated {len(df_X.columns) -1} descriptors for {len(final_df)} molecules."
        progress(1, desc="Completed!")
        yield success_msg, final_df, gr.update(visible=True), gr.update(value=mols_html, visible=True), current_state
    except gr.Error:
        # Already a user-facing error; do not wrap it a second time.
        raise
    except Exception as e:
        raise gr.Error(f"Calculation failed: {e}") from e
 # ==============================================================================
+# === STEP 3: MODEL TRAINING & PREDICTION FUNCTIONS ===
 # ==============================================================================
class ModelPlotter:
    """Diagnostic plots for fitted models evaluated on one held-out test set."""

    def __init__(self, models: dict, X_test: pd.DataFrame, y_test: pd.Series):
        # models maps a display name to a fitted estimator exposing .predict().
        self._models, self._X_test, self._y_test = models, X_test, y_test

    def _get_model(self, model_name: str):
        """Return the stored model; raise ValueError for an unknown name."""
        if model_name not in self._models:
            raise ValueError(f"Model '{model_name}' not found.")
        return self._models[model_name]

    def plot_validation(self, model_name: str):
        """Return a 2x2 figure of validation plots for *model_name*.

        Panels: actual vs. predicted, residuals vs. predicted, residual
        histogram, and a normal Q-Q plot of the residuals.

        Raises:
            ValueError: if *model_name* is not a known model.
        """
        y_pred = self._get_model(model_name).predict(self._X_test)
        residuals = self._y_test - y_pred

        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
        fig.suptitle(f'Model Validation Plots for {model_name}', fontsize=16, y=1.02)

        ax = axes[0, 0]
        sns.scatterplot(x=self._y_test, y=y_pred, ax=ax, alpha=0.6)
        ax.set_title('Actual vs. Predicted')
        ax.set_xlabel('Actual pIC50')
        ax.set_ylabel('Predicted pIC50')
        # Identity line spanning the joint range of observed and predicted values.
        lims = [min(self._y_test.min(), y_pred.min()), max(self._y_test.max(), y_pred.max())]
        ax.plot(lims, lims, 'r--', alpha=0.75, zorder=0)

        ax = axes[0, 1]
        sns.scatterplot(x=y_pred, y=residuals, ax=ax, alpha=0.6)
        ax.axhline(y=0, color='r', linestyle='--')
        ax.set_title('Residuals vs. Predicted')
        ax.set_xlabel('Predicted pIC50')
        ax.set_ylabel('Residuals')

        sns.histplot(residuals, kde=True, ax=axes[1, 0])
        axes[1, 0].set_title('Distribution of Residuals')

        stats.probplot(residuals, dist="norm", plot=axes[1, 1])
        axes[1, 1].set_title('Normal Q-Q Plot')

        plt.tight_layout()
        return fig

    def plot_feature_importance(self, model_name: str, top_n: int = 7):
        """Return a bar chart of the *top_n* most important features.

        Uses ``feature_importances_`` when the model has it, otherwise the
        absolute values of ``coef_``. Returns ``None`` for models exposing
        neither. *top_n* is clamped to the range 1..number of features.

        Raises:
            ValueError: if *model_name* is not a known model.
        """
        model = self._get_model(model_name)
        if hasattr(model, 'feature_importances_'):
            importances = np.ravel(model.feature_importances_)
        elif hasattr(model, 'coef_'):
            # ravel: some estimators report coefficients as a (1, n) array.
            importances = np.abs(np.ravel(model.coef_))
        else:
            return None

        # A zero/negative count would request a zero-height figure, and a count
        # above the feature total would make the title overstate the plot.
        top_n = max(1, min(int(top_n), len(importances)))

        top_features = (
            pd.DataFrame({'Feature': self._X_test.columns, 'Importance': importances})
            .sort_values(by='Importance', ascending=False)
            .head(top_n)
        )
        plt.figure(figsize=(10, top_n * 0.5))
        sns.barplot(x='Importance', y='Feature', data=top_features, palette='viridis', orient='h')
        plt.title(f'Top {top_n} Features for {model_name}')
        plt.tight_layout()
        return plt.gcf()
def run_regression_suite(df: pd.DataFrame, progress=gr.Progress()):
    """Train and evaluate a suite of regressors on fingerprint data (generator).

    *df* must contain a ``pIC50`` target column; every other column except
    ``canonical_smiles`` is used as a feature. The data is split 80/20,
    low-variance features are removed (threshold 0.1, fitted on the training
    split only), and each model is fitted and scored on the test split.

    Yields:
        3-tuples ``(status, results, dropdown)``. Intermediate yields carry
        ``None`` for the last two items; the final yield carries a
        ``ModelRunResult`` and a ``gr.Dropdown`` listing the models ordered by
        descending R².

    Raises:
        gr.Error: if there is no usable data, the split fails, or feature
            selection rejects the training data.
    """
    if df is None or df.empty or 'pIC50' not in df.columns:
        raise gr.Error("No fingerprint data with a 'pIC50' column available. Please complete Step 2.")

    progress(0, desc="Splitting data...")
    yield "Splitting data (80/20 train/test split)...", None, None
    X = df.drop(columns=['pIC50', 'canonical_smiles'], errors='ignore')
    y = df['pIC50']
    try:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    except ValueError as e:
        # Raised e.g. when there are too few rows to form both splits.
        raise gr.Error(f"Could not split the data into train/test sets: {e}") from e

    progress(0.1, desc="Selecting features...")
    yield "Performing feature selection (removing low variance)...", None, None
    selector = VarianceThreshold(threshold=0.1)
    try:
        selector.fit(X_train)
    except ValueError as e:
        # Raised e.g. when no feature meets the variance threshold.
        raise gr.Error(f"Feature selection failed: {e}") from e
    kept_columns = X_train.columns[selector.get_support()]
    X_train = pd.DataFrame(selector.transform(X_train), columns=kept_columns, index=X_train.index)
    X_test = pd.DataFrame(selector.transform(X_test), columns=kept_columns, index=X_test.index)
    selected_features = X_train.columns.tolist()

    # NOTE: Gradient Boosting, XGBoost and CatBoost were commented out of the
    # suite in the original code; the reason is not recorded here.
    model_defs = [
        ('Linear Regression', LinearRegression()),
        ('Ridge', Ridge(random_state=42)),
        ('Lasso', Lasso(random_state=42)),
        ('Random Forest', RandomForestRegressor(random_state=42, n_jobs=-1)),
    ]
    if _has_extra_libs:
        model_defs.append(
            ('LightGBM', lgb.LGBMRegressor(random_state=42, n_jobs=-1, verbosity=-1))
        )

    results_list, trained_models = [], {}
    n_models = len(model_defs)
    for i, (name, model) in enumerate(model_defs):
        progress(0.2 + (i / n_models) * 0.8, desc=f"Training {name}...")
        yield f"Training {i+1}/{n_models}: {name}...", None, None
        start_time = time.time()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        elapsed = time.time() - start_time
        results_list.append({
            'Model': name,
            'R²': r2_score(y_test, y_pred),
            'MAE': mean_absolute_error(y_test, y_pred),
            'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
            'Time (s)': f"{elapsed:.2f}",
        })
        trained_models[name] = model

    results_df = pd.DataFrame(results_list).sort_values(by='R²', ascending=False).reset_index(drop=True)
    plotter = ModelPlotter(trained_models, X_test, y_test)
    model_run_results = ModelRunResult(results_df, plotter, trained_models, selected_features)
    model_choices = results_df['Model'].tolist()
    yield "✅ Model training & evaluation complete.", model_run_results, gr.Dropdown(choices=model_choices, interactive=True)
def predict_on_upload(uploaded_file, model_name, current_state, progress=gr.Progress()):
    """Predict pIC50 for molecules in an uploaded CSV (generator).

    The CSV must contain a ``canonical_smiles`` column. Fingerprints are
    computed with the same PaDEL settings used for training, aligned to the
    features the model was trained on, and passed to the selected model.
    Molecules for which PaDEL produced no complete fingerprint get no
    prediction (NaN in the result table) and are left out of the grid.

    Yields:
        3-tuples ``(status, predictions_df, grid_html)``; intermediate yields
        carry only a status message.

    Raises:
        gr.Error: on missing prerequisites, a malformed upload, or a failed
            fingerprint calculation.
    """
    import tempfile  # local import keeps this block self-contained

    if not uploaded_file:
        raise gr.Error("Please upload a file.")
    if not model_name:
        raise gr.Error("Please select a trained model.")
    model_run_results = current_state.get('model_results')
    fingerprint_type = current_state.get('fingerprint_type')
    if not model_run_results or not fingerprint_type:
        raise gr.Error("Please run Steps 2 and 3 first.")
    model = model_run_results.models.get(model_name)
    selected_features = model_run_results.selected_features
    if model is None:
        raise gr.Error(f"Model '{model_name}' not found.")
    descriptortypes = fp_config.get(fingerprint_type)
    if not descriptortypes:
        raise gr.Error(f"Descriptor XML for '{fingerprint_type}' not found.")

    # Depending on the Gradio version/config the upload arrives either as a
    # path string or as a file-like object exposing the path via ``.name``.
    upload_path = uploaded_file if isinstance(uploaded_file, str) else uploaded_file.name

    progress(0, desc="Reading & processing new molecules...")
    yield "Reading uploaded file...", None, None
    df_new = pd.read_csv(upload_path)
    if 'canonical_smiles' not in df_new.columns:
        raise gr.Error("CSV must contain a 'canonical_smiles' column.")
    # A numeric id is used as the PaDEL molecule name so results can be joined
    # back to the uploaded rows.
    df_new = df_new.reset_index().rename(columns={'index': 'mol_id'})
    padel_input = pd.DataFrame({'smiles': df_new['canonical_smiles'], 'name': df_new['mol_id']})

    # Private scratch directory per call: fixed filenames in the working
    # directory would be overwritten by a concurrent session.
    with tempfile.TemporaryDirectory() as work_dir:
        smi_file = os.path.join(work_dir, 'predict.smi')
        output_csv = os.path.join(work_dir, 'predict_fp.csv')
        padel_input.to_csv(smi_file, sep='\t', index=False, header=False)

        progress(0.3, desc="Calculating fingerprints...")
        yield "Calculating fingerprints for new molecules...", None, None
        # Same standardisation flags as calculate_fingerprints (including
        # standardizetautomers) so new fingerprints match the training ones.
        padeldescriptor(
            mol_dir=smi_file,
            d_file=output_csv,
            descriptortypes=descriptortypes,
            detectaromaticity=True,
            standardizenitro=True,
            standardizetautomers=True,
            threads=-1,
            removesalt=True,
            log=False,
            fingerprints=True,
        )
        if not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0:
            raise gr.Error("PaDEL calculation failed for the uploaded molecules.")
        df_fp = pd.read_csv(output_csv).rename(columns={'Name': 'mol_id'})

    progress(0.7, desc="Aligning features and predicting...")
    yield "Aligning features and predicting...", None, None
    X_new = df_fp.set_index('mol_id')
    # Features the model expects but PaDEL did not emit are filled with 0.
    X_new_aligned = X_new.reindex(columns=selected_features, fill_value=0)
    # Rows with missing fingerprint bits cannot be scored reliably.
    X_complete = X_new_aligned.dropna()
    if X_complete.empty:
        raise gr.Error("PaDEL calculation failed for the uploaded molecules.")
    predictions = model.predict(X_complete)
    results_subset = pd.DataFrame({'mol_id': X_complete.index, 'predicted_pIC50': predictions})
    df_results = pd.merge(df_new, results_subset, on='mol_id', how='left')

    progress(0.9, desc="Generating visualization...")
    yield "Generating visualization...", None, None
    df_grid_view = df_results.dropna(subset=['predicted_pIC50']).copy()
    mols_html = "<h3>No molecules with successful predictions to display.</h3>"
    if not df_grid_view.empty:
        df_grid_view.rename(columns={"predicted_pIC50": "Predicted pIC50"}, inplace=True)
        mols_html = mols2grid.display(
            df_grid_view,
            smiles_col='canonical_smiles',
            subset=['img', 'Predicted pIC50'],
            transform={"Predicted pIC50": lambda x: f"{x:.2f}"},
        )._repr_html_()

    progress(1, desc="Complete!")
    yield "✅ Prediction complete.", df_results[['canonical_smiles', 'predicted_pIC50']], mols_html
 # ==============================================================================
+# === GRADIO INTERFACE ===
 # ==============================================================================
+with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="sky"), title="Comprehensive Drug Discovery Workflow") as demo: # Top-level Blocks app; `demo` is launched by the __main__ guard at the end of the file
+    gr.Markdown("# 🧪 Comprehensive Drug Discovery Workflow") # Page heading
+    gr.Markdown("A 3-step application to fetch, analyze, and model chemical bioactivity data.") # One-line description shown under the heading
+    app_state = gr.State({}) # State dict handed to the event handlers: they store 'cleaned_data' and 'model_results' in it and read 'fingerprint_data' / 'fingerprint_type' from it
+    with gr.Tabs(): # One tab per workflow step (Step 1, Step 2, Step 3)
+        with gr.Tab("Step 1: Data Collection & EDA"): # Step 1 tab: target search, data processing and EDA outputs
+            gr.Markdown("## Fetch Bioactivity Data from ChEMBL and Perform Exploratory Analysis") #
+            with gr.Row(): # Query box and fetch button side by side
+                query_input = gr.Textbox(label="Target Query", placeholder="e.g., acetylcholinesterase, BRAF kinase", scale=3) # Input of fetch_btn.click (passed to get_target_chembl_id)
+                fetch_btn = gr.Button("Fetch Targets", variant="primary", scale=1) #
+            status_step1_fetch = gr.Textbox(label="Status", interactive=False) # Status output of the fetch step
+            target_id_table = gr.Dataframe(label="Available Targets", interactive=False, headers=["target_chembl_id", "pref_name", "organism"]) # Table output of the fetch step
+            with gr.Row(): # Target selector and process button side by side
+                selected_target_dropdown = gr.Dropdown(label="Select Target ChEMBL ID", interactive=True, scale=3) # Also an output of the fetch step; its change event toggles process_btn
+                process_btn = gr.Button("Process Data & Run EDA", variant="primary", scale=1, interactive=False) # Starts disabled; enable_process_button makes it interactive once a target is selected
+            status_step1_process = gr.Textbox(label="Status", interactive=False) # Status output shared by the processing handler and the class-filter handler
+            gr.Markdown("### Filtered Data & Analysis") #
+            bioactivity_class_selector = gr.CheckboxGroup(["active", "inactive", "intermediate"], label="Filter by Bioactivity Class", value=["active", "inactive", "intermediate"]) # All three classes selected by default; changing the selection triggers update_analysis_on_filter_change
+            df_output_s1 = gr.Dataframe(label="Cleaned Bioactivity Data") # Receives the processed data, then the rows matching the selected classes
+            with gr.Tabs(): # Nested tabs, one per EDA view
+                with gr.Tab("Chemical Space Overview"): #
+                    with gr.Row(): #
+                        freq_plot_output = gr.Plot(label="Frequency of Bioactivity Classes") #
+                        scatter_plot_output = gr.Plot(label="Scatter Plot: MW vs LogP") #
+                with gr.Tab("pIC50 Analysis"): # Plot plus statistics table in one row
+                    with gr.Row(): #
+                        pic50_plot_output = gr.Plot(label="pIC50 Box Plot") #
+                        pic50_stats_output = gr.Dataframe(label="Mann-Whitney U Test Results for pIC50") #
+                with gr.Tab("Molecular Weight Analysis"): # Plot plus statistics table in one row
+                    with gr.Row(): #
+                        mw_plot_output = gr.Plot(label="MW Box Plot") #
+                        mw_stats_output = gr.Dataframe(label="Mann-Whitney U Test Results for MW") #
+                with gr.Tab("LogP Analysis"): # Plot plus statistics table in one row
+                    with gr.Row(): #
+                        logp_plot_output = gr.Plot(label="LogP Box Plot") #
+                        logp_stats_output = gr.Dataframe(label="Mann-Whitney U Test Results for LogP") #
+                with gr.Tab("H-Bond Donor/Acceptor Analysis"): # Two rows: both plots first, then both statistics tables
+                    with gr.Row(): #
+                        hdonors_plot_output = gr.Plot(label="H-Donors Box Plot") #
+                        hacceptors_plot_output = gr.Plot(label="H-Acceptors Box Plot") #
+                    with gr.Row(): #
+                        hdonors_stats_output = gr.Dataframe(label="Stats for H-Donors") #
+                        hacceptors_stats_output = gr.Dataframe(label="Stats for H-Acceptors") #
+        with gr.Tab("Step 2: Feature Engineering"): # Step 2 tab: fingerprint calculation on the data stored in app_state
+            gr.Markdown("## Calculate Molecular Fingerprints using PaDEL") #
+            with gr.Row(): #
+                fingerprint_dropdown = gr.Dropdown(choices=FP_list, value='PubChem' if 'PubChem' in FP_list else None, label="Select Fingerprint Method", scale=3) # Preselects 'PubChem' when it is in FP_list, otherwise no default
+                calculate_fp_btn = gr.Button("Calculate Fingerprints", variant="primary", scale=1) # Click runs calculate_fingerprints
+            status_step2 = gr.Textbox(label="Status", interactive=False) #
+            output_df_s2 = gr.Dataframe(label="Final Processed Data", wrap=True) #
+            download_s2 = gr.DownloadButton("Download Feature Data (CSV)", variant="secondary", visible=False) # Hidden initially; it is also an output of calculate_fp_btn.click
+            mols_grid_s2 = gr.HTML(label="Interactive Molecule Viewer") # HTML output of calculate_fingerprints
+        with gr.Tab("Step 3: Model Training & Prediction"): # Step 3 tab: training/evaluation and prediction sub-tabs
+            gr.Markdown("## Train Regression Models and Predict pIC50") #
+            with gr.Tabs(): #
+                with gr.Tab("Model Training & Evaluation"): #
+                    train_models_btn = gr.Button("Train All Models", variant="primary") # Click runs handle_model_training
+                    status_step3_train = gr.Textbox(label="Status", interactive=False) #
+                    model_results_df = gr.DataFrame(label="Ranked Model Results", interactive=False) #
+                    with gr.Row(): #
+                        model_selector_s3 = gr.Dropdown(label="Select Model to Analyze", interactive=False) # Non-interactive initially; handle_model_training yields an update for it
+                        feature_count_s3 = gr.Number(label="Top Features to Show", value=7, minimum=3, maximum=20, step=1) # Passed to plot_feature_importance via update_analysis_plots
+                    with gr.Tabs(): #
+                        with gr.Tab("Validation Plots"): validation_plot_s3 = gr.Plot(label="Model Validation Plots") #
+                        with gr.Tab("Feature Importance"): feature_plot_s3 = gr.Plot(label="Top Feature Importances") #
+                with gr.Tab("Predict on New Data"): #
+                    gr.Markdown("Upload a CSV with a `canonical_smiles` column to predict pIC50.") #
+                    with gr.Row(): #
+                        upload_predict_file = gr.File(label="Upload CSV for Prediction", file_types=[".csv"]) # Upload restricted to .csv files
+                        predict_btn_s3 = gr.Button("Run Prediction", variant="primary") # Click runs predict_on_upload
+                    status_step3_predict = gr.Textbox(label="Status", interactive=False) #
+                    prediction_results_df = gr.DataFrame(label="Prediction Results") #
+                    prediction_mols_grid = gr.HTML(label="Interactive Molecular Grid of Predictions") #
     # --- EVENT HANDLERS ---
+    def enable_process_button(target_id): return gr.update(interactive=bool(target_id)) #
+    def process_and_analyze_wrapper(target_id, selected_classes, current_state, progress=gr.Progress()): #
+        if not target_id: raise gr.Error("Please select a target ChEMBL ID first.") #
+        progress(0, desc="Fetching data..."); raw_data, msg1 = get_bioactivity_data(target_id); yield {status_step1_process: gr.update(value=msg1)} #
+        progress(0.3, desc="Cleaning data..."); processed_data, msg2 = clean_and_process_data(raw_data); yield {df_output_s1: processed_data, status_step1_process: gr.update(value=msg2)} #
+        current_state['cleaned_data'] = processed_data #
+        progress(0.6, desc="Running EDA..."); plots_and_stats = run_eda_analysis(processed_data, selected_classes); msg3 = plots_and_stats[-1] #
+        progress(1, desc="Done!") #
+        filtered_data = processed_data[processed_data.bioactivity_class.isin(selected_classes)] if not processed_data.empty else pd.DataFrame() #
+        outputs = [filtered_data] + list(plots_and_stats[:-1]) + [msg3, current_state] #
+        output_components = [df_output_s1, freq_plot_output, scatter_plot_output, pic50_plot_output, pic50_stats_output, mw_plot_output, mw_stats_output, logp_plot_output, logp_stats_output, hdonors_plot_output, hdonors_stats_output, hacceptors_plot_output, hacceptors_stats_output, status_step1_process, app_state] #
+        yield dict(zip(output_components, outputs)) #
+    def update_analysis_on_filter_change(selected_classes, current_state): #
+        cleaned_data = current_state.get('cleaned_data') #
+        if cleaned_data is None or cleaned_data.empty: return (pd.DataFrame(),) + (None,) * 11 + ("No data available.",) #
+        plots_and_stats = run_eda_analysis(cleaned_data, selected_classes); msg = plots_and_stats[-1] #
+        filtered_data = cleaned_data[cleaned_data.bioactivity_class.isin(selected_classes)] #
+        return (filtered_data,) + plots_and_stats[:-1] + (msg,) #
+    def handle_model_training(current_state, progress=gr.Progress(track_tqdm=True)): #
+        fingerprint_data = current_state.get('fingerprint_data') #
+        if fingerprint_data is None or fingerprint_data.empty: raise gr.Error("No feature data. Please complete Step 2.") #
+        for status_msg, model_results, model_choices_update in run_regression_suite(fingerprint_data, progress=progress): #
+            if model_results: current_state['model_results'] = model_results #
+            yield status_msg, model_results.dataframe if model_results else None, model_choices_update, current_state #
+    def save_dataframe_as_csv(df): #
+        if df is None or df.empty: return None #
+        filename = "feature_engineered_data.csv"; df.to_csv(filename, index=False); return gr.File(value=filename, visible=True) #
+    def update_analysis_plots(model_name, feature_count, current_state): #
+        model_results = current_state.get('model_results') #
+        if not model_results or not model_name: return None, None #
+        plotter = model_results.plotter; validation_fig = plotter.plot_validation(model_name); feature_fig = plotter.plot_feature_importance(model_name, int(feature_count)); plt.close('all'); return validation_fig, feature_fig #
+    fetch_btn.click(fn=get_target_chembl_id, inputs=query_input, outputs=[target_id_table, selected_target_dropdown, status_step1_fetch], show_progress="minimal") #
+    selected_target_dropdown.change(fn=enable_process_button, inputs=selected_target_dropdown, outputs=process_btn, show_progress="hidden") #
+    process_btn.click(fn=process_and_analyze_wrapper, inputs=[selected_target_dropdown, bioactivity_class_selector, app_state], outputs=[df_output_s1, freq_plot_output, scatter_plot_output, pic50_plot_output, pic50_stats_output, mw_plot_output, mw_stats_output, logp_plot_output, logp_stats_output, hdonors_plot_output, hdonors_stats_output, hacceptors_plot_output, hacceptors_stats_output, status_step1_process, app_state]) #
+    bioactivity_class_selector.change(fn=update_analysis_on_filter_change, inputs=[bioactivity_class_selector, app_state], outputs=[df_output_s1, freq_plot_output, scatter_plot_output, pic50_plot_output, pic50_stats_output, mw_plot_output, mw_stats_output, logp_plot_output, logp_stats_output, hdonors_plot_output, hdonors_stats_output, hacceptors_plot_output, hacceptors_stats_output, status_step1_process], show_progress="minimal") #
+    calculate_fp_btn.click(fn=calculate_fingerprints, inputs=[app_state, fingerprint_dropdown], outputs=[status_step2, output_df_s2, download_s2, mols_grid_s2, app_state]) #
+    @download_s2.click(inputs=app_state, outputs=download_s2, show_progress="hidden") #
+    def download_handler(current_state): #
+        df_to_download = current_state.get('fingerprint_data') #
+        return save_dataframe_as_csv(df_to_download) #
+    train_models_btn.click(fn=handle_model_training, inputs=[app_state], outputs=[status_step3_train, model_results_df, model_selector_s3, app_state]) #
+    for listener in [model_selector_s3.change, feature_count_s3.change]: listener(fn=update_analysis_plots, inputs=[model_selector_s3, feature_count_s3, app_state], outputs=[validation_plot_s3, feature_plot_s3], show_progress="minimal") #
+    predict_btn_s3.click(fn=predict_on_upload, inputs=[upload_predict_file, model_selector_s3, app_state], outputs=[status_step3_predict, prediction_results_df, prediction_mols_grid]) #
+if __name__ == "__main__": # Launch only when the file is executed as a script, not when it is imported
+    demo.launch(debug=True) # Start the Blocks app defined above, with debug mode enabled