Spaces:

alidenewade
/

drug-discovery-app

Sleeping

App Files Files Community

alidenewade committed on Jun 9

Commit

c3673c6

verified ·

1 Parent(s): b102a7f

Update app.py

Browse files

Files changed (1) hide show

app.py +114 -163

app.py CHANGED Viewed

@@ -7,13 +7,17 @@ import os
 import glob
 import time
 import warnings
-import base64
 # Chemistry and Cheminformatics
 from rdkit import Chem
-from rdkit.Chem import Descriptors, Lipinski, Draw, rdDepictor
 from chembl_webresource_client.new_client import new_client
 from padelpy import padeldescriptor
 # Plotting and Visualization
 import matplotlib.pyplot as plt
@@ -63,25 +67,20 @@ warnings.filterwarnings("ignore")
 sns.set_theme(style='whitegrid')
 # --- FINGERPRINT CONFIGURATION ---
-DESCRIPTOR_DIR = "padel_descriptors"
-# Check if the descriptor directory exists and contains files
-if not os.path.isdir(DESCRIPTOR_DIR):
-    warnings.warn(
-        f"The descriptor directory '{DESCRIPTOR_DIR}' was not found. "
-        "Fingerprint calculation will be disabled. Please create this directory and upload your .xml files."
-    )
-    xml_files = []
-else:
-    xml_files = sorted(glob.glob(os.path.join(DESCRIPTOR_DIR, '*.xml')))
 if not xml_files:
     warnings.warn(
-        f"No descriptor .xml files found in the '{DESCRIPTOR_DIR}' directory. "
-        "Fingerprint calculation will not be possible."
     )
-# The key is the filename without extension; the value is the full path to the file
 fp_config = {os.path.splitext(os.path.basename(file))[0]: file for file in xml_files}
 FP_list = sorted(list(fp_config.keys()))
@@ -155,6 +154,7 @@ def clean_and_process_data(df):
             raise gr.Error(f"Could not fetch SMILES from ChEMBL: {e}")
     df = df[df.standard_value.notna()]
     df = df[df.canonical_smiles.notna()]
     df.drop_duplicates(['canonical_smiles'], inplace=True)
     df["standard_value"] = pd.to_numeric(df["standard_value"], errors='coerce')
     df.dropna(subset=['standard_value'], inplace=True)
@@ -200,58 +200,68 @@ def mannwhitney_test(df, descriptor):
 # === STEP 2: FEATURE ENGINEERING FUNCTIONS ===
 # ==============================================================================
 def calculate_fingerprints(current_state, fingerprint_type, progress=gr.Progress()):
     input_df = current_state.get('cleaned_data')
-    if input_df is None or input_df.empty:
-        raise gr.Error("No cleaned data found. Please complete Step 1.")
-    if not fingerprint_type:
-        raise gr.Error("Please select a fingerprint type.")
-    progress(0, desc="Starting...")
-    yield f"🧪 Starting fingerprint calculation...", None, gr.update(visible=False), None, current_state
     try:
         smi_file, output_csv = 'molecule.smi', 'fingerprints.csv'
         input_df[['canonical_smiles', 'canonical_smiles']].to_csv(smi_file, sep='\t', index=False, header=False)
-        if os.path.exists(output_csv):
-            os.remove(output_csv)
         descriptortypes = fp_config.get(fingerprint_type)
-        if not descriptortypes:
-            raise gr.Error(f"Descriptor XML for '{fingerprint_type}' not found.")
-        progress(0.3, desc="⚗️ Running PaDEL...")
-        yield f"⚗️ Running PaDEL...", None, gr.update(visible=False), None, current_state
         padeldescriptor(mol_dir=smi_file, d_file=output_csv, descriptortypes=descriptortypes, detectaromaticity=True, standardizenitro=True, standardizetautomers=True, threads=-1, removesalt=True, log=False, fingerprints=True)
         if not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0:
             raise gr.Error("PaDEL failed to produce an output file. Check molecule validity.")
-        progress(0.7, desc="📊 Processing results...")
-        yield "📊 Processing results...", None, gr.update(visible=False), None, current_state
         df_X = pd.read_csv(output_csv).rename(columns={'Name': 'canonical_smiles'})
         final_df = pd.merge(input_df[['canonical_smiles', 'pIC50']], df_X, on='canonical_smiles', how='inner')
-        current_state['fingerprint_data'] = final_df
-        current_state['fingerprint_type'] = fingerprint_type
-        progress(0.9, desc="🖼️ Generating molecule grid...")
-        mols_html = create_molecule_html_grid(final_df, 'canonical_smiles', ['pIC50'])
         success_msg = f"✅ Success! Generated {len(df_X.columns) -1} descriptors for {len(final_df)} molecules."
-        progress(1, desc="Completed!")
-        yield success_msg, final_df, gr.update(visible=True), gr.update(value=mols_html, visible=True), current_state
-    except Exception as e:
-        raise gr.Error(f"Calculation failed: {e}")
     finally:
-        if os.path.exists('molecule.smi'):
-            os.remove('molecule.smi')
-        if os.path.exists('fingerprints.csv'):
-            os.remove('fingerprints.csv')
 # ==============================================================================
 # === STEP 3: MODEL TRAINING & PREDICTION FUNCTIONS ===
@@ -290,20 +300,9 @@ def run_regression_suite(df: pd.DataFrame, progress=gr.Progress()):
     X_test = pd.DataFrame(selector.transform(X_test), columns=X_test.columns[selector.get_support()], index=X_test.index)
     selected_features = X_train.columns.tolist()
-    model_defs = [
-        ('Linear Regression', LinearRegression()),
-        ('Ridge', Ridge(random_state=42)),
-        ('Lasso', Lasso(random_state=42)),
-        ('Random Forest', RandomForestRegressor(random_state=42, n_jobs=-1)),
-        # ('Gradient Boosting', GradientBoostingRegressor(random_state=42)) # <-- Commented out
-    ]
-    if _has_extra_libs:
-        model_defs.extend([
-            # ('XGBoost', xgb.XGBRegressor(random_state=42, n_jobs=-1, verbosity=0)), # <-- Commented out
-            ('LightGBM', lgb.LGBMRegressor(random_state=42, n_jobs=-1, verbosity=-1)),
-            # ('CatBoost', cb.CatBoostRegressor(random_state=42, verbose=0)) # <-- Commented out
-        ])
     results_list, trained_models = [], {}
     for i, (name, model) in enumerate(model_defs):
         progress(0.2 + (i / len(model_defs)) * 0.8, desc=f"Training {name}...")
@@ -315,139 +314,89 @@ def run_regression_suite(df: pd.DataFrame, progress=gr.Progress()):
     results_df = pd.DataFrame(results_list).sort_values(by='R²', ascending=False).reset_index(drop=True)
     plotter = ModelPlotter(trained_models, X_test, y_test)
     model_run_results = ModelRunResult(results_df, plotter, trained_models, selected_features)
     model_choices = results_df['Model'].tolist()
     yield "✅ Model training & evaluation complete.", model_run_results, gr.Dropdown(choices=model_choices, interactive=True)
 def predict_on_upload(uploaded_file, model_name, current_state, progress=gr.Progress()):
     if not uploaded_file: raise gr.Error("Please upload a file.")
     if not model_name: raise gr.Error("Please select a trained model.")
     model_run_results = current_state.get('model_results')
     fingerprint_type = current_state.get('fingerprint_type')
     if not model_run_results or not fingerprint_type: raise gr.Error("Please run Steps 2 and 3 first.")
     model = model_run_results.models.get(model_name)
     selected_features = model_run_results.selected_features
     if model is None: raise gr.Error(f"Model '{model_name}' not found.")
     smi_file, output_csv = 'predict.smi', 'predict_fp.csv'
     try:
         progress(0, desc="Reading & processing new molecules..."); yield "Reading uploaded file...", None, None
         df_new = pd.read_csv(uploaded_file.name)
         if 'canonical_smiles' not in df_new.columns: raise gr.Error("CSV must contain a 'canonical_smiles' column.")
         df_new = df_new.reset_index().rename(columns={'index': 'mol_id'})
         padel_input = pd.DataFrame({'smiles': df_new['canonical_smiles'], 'name': df_new['mol_id']})
         padel_input.to_csv(smi_file, sep='\t', index=False, header=False)
         if os.path.exists(output_csv): os.remove(output_csv)
         progress(0.3, desc="Calculating fingerprints..."); yield "Calculating fingerprints for new molecules...", None, None
         padeldescriptor(mol_dir=smi_file, d_file=output_csv, descriptortypes=fp_config.get(fingerprint_type), detectaromaticity=True, standardizenitro=True, threads=-1, removesalt=True, log=False, fingerprints=True)
         if not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0: raise gr.Error("PaDEL calculation failed for the uploaded molecules.")
         progress(0.7, desc="Aligning features and predicting..."); yield "Aligning features and predicting...", None, None
         df_fp = pd.read_csv(output_csv).rename(columns={'Name': 'mol_id'})
         X_new = df_fp.set_index('mol_id')
         X_new_aligned = X_new.reindex(columns=selected_features, fill_value=0)[selected_features]
         predictions = model.predict(X_new_aligned)
         results_subset = pd.DataFrame({'mol_id': X_new_aligned.index, 'predicted_pIC50': predictions})
         df_results = pd.merge(df_new, results_subset, on='mol_id', how='left')
         progress(0.9, desc="Generating visualization..."); yield "Generating visualization...", None, None
         df_grid_view = df_results.dropna(subset=['predicted_pIC50']).copy()
-        mols_html = create_molecule_html_grid(
-            df_grid_view,
-            smiles_col='canonical_smiles',
-            data_cols=['predicted_pIC50'],
-            mol_id_col='mol_id'
-        )
         progress(1, desc="Complete!"); yield "✅ Prediction complete.", df_results[['canonical_smiles', 'predicted_pIC50']], mols_html
     finally:
         if os.path.exists(smi_file): os.remove(smi_file)
         if os.path.exists(output_csv): os.remove(output_csv)
-# ==============================================================================
-# === HELPER FUNCTIONS ===
-# ==============================================================================
-def create_molecule_html_grid(df: pd.DataFrame, smiles_col: str, data_cols: list, mol_id_col: str = None):
-    """
-    Generates a self-contained HTML grid for a DataFrame of molecules.
-    Args:
-        df: DataFrame containing molecule data.
-        smiles_col: The name of the column with the SMILES strings.
-        data_cols: A list of column names to display alongside the molecule.
-        mol_id_col: Optional column to use as a title for each molecule entry.
-    Returns:
-        An HTML string for display in Gradio's gr.HTML component.
-    """
-    if df.empty:
-        return "<h3>No molecules to display.</h3>"
-    # Step 1: Filter valid molecules with 2D conformers
-    valid_mols = []
-    for smiles in df[smiles_col]:
-        mol = Chem.MolFromSmiles(smiles)
-        if mol:
-            try:
-                rdDepictor.Compute2DCoords(mol)
-                _ = mol.GetConformer()  # Ensure conformer exists
-                valid_mols.append((smiles, mol))
-            except Exception as e:
-                print(f"[Warning] Skipping molecule due to depiction error: {smiles} – {e}")
-                continue
-    if not valid_mols:
-        return "<h3>No valid molecules could be rendered.</h3>"
-    # Step 2: Generate SVGs
-    images = []
-    smiles_list = []
-    for smiles, mol in valid_mols:
-        try:
-            svg = Draw.MolToSVG(mol, width=200, height=200)
-            images.append(svg)
-            smiles_list.append(smiles)
-        except Exception as e:
-            print(f"[Warning] Failed to draw molecule: {smiles} – {e}")
-            continue
-    # Filter the DataFrame to include only valid molecules
-    df = df[df[smiles_col].isin(smiles_list)].copy()
-    df['image'] = images
-    # Step 3: Build HTML
-    html = '<div style="display: flex; flex-wrap: wrap; gap: 20px;">'
-    for _, row in df.iterrows():
-        if not row['image']:
-            continue
-        html += '<div style="border: 1px solid #ddd; border-radius: 5px; padding: 10px; text-align: center; width: 220px;">'
-        html += row['image']  # SVG
-        # Optional: molecule ID
-        if mol_id_col and mol_id_col in row:
-            html += f'<strong>{row[mol_id_col]}</strong><br>'
-        # Show other data values
-        for col in data_cols:
-            if col in row:
-                value = row[col]
-                if isinstance(value, float):
-                    value = f"{value:.2f}"
-                html += f'<span><strong>{col}:</strong> {value}</span><br>'
-        html += '</div>'
-    html += '</div>'
-    return html
 # ==============================================================================
 # === GRADIO INTERFACE ===
 # ==============================================================================
@@ -457,6 +406,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="sky"),
     app_state = gr.State({})
     with gr.Tabs():
         with gr.Tab("Step 1: Data Collection & EDA"):
             gr.Markdown("## Fetch Bioactivity Data from ChEMBL and Perform Exploratory Analysis")
             with gr.Row():
                 query_input = gr.Textbox(label="Target Query", placeholder="e.g., acetylcholinesterase, BRAF kinase", scale=3)
@@ -495,6 +445,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="sky"),
                         hdonors_stats_output = gr.Dataframe(label="Stats for H-Donors")
                         hacceptors_stats_output = gr.Dataframe(label="Stats for H-Acceptors")
         with gr.Tab("Step 2: Feature Engineering"):
             gr.Markdown("## Calculate Molecular Fingerprints using PaDEL")
             with gr.Row():
                 fingerprint_dropdown = gr.Dropdown(choices=FP_list, value='PubChem' if 'PubChem' in FP_list else None, label="Select Fingerprint Method", scale=3)
@@ -504,6 +455,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="sky"),
             download_s2 = gr.DownloadButton("Download Feature Data (CSV)", variant="secondary", visible=False)
             mols_grid_s2 = gr.HTML(label="Interactive Molecule Viewer")
         with gr.Tab("Step 3: Model Training & Prediction"):
             gr.Markdown("## Train Regression Models and Predict pIC50")
             with gr.Tabs():
                 with gr.Tab("Model Training & Evaluation"):
@@ -563,12 +515,11 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="sky"),
     process_btn.click(fn=process_and_analyze_wrapper, inputs=[selected_target_dropdown, bioactivity_class_selector, app_state], outputs=[df_output_s1, freq_plot_output, scatter_plot_output, pic50_plot_output, pic50_stats_output, mw_plot_output, mw_stats_output, logp_plot_output, logp_stats_output, hdonors_plot_output, hdonors_stats_output, hacceptors_plot_output, hacceptors_stats_output, status_step1_process, app_state])
     bioactivity_class_selector.change(fn=update_analysis_on_filter_change, inputs=[bioactivity_class_selector, app_state], outputs=[df_output_s1, freq_plot_output, scatter_plot_output, pic50_plot_output, pic50_stats_output, mw_plot_output, mw_stats_output, logp_plot_output, logp_stats_output, hdonors_plot_output, hdonors_stats_output, hacceptors_plot_output, hacceptors_stats_output, status_step1_process], show_progress="minimal")
     calculate_fp_btn.click(fn=calculate_fingerprints, inputs=[app_state, fingerprint_dropdown], outputs=[status_step2, output_df_s2, download_s2, mols_grid_s2, app_state])
     @download_s2.click(inputs=app_state, outputs=download_s2, show_progress="hidden")
     def download_handler(current_state):
         df_to_download = current_state.get('fingerprint_data')
         return save_dataframe_as_csv(df_to_download)
     train_models_btn.click(fn=handle_model_training, inputs=[app_state], outputs=[status_step3_train, model_results_df, model_selector_s3, app_state])
     for listener in [model_selector_s3.change, feature_count_s3.change]: listener(fn=update_analysis_plots, inputs=[model_selector_s3, feature_count_s3, app_state], outputs=[validation_plot_s3, feature_plot_s3], show_progress="minimal")
     predict_btn_s3.click(fn=predict_on_upload, inputs=[upload_predict_file, model_selector_s3, app_state], outputs=[status_step3_predict, prediction_results_df, prediction_mols_grid])

 import glob
 import time
 import warnings
 # Chemistry and Cheminformatics
 from rdkit import Chem
+from rdkit.Chem import Descriptors, Lipinski
 from chembl_webresource_client.new_client import new_client
 from padelpy import padeldescriptor
+from rdkit.Chem.Draw import rdMolDraw2D
+from rdkit.Chem import Draw
+import base64
+from io import BytesIO
 # Plotting and Visualization
 import matplotlib.pyplot as plt
 sns.set_theme(style='whitegrid')
 # --- FINGERPRINT CONFIGURATION ---
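+# Create an empty placeholder PubChem.xml if no XML files are found, so that fp_config
+# (and therefore the fingerprint dropdown) is never empty.
+# NOTE(review): the placeholder is written with no content, so it defines no descriptors, and
+# it means the "no .xml files" warning below only fires when the placeholder could not be
+# written -- confirm this is intended.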
+if not glob.glob('*.xml'):
+    try:
+        with open('PubChem.xml', 'w') as f:
+            f.write('')
+    except IOError:
+        warnings.warn("Could not create a dummy 'PubChem.xml' file. Fingerprint calculation might fail if no .xml files are present.")
+xml_files = sorted(glob.glob('*.xml'))
 if not xml_files:
     warnings.warn(
+        "No descriptor .xml files found. Fingerprint calculation will not be possible. "
+        "Please place descriptor XML files in the same directory as the script."
     )
 fp_config = {os.path.splitext(os.path.basename(file))[0]: file for file in xml_files}
 FP_list = sorted(list(fp_config.keys()))
             raise gr.Error(f"Could not fetch SMILES from ChEMBL: {e}")
     df = df[df.standard_value.notna()]
     df = df[df.canonical_smiles.notna()]
+    # DEBUG FIX: Added drop_duplicates to align with notebook logic and ensure unique SMILES for merging.
     df.drop_duplicates(['canonical_smiles'], inplace=True)
     df["standard_value"] = pd.to_numeric(df["standard_value"], errors='coerce')
     df.dropna(subset=['standard_value'], inplace=True)
 # === STEP 2: FEATURE ENGINEERING FUNCTIONS ===
 # ==============================================================================
+def create_molecule_grid_html(df, smiles_col='canonical_smiles', max_mols=20):
+    """Render the first ``max_mols`` molecules of ``df`` as an HTML flexbox grid.
+
+    Each card holds a 200x200 PNG depiction embedded as a base64 data URI,
+    the row's ``pIC50`` value formatted to two decimals, and the SMILES text.
+
+    Args:
+        df: DataFrame with a ``pIC50`` column and the column named by ``smiles_col``.
+        smiles_col: Name of the column holding the SMILES strings.
+        max_mols: Only the first ``max_mols`` rows of ``df`` are considered.
+
+    Returns:
+        An HTML string. Rows whose SMILES cannot be parsed, drawn or formatted
+        are skipped (with a printed warning) instead of aborting the grid.
+    """
+    from html import escape  # local import: the block stays self-contained
+
+    html_parts = ['<div style="display: flex; flex-wrap: wrap; gap: 10px;">']
+    subset = df.head(max_mols)
+    # Walk only the two columns that are needed; iterrows() would build a full
+    # Series (every fingerprint column) for each row.
+    for smiles, pic50 in zip(subset[smiles_col], subset['pIC50']):
+        try:
+            mol = Chem.MolFromSmiles(smiles)
+            if mol is None:
+                continue
+            # Generate molecule image
+            img = Draw.MolToImage(mol, size=(200, 200))
+            # Convert to base64
+            buffered = BytesIO()
+            img.save(buffered, format="PNG")
+            img_str = base64.b64encode(buffered.getvalue()).decode()
+            pic50_text = f"{pic50:.2f}"
+        except Exception as e:
+            # One molecule that cannot be depicted must not discard the whole grid.
+            print(f"[Warning] Skipping molecule due to depiction error: {smiles} – {e}")
+            continue
+        # Create HTML for this molecule; the SMILES text is escaped because it is
+        # interpolated into markup.
+        mol_html = f'''
+            <div style="border: 1px solid #ccc; padding: 10px; border-radius: 5px; text-align: center;">
+                <img src="data:image/png;base64,{img_str}" alt="Molecule" style="max-width: 200px;">
+                <p><strong>pIC50:</strong> {pic50_text}</p>
+                <p style="font-size: 10px; word-break: break-all;">{escape(str(smiles))}</p>
+            </div>
+            '''
+        html_parts.append(mol_html)
+    html_parts.append('</div>')
+    return ''.join(html_parts)
 def calculate_fingerprints(current_state, fingerprint_type, progress=gr.Progress()):
+    """Compute PaDEL fingerprints for the cleaned Step 1 data, streaming status updates.
+
+    Generator used as a Gradio event handler. Every yielded tuple has five
+    items: (status message, result DataFrame or None, download-button update,
+    molecule-grid update or None, current_state).
+
+    Args:
+        current_state: Shared state dict. 'cleaned_data' is read; on success
+            'fingerprint_data' and 'fingerprint_type' are written.
+        fingerprint_type: Key into fp_config selecting the descriptor XML file.
+        progress: Gradio progress tracker.
+
+    Raises:
+        gr.Error: If the inputs are missing, PaDEL produces no output, or any
+            other step fails.
+    """
     input_df = current_state.get('cleaned_data')
+    if input_df is None or input_df.empty:
+        raise gr.Error("No cleaned data found. Please complete Step 1.")
+    if not fingerprint_type:
+        raise gr.Error("Please select a fingerprint type.")
+    progress(0, desc="Starting...")
+    yield "🧪 Starting fingerprint calculation...", None, gr.update(visible=False), None, current_state
+    # Scratch files are named before the try block so the cleanup in `finally`
+    # can refer to the same variables.
+    # NOTE(review): fixed names in the working directory would be shared by
+    # concurrent sessions -- confirm the app runs one calculation at a time.
+    smi_file, output_csv = 'molecule.smi', 'fingerprints.csv'
     try:
+        # The SMILES string is written twice: once as the structure and once as
+        # the molecule name (presumably what PaDEL echoes back in its 'Name'
+        # column), so the results can be merged back on canonical_smiles.
         input_df[['canonical_smiles', 'canonical_smiles']].to_csv(smi_file, sep='\t', index=False, header=False)
+        if os.path.exists(output_csv):
+            os.remove(output_csv)
         descriptortypes = fp_config.get(fingerprint_type)
+        if not descriptortypes:
+            raise gr.Error(f"Descriptor XML for '{fingerprint_type}' not found.")
+        progress(0.3, desc="⚗️ Running PaDEL...")
+        yield "⚗️ Running PaDEL...", None, gr.update(visible=False), None, current_state
         padeldescriptor(mol_dir=smi_file, d_file=output_csv, descriptortypes=descriptortypes, detectaromaticity=True, standardizenitro=True, standardizetautomers=True, threads=-1, removesalt=True, log=False, fingerprints=True)
         if not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0:
             raise gr.Error("PaDEL failed to produce an output file. Check molecule validity.")
+        progress(0.7, desc="📊 Processing results...")
+        yield "📊 Processing results...", None, gr.update(visible=False), None, current_state
         df_X = pd.read_csv(output_csv).rename(columns={'Name': 'canonical_smiles'})
+        # Safely merge fingerprints with original data. 'inner' ensures that only molecules
+        # for which fingerprints were successfully calculated are included.
         final_df = pd.merge(input_df[['canonical_smiles', 'pIC50']], df_X, on='canonical_smiles', how='inner')
+        current_state['fingerprint_data'] = final_df
+        current_state['fingerprint_type'] = fingerprint_type
+        progress(0.9, desc="🖼️ Generating molecule grid...")
+        mols_html = create_molecule_grid_html(final_df)
         success_msg = f"✅ Success! Generated {len(df_X.columns) -1} descriptors for {len(final_df)} molecules."
+        progress(1, desc="Completed!")
+        yield success_msg, final_df, gr.update(visible=True), gr.update(value=mols_html, visible=True), current_state
+    except gr.Error:
+        # Already a user-facing error raised above; do not wrap it a second time.
+        raise
+    except Exception as e:
+        # Keep the original exception chained for the server-side traceback.
+        raise gr.Error(f"Calculation failed: {e}") from e
     finally:
+        if os.path.exists(smi_file):
+            os.remove(smi_file)
+        if os.path.exists(output_csv):
+            os.remove(output_csv)
 # ==============================================================================
 # === STEP 3: MODEL TRAINING & PREDICTION FUNCTIONS ===
     X_test = pd.DataFrame(selector.transform(X_test), columns=X_test.columns[selector.get_support()], index=X_test.index)
     selected_features = X_train.columns.tolist()
+    model_defs = [('Linear Regression', LinearRegression()), ('Ridge', Ridge(random_state=42)), ('Lasso', Lasso(random_state=42)), ('Random Forest', RandomForestRegressor(random_state=42, n_jobs=-1)), ('Gradient Boosting', GradientBoostingRegressor(random_state=42))]
+    if _has_extra_libs: model_defs.extend([('XGBoost', xgb.XGBRegressor(random_state=42, n_jobs=-1, verbosity=0)), ('LightGBM', lgb.LGBMRegressor(random_state=42, n_jobs=-1, verbosity=-1)), ('CatBoost', cb.CatBoostRegressor(random_state=42, verbose=0))])
     results_list, trained_models = [], {}
     for i, (name, model) in enumerate(model_defs):
         progress(0.2 + (i / len(model_defs)) * 0.8, desc=f"Training {name}...")
     results_df = pd.DataFrame(results_list).sort_values(by='R²', ascending=False).reset_index(drop=True)
     plotter = ModelPlotter(trained_models, X_test, y_test)
     model_run_results = ModelRunResult(results_df, plotter, trained_models, selected_features)
     model_choices = results_df['Model'].tolist()
     yield "✅ Model training & evaluation complete.", model_run_results, gr.Dropdown(choices=model_choices, interactive=True)
+def create_prediction_grid_html(df, smiles_col='canonical_smiles', pred_col='predicted_pIC50', max_mols=20):
+    """Render the first ``max_mols`` predicted molecules of ``df`` as an HTML grid.
+
+    Each card holds a 200x200 PNG depiction embedded as a base64 data URI,
+    the predicted pIC50 formatted to two decimals, and the SMILES text.
+
+    Args:
+        df: DataFrame containing the columns named by ``smiles_col`` and ``pred_col``.
+        smiles_col: Name of the column holding the SMILES strings.
+        pred_col: Name of the column holding the predicted pIC50 values.
+        max_mols: Only the first ``max_mols`` rows of ``df`` are considered.
+
+    Returns:
+        An HTML string. Rows with a missing prediction, or whose SMILES cannot
+        be parsed, drawn or formatted, are skipped.
+    """
+    from html import escape  # local import: the block stays self-contained
+
+    html_parts = ['<div style="display: flex; flex-wrap: wrap; gap: 10px;">']
+    subset = df.head(max_mols)
+    for smiles, pred_pic50 in zip(subset[smiles_col], subset[pred_col]):
+        if pd.isna(pred_pic50):
+            continue
+        try:
+            mol = Chem.MolFromSmiles(smiles)
+            if mol is None:
+                continue
+            # Generate molecule image
+            img = Draw.MolToImage(mol, size=(200, 200))
+            # Convert to base64
+            buffered = BytesIO()
+            img.save(buffered, format="PNG")
+            img_str = base64.b64encode(buffered.getvalue()).decode()
+            pred_text = f"{pred_pic50:.2f}"
+        except Exception as e:
+            # One molecule that cannot be depicted must not discard the whole grid.
+            print(f"[Warning] Skipping molecule due to depiction error: {smiles} – {e}")
+            continue
+        # Create HTML for this molecule; the SMILES text comes from an uploaded
+        # file, so it is escaped before being placed into markup.
+        mol_html = f'''
+            <div style="border: 1px solid #ccc; padding: 10px; border-radius: 5px; text-align: center;">
+                <img src="data:image/png;base64,{img_str}" alt="Molecule" style="max-width: 200px;">
+                <p><strong>Predicted pIC50:</strong> {pred_text}</p>
+                <p style="font-size: 10px; word-break: break-all;">{escape(str(smiles))}</p>
+            </div>
+            '''
+        html_parts.append(mol_html)
+    html_parts.append('</div>')
+    return ''.join(html_parts)
 def predict_on_upload(uploaded_file, model_name, current_state, progress=gr.Progress()):
+    """Predict pIC50 for molecules in an uploaded CSV using a previously trained model.
+
+    Generator used as a Gradio event handler. Every yielded tuple has three
+    items: (status message, results DataFrame or None, molecule-grid HTML or None).
+
+    Args:
+        uploaded_file: Uploaded file object; its ``.name`` is read as a CSV that
+            must contain a 'canonical_smiles' column.
+        model_name: Key of the trained model to use.
+        current_state: Shared state dict; 'model_results' and 'fingerprint_type' are read.
+        progress: Gradio progress tracker.
+
+    Raises:
+        gr.Error: If an input is missing, the model or descriptor XML is not
+            found, the CSV lacks 'canonical_smiles', or PaDEL produces no output.
+    """
     if not uploaded_file: raise gr.Error("Please upload a file.")
     if not model_name: raise gr.Error("Please select a trained model.")
     model_run_results = current_state.get('model_results')
     fingerprint_type = current_state.get('fingerprint_type')
     if not model_run_results or not fingerprint_type: raise gr.Error("Please run Steps 2 and 3 first.")
     model = model_run_results.models.get(model_name)
     selected_features = model_run_results.selected_features
     if model is None: raise gr.Error(f"Model '{model_name}' not found.")
+    descriptortypes = fp_config.get(fingerprint_type)
+    if not descriptortypes: raise gr.Error(f"Descriptor XML for '{fingerprint_type}' not found.")
     smi_file, output_csv = 'predict.smi', 'predict_fp.csv'
     try:
         progress(0, desc="Reading & processing new molecules..."); yield "Reading uploaded file...", None, None
         df_new = pd.read_csv(uploaded_file.name)
         if 'canonical_smiles' not in df_new.columns: raise gr.Error("CSV must contain a 'canonical_smiles' column.")
+        # Give every row a positional id used as the PaDEL molecule name. Columns from the
+        # upload that would collide with the ones created here (for example when a previous
+        # results file is uploaded again) are dropped first.
+        df_new = df_new.drop(columns=['mol_id', 'predicted_pIC50'], errors='ignore').reset_index(drop=True)
+        df_new.insert(0, 'mol_id', df_new.index)
         padel_input = pd.DataFrame({'smiles': df_new['canonical_smiles'], 'name': df_new['mol_id']})
         padel_input.to_csv(smi_file, sep='\t', index=False, header=False)
         if os.path.exists(output_csv): os.remove(output_csv)
         progress(0.3, desc="Calculating fingerprints..."); yield "Calculating fingerprints for new molecules...", None, None
+        # Use the same PaDEL settings as calculate_fingerprints (including
+        # standardizetautomers=True) so prediction features match the training features.
+        padeldescriptor(mol_dir=smi_file, d_file=output_csv, descriptortypes=descriptortypes, detectaromaticity=True, standardizenitro=True, standardizetautomers=True, threads=-1, removesalt=True, log=False, fingerprints=True)
         if not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0: raise gr.Error("PaDEL calculation failed for the uploaded molecules.")
         progress(0.7, desc="Aligning features and predicting..."); yield "Aligning features and predicting...", None, None
         df_fp = pd.read_csv(output_csv).rename(columns={'Name': 'mol_id'})
         X_new = df_fp.set_index('mol_id')
         X_new_aligned = X_new.reindex(columns=selected_features, fill_value=0)[selected_features]
         predictions = model.predict(X_new_aligned)
         results_subset = pd.DataFrame({'mol_id': X_new_aligned.index, 'predicted_pIC50': predictions})
         df_results = pd.merge(df_new, results_subset, on='mol_id', how='left')
         progress(0.9, desc="Generating visualization..."); yield "Generating visualization...", None, None
+        # The left merge keeps uploaded rows that have no fingerprint row; only rows
+        # with a prediction are shown in the grid.
         df_grid_view = df_results.dropna(subset=['predicted_pIC50']).copy()
+        mols_html = "<h3>No molecules with successful predictions to display.</h3>"
+        if not df_grid_view.empty:
+            mols_html = create_prediction_grid_html(df_grid_view)
         progress(1, desc="Complete!"); yield "✅ Prediction complete.", df_results[['canonical_smiles', 'predicted_pIC50']], mols_html
     finally:
         if os.path.exists(smi_file): os.remove(smi_file)
         if os.path.exists(output_csv): os.remove(output_csv)
 # ==============================================================================
 # === GRADIO INTERFACE ===
 # ==============================================================================
     app_state = gr.State({})
     with gr.Tabs():
         with gr.Tab("Step 1: Data Collection & EDA"):
+            # UI Definition for Step 1...
             gr.Markdown("## Fetch Bioactivity Data from ChEMBL and Perform Exploratory Analysis")
             with gr.Row():
                 query_input = gr.Textbox(label="Target Query", placeholder="e.g., acetylcholinesterase, BRAF kinase", scale=3)
                         hdonors_stats_output = gr.Dataframe(label="Stats for H-Donors")
                         hacceptors_stats_output = gr.Dataframe(label="Stats for H-Acceptors")
         with gr.Tab("Step 2: Feature Engineering"):
+            # UI Definition for Step 2...
             gr.Markdown("## Calculate Molecular Fingerprints using PaDEL")
             with gr.Row():
                 fingerprint_dropdown = gr.Dropdown(choices=FP_list, value='PubChem' if 'PubChem' in FP_list else None, label="Select Fingerprint Method", scale=3)
             download_s2 = gr.DownloadButton("Download Feature Data (CSV)", variant="secondary", visible=False)
             mols_grid_s2 = gr.HTML(label="Interactive Molecule Viewer")
         with gr.Tab("Step 3: Model Training & Prediction"):
+            # UI Definition for Step 3...
             gr.Markdown("## Train Regression Models and Predict pIC50")
             with gr.Tabs():
                 with gr.Tab("Model Training & Evaluation"):
     process_btn.click(fn=process_and_analyze_wrapper, inputs=[selected_target_dropdown, bioactivity_class_selector, app_state], outputs=[df_output_s1, freq_plot_output, scatter_plot_output, pic50_plot_output, pic50_stats_output, mw_plot_output, mw_stats_output, logp_plot_output, logp_stats_output, hdonors_plot_output, hdonors_stats_output, hacceptors_plot_output, hacceptors_stats_output, status_step1_process, app_state])
     bioactivity_class_selector.change(fn=update_analysis_on_filter_change, inputs=[bioactivity_class_selector, app_state], outputs=[df_output_s1, freq_plot_output, scatter_plot_output, pic50_plot_output, pic50_stats_output, mw_plot_output, mw_stats_output, logp_plot_output, logp_stats_output, hdonors_plot_output, hdonors_stats_output, hacceptors_plot_output, hacceptors_stats_output, status_step1_process], show_progress="minimal")
     calculate_fp_btn.click(fn=calculate_fingerprints, inputs=[app_state, fingerprint_dropdown], outputs=[status_step2, output_df_s2, download_s2, mols_grid_s2, app_state])
+    # Download button: the feature table is taken from the shared app state, not from a UI component.
     @download_s2.click(inputs=app_state, outputs=download_s2, show_progress="hidden")
     def download_handler(current_state):
+        """Pass the stored 'fingerprint_data' entry to save_dataframe_as_csv and return its result."""
+        return save_dataframe_as_csv(current_state.get('fingerprint_data'))
     train_models_btn.click(fn=handle_model_training, inputs=[app_state], outputs=[status_step3_train, model_results_df, model_selector_s3, app_state])
     for listener in [model_selector_s3.change, feature_count_s3.change]: listener(fn=update_analysis_plots, inputs=[model_selector_s3, feature_count_s3, app_state], outputs=[validation_plot_s3, feature_plot_s3], show_progress="minimal")
     predict_btn_s3.click(fn=predict_on_upload, inputs=[upload_predict_file, model_selector_s3, app_state], outputs=[status_step3_predict, prediction_results_df, prediction_mols_grid])