Spaces:

alidenewade
/

drug-discovery-app

Sleeping

App Files Files Community

drug-discovery-app / app.py

alidenewade

Update app.py

1a677a2 verified 3 months ago

raw

history blame contribute delete

35.9 kB

	# --- IMPORTS ---
	# Core and Data Handling
	import gradio as gr
	import pandas as pd
	import numpy as np
	import os
	import glob
	import time
	import warnings

	# Chemistry and Cheminformatics
	from rdkit import Chem
	from rdkit.Chem import Descriptors, Lipinski
	from chembl_webresource_client.new_client import new_client
	from padelpy import padeldescriptor
	# Removed: import mols2grid
	from rdkit.Chem.Draw import rdMolDraw2D
	from rdkit.Chem import Draw
	import base64
	from io import BytesIO


	# Plotting and Visualization
	import matplotlib.pyplot as plt
	import seaborn as sns
	from scipy import stats
	from scipy.stats import mannwhitneyu

	# Machine Learning Models and Metrics
	from sklearn.model_selection import train_test_split
	from sklearn.feature_selection import VarianceThreshold
	from sklearn.linear_model import (
	LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge,
	HuberRegressor, PassiveAggressiveRegressor, OrthogonalMatchingPursuit,
	LassoLars
	)
	from sklearn.tree import DecisionTreeRegressor
	from sklearn.ensemble import (
	RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor,
	AdaBoostRegressor
	)
	from sklearn.neighbors import KNeighborsRegressor
	from sklearn.dummy import DummyRegressor
	from sklearn.metrics import (
	mean_absolute_error, mean_squared_error, r2_score
	)

	# Container bundling every artifact produced by one modeling run
	class ModelRunResult:
	    """Hold the outputs of a single model-training run.

	    The four constructor arguments are stored unchanged as attributes of
	    the same name; nothing is copied or validated.
	    """

	    def __init__(self, dataframe, plotter, models, selected_features):
	        (
	            self.dataframe,
	            self.plotter,
	            self.models,
	            self.selected_features,
	        ) = (dataframe, plotter, models, selected_features)

	# Optional Advanced Models
	# All three gradient-boosting libraries are imported together: if any one of
	# them is missing, _has_extra_libs is False and none of them are used.
	try:
	import xgboost as xgb
	import lightgbm as lgb
	import catboost as cb
	_has_extra_libs = True
	except ImportError:
	_has_extra_libs = False
	# Emitted before the global "ignore" filter below is installed.
	warnings.warn("Optional libraries (xgboost, lightgbm, catboost) not found. Some models will be unavailable.")

	# --- GLOBAL CONFIGURATION & SETUP ---
	# Suppress every warning issued from this point on.
	warnings.filterwarnings("ignore")
	# Apply seaborn's 'whitegrid' theme to all plots created by this module.
	sns.set_theme(style='whitegrid')

	# --- FINGERPRINT CONFIGURATION ---
	# PaDEL descriptor definitions are discovered from 'padel_descriptors/*.xml'.
	# The base name of each file (without extension) becomes a selectable
	# fingerprint type; fp_config maps that name to the file path.
	padel_descriptors_dir = 'padel_descriptors'


	def _warn_visible(message):
	    """Emit *message* as a warning even when warnings are globally ignored."""
	    # The module installs warnings.filterwarnings("ignore") during setup, which
	    # would otherwise swallow these configuration diagnostics.
	    with warnings.catch_warnings():
	        warnings.simplefilter("always")
	        warnings.warn(message)


	# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
	os.makedirs(padel_descriptors_dir, exist_ok=True)

	# Check for XML files within the 'padel_descriptors' folder
	xml_files = sorted(glob.glob(os.path.join(padel_descriptors_dir, '*.xml')))
	if not xml_files:
	    # No descriptor files present: create an empty placeholder 'PubChem.xml' so
	    # that fp_config is populated.
	    # NOTE(review): the placeholder is written with no content, so it is
	    # presumably not a usable PaDEL descriptor definition -- confirm.
	    try:
	        with open(os.path.join(padel_descriptors_dir, 'PubChem.xml'), 'w') as f:
	            f.write('')
	        # Re-scan after creating the placeholder
	        xml_files = sorted(glob.glob(os.path.join(padel_descriptors_dir, '*.xml')))
	    except OSError:
	        _warn_visible("Could not create a dummy 'PubChem.xml' file in 'padel_descriptors'. Fingerprint calculation might fail if no .xml files are present.")

	if not xml_files:
	    _warn_visible(
	        "No descriptor .xml files found in 'padel_descriptors' directory. "
	        "Fingerprint calculation will not be possible. "
	        "Please place descriptor XML files in the 'padel_descriptors' directory."
	    )
	fp_config = {os.path.splitext(os.path.basename(file))[0]: file for file in xml_files}
	FP_list = sorted(fp_config)


	# ==============================================================================
	# === STEP 1: CORE DATA COLLECTION & EDA FUNCTIONS ===
	# ==============================================================================

	def get_target_chembl_id(query):
	    """Search ChEMBL targets matching *query*.

	    Returns a 3-tuple: a DataFrame with the columns target_chembl_id,
	    pref_name and organism; a gr.Dropdown whose choices are the target IDs;
	    and a status message. When the search yields nothing, the DataFrame and
	    the dropdown choices are empty.

	    Raises:
	        gr.Error: if anything fails while searching or building the result.
	    """
	    try:
	        hits = new_client.target.search(query)
	        if hits:
	            targets = pd.DataFrame(hits)
	            summary = targets[["target_chembl_id", "pref_name", "organism"]]
	            id_choices = targets["target_chembl_id"].tolist()
	            return summary, gr.Dropdown(choices=id_choices), f"Found {len(targets)} targets."
	        return pd.DataFrame(), gr.Dropdown(choices=[], value=None), "No targets found for your query."
	    except Exception as e:
	        raise gr.Error(f"ChEMBL search failed: {e}")

	def get_bioactivity_data(target_id):
	    """Fetch the IC50 activity records of *target_id* from ChEMBL.

	    Returns a 2-tuple (DataFrame of records, status message). The DataFrame
	    is empty when the filtered query yields nothing.

	    Raises:
	        gr.Error: if the query or the DataFrame construction fails.
	    """
	    try:
	        records = new_client.activity.filter(target_chembl_id=target_id).filter(standard_type="IC50")
	        if records:
	            frame = pd.DataFrame(records)
	            return frame, f"Fetched {len(frame)} data points."
	        return pd.DataFrame(), "No IC50 bioactivity data found for this target."
	    except Exception as e:
	        raise gr.Error(f"Failed to fetch bioactivity data: {e}")

	def pIC50_calc(input_df):
	    """Return a copy of *input_df* with pIC50 and bioactivity_class columns.

	    'standard_value' is coerced to numeric (rows that fail are dropped),
	    capped at 100000000 and treated as nanomolar when converting to
	    pIC50 = -log10(value * 1e-9). Non-positive values get a NaN pIC50.
	    The class is "inactive" for capped values >= 10000, "active" for
	    values <= 1000 and "intermediate" otherwise. The 'standard_value' and
	    helper 'standard_value_norm' columns are removed from the result.
	    """

	    def _label(value):
	        # Same thresholds as the pIC50 cut-offs: >= 10000 nM is inactive,
	        # <= 1000 nM is active, anything else (including NaN) intermediate.
	        if pd.notna(value) and value >= 10000:
	            return "inactive"
	        if pd.notna(value) and value <= 1000:
	            return "active"
	        return "intermediate"

	    work = input_df.copy()
	    work['standard_value'] = pd.to_numeric(work['standard_value'], errors='coerce')
	    work.dropna(subset=['standard_value'], inplace=True)

	    capped = work['standard_value'].clip(upper=100000000)
	    work['standard_value_norm'] = capped

	    # Mask non-positive concentrations to NaN so log10 yields NaN for them.
	    positive_only = capped.where(capped > 0)
	    work['pIC50'] = -np.log10(positive_only * (10**-9))

	    work['bioactivity_class'] = capped.map(_label)
	    return work.drop(columns=['standard_value', 'standard_value_norm'])

	def lipinski_descriptors(smiles_series):
	    """Compute MW, LogP, H-donor and H-acceptor counts for parseable SMILES.

	    Entries that are empty, not strings, or that RDKit cannot turn into a
	    molecule are skipped. Returns (DataFrame, valid_smiles) where the
	    DataFrame rows are in the same order as valid_smiles; both are empty
	    when nothing could be parsed.
	    """
	    column_names = ["MW", "LogP", "NumHDonors", "NumHAcceptors"]

	    parsed = []
	    for smi in smiles_series:
	        if not (smi and isinstance(smi, str)):
	            continue
	        mol = Chem.MolFromSmiles(smi)
	        if mol:
	            parsed.append((smi, mol))

	    if not parsed:
	        return pd.DataFrame(columns=column_names), []

	    rows = [
	        [Descriptors.MolWt(mol), Descriptors.MolLogP(mol), Lipinski.NumHDonors(mol), Lipinski.NumHAcceptors(mol)]
	        for _, mol in parsed
	    ]
	    kept_smiles = [smi for smi, _ in parsed]
	    return pd.DataFrame(data=np.array(rows), columns=column_names), kept_smiles

	def clean_and_process_data(df):
	    """Clean raw activity records and attach pIC50 and Lipinski descriptors.

	    Steps: ensure a 'canonical_smiles' column exists (looked up from ChEMBL
	    by 'molecule_chembl_id' when absent or entirely null), drop rows lacking
	    a standard value or SMILES, keep one row per SMILES, compute pIC50 and
	    the bioactivity class, then append the Lipinski descriptor columns.

	    Args:
	        df: raw activity DataFrame; it is not modified.

	    Returns:
	        (DataFrame, status message). The DataFrame is empty when no compound
	        survives the pIC50 calculation or SMILES parsing.

	    Raises:
	        gr.Error: if *df* is None/empty or the SMILES lookup fails.
	    """
	    if df is None or df.empty:
	        raise gr.Error("No data to process. Please fetch data first.")

	    # Work on a private copy so the caller's frame is never mutated and the
	    # assignments below do not operate on a slice of it.
	    df = df.copy()

	    if "canonical_smiles" not in df.columns or df["canonical_smiles"].isnull().all():
	        try:
	            molecule_ids = df["molecule_chembl_id"].dropna().unique().tolist()
	            records = new_client.molecule.get(molecule_ids)
	            # Map by ID instead of assigning positionally: the lookup is not
	            # guaranteed to return one record per input row in input order.
	            # 'molecule_structures' can be present with a None value, so
	            # `or {}` is required (a .get default would not apply).
	            smiles_by_id = {
	                rec.get("molecule_chembl_id"): (rec.get("molecule_structures") or {}).get("canonical_smiles")
	                for rec in records
	            }
	            df["canonical_smiles"] = df["molecule_chembl_id"].map(smiles_by_id)
	        except Exception as e:
	            raise gr.Error(f"Could not fetch SMILES from ChEMBL: {e}") from e

	    df = df[df["standard_value"].notna() & df["canonical_smiles"].notna()]
	    # Unique SMILES are required so the descriptor rows line up one-to-one
	    # with the processed rows in the positional concat below.
	    df = df.drop_duplicates(subset=['canonical_smiles']).copy()
	    df["standard_value"] = pd.to_numeric(df["standard_value"], errors='coerce')
	    df = df.dropna(subset=['standard_value'])

	    df_processed = pIC50_calc(df)
	    df_processed = df_processed[df_processed["pIC50"].notna()]
	    if df_processed.empty:
	        return pd.DataFrame(), "No compounds remaining after pIC50 calculation."

	    df_lipinski, valid_smiles = lipinski_descriptors(df_processed['canonical_smiles'])
	    if not valid_smiles:
	        return pd.DataFrame(), "No valid SMILES could be processed for Lipinski descriptors."

	    # Keep only rows whose SMILES parsed; both frames get a fresh 0..n-1
	    # index so the column-wise concat aligns row by row.
	    df_processed = df_processed[df_processed['canonical_smiles'].isin(valid_smiles)].reset_index(drop=True)
	    df_lipinski = df_lipinski.reset_index(drop=True)
	    df_final = pd.concat([df_processed, df_lipinski], axis=1)
	    return df_final, f"Processing complete. {len(df_final)} compounds remain after cleaning."

	def run_eda_analysis(df, selected_classes):
	    """Build all EDA plots and Mann-Whitney tables for the selected classes.

	    Returns a 13-tuple: frequency plot, scatter plot, then a
	    (box plot, stats DataFrame) pair for each of pIC50, MW, LogP,
	    NumHDonors and NumHAcceptors, and finally a status message. When no row
	    matches *selected_classes* the plots are None and the tables are empty.

	    Raises:
	        gr.Error: if *df* is None or empty.
	    """
	    if df is None or df.empty:
	        raise gr.Error("No data available for analysis.")

	    descriptors = ['pIC50', 'MW', 'LogP', 'NumHDonors', 'NumHAcceptors']
	    subset = df[df.bioactivity_class.isin(selected_classes)].copy()

	    if subset.empty:
	        blanks = [None, None]
	        for _ in descriptors:
	            blanks.extend([None, pd.DataFrame()])
	        return tuple(blanks) + ("No data for selected classes.",)

	    outputs = [create_frequency_plot(subset), create_scatter_plot(subset)]
	    for desc in descriptors:
	        outputs.append(create_boxplot(subset, desc))
	        outputs.append(mannwhitney_test(subset, desc))
	    # Release the pyplot-managed figures; the Figure objects themselves are
	    # still returned to the caller.
	    plt.close('all')
	    outputs.append(f"EDA complete for {len(subset)} compounds.")
	    return tuple(outputs)

	def create_frequency_plot(df):
	    """Return a bar-chart figure of how many rows fall in each bioactivity class."""
	    class_colors = {'active': '#1f77b4', 'inactive': '#ff7f0e', 'intermediate': '#2ca02c'}
	    plt.figure(figsize=(5.5, 5.5))
	    counts = df['bioactivity_class'].value_counts()
	    sns.barplot(x=counts.index, y=counts.values, palette=class_colors)
	    plt.xlabel('Bioactivity Class', fontsize=12)
	    plt.ylabel('Frequency', fontsize=12)
	    plt.title('Frequency of Bioactivity Classes', fontsize=14)
	    return plt.gcf()
	def create_scatter_plot(df):
	    """Return a MW-vs-LogP scatter figure, colored by class and sized by pIC50."""
	    class_colors = {'active': '#1f77b4', 'inactive': '#ff7f0e', 'intermediate': '#2ca02c'}
	    plt.figure(figsize=(5.5, 5.5))
	    sns.scatterplot(
	        data=df,
	        x='MW',
	        y='LogP',
	        hue='bioactivity_class',
	        size='pIC50',
	        palette=class_colors,
	        sizes=(20, 200),
	        alpha=0.7,
	    )
	    plt.xlabel('Molecular Weight (MW)', fontsize=12)
	    plt.ylabel('LogP', fontsize=12)
	    plt.title('Chemical Space: MW vs. LogP', fontsize=14)
	    plt.legend(title='Bioactivity Class')
	    return plt.gcf()
	def create_boxplot(df, descriptor):
	    """Return a box-plot figure of *descriptor* grouped by bioactivity class."""
	    class_colors = {'active': '#1f77b4', 'inactive': '#ff7f0e', 'intermediate': '#2ca02c'}
	    plt.figure(figsize=(5.5, 5.5))
	    sns.boxplot(x='bioactivity_class', y=descriptor, data=df, palette=class_colors)
	    plt.xlabel('Bioactivity Class', fontsize=12)
	    plt.ylabel(descriptor, fontsize=12)
	    plt.title(f'{descriptor} by Bioactivity Class', fontsize=14)
	    return plt.gcf()
	def mannwhitney_test(df, descriptor):
	    """Run pairwise Mann-Whitney U tests of *descriptor* between classes.

	    The pairs active/inactive, active/intermediate and inactive/intermediate
	    are tested when both classes occur in *df* and both samples are
	    non-empty after dropping NaN. Returns one DataFrame row per test with
	    the columns Comparison, Statistics, p-value and Interpretation.
	    """
	    class_pairs = [('active', 'inactive'), ('active', 'intermediate'), ('inactive', 'intermediate')]
	    present = df['bioactivity_class'].unique()

	    rows = []
	    for first, second in class_pairs:
	        if first not in present or second not in present:
	            continue
	        sample_a = df[df.bioactivity_class == first][descriptor].dropna()
	        sample_b = df[df.bioactivity_class == second][descriptor].dropna()
	        if sample_a.empty or sample_b.empty:
	            continue
	        stat, p = mannwhitneyu(sample_a, sample_b)
	        if p <= 0.05:
	            verdict = 'Different distribution (p < 0.05)'
	        else:
	            verdict = 'Same distribution (p > 0.05)'
	        rows.append({
	            'Comparison': f'{first.title()} vs {second.title()}',
	            'Statistics': stat,
	            'p-value': p,
	            'Interpretation': verdict,
	        })
	    return pd.DataFrame(rows)

	# ==============================================================================
	# === STEP 2: FEATURE ENGINEERING FUNCTIONS ===
	# ==============================================================================
	# Replacement for mols2grid.display in Step 2
	def create_molecule_grid_html(df, smiles_col='canonical_smiles', max_mols=20):
	    """Render the first *max_mols* rows of *df* as an HTML grid of molecules.

	    Each card shows a 200x200 PNG drawing (embedded as base64), the row's
	    'pIC50' value with two decimals, and the SMILES text.

	    Args:
	        df: DataFrame containing *smiles_col* and a 'pIC50' column.
	        smiles_col: name of the column holding SMILES strings.
	        max_mols: number of leading rows considered.

	    Returns:
	        One HTML string. Rows whose SMILES is not a string or cannot be
	        parsed by RDKit are skipped.
	    """
	    import html  # local import: only needed to escape text placed in markup

	    html_parts = ['<div style="display: flex; flex-wrap: wrap; gap: 10px;">']
	    for _, row in df.head(max_mols).iterrows():
	        smiles = row[smiles_col]
	        pic50 = row['pIC50']
	        # Missing cells come through as NaN/None rather than text; skip them
	        # instead of handing a non-string to the SMILES parser.
	        if not isinstance(smiles, str):
	            continue
	        mol = Chem.MolFromSmiles(smiles)
	        if not mol:
	            continue
	        # Generate molecule image and embed it as base64 PNG
	        img = Draw.MolToImage(mol, size=(200, 200))
	        buffered = BytesIO()
	        img.save(buffered, format="PNG")
	        img_str = base64.b64encode(buffered.getvalue()).decode()
	        # The SMILES text is data, not markup: escape it before embedding.
	        safe_smiles = html.escape(smiles)
	        mol_html = f'''
	<div style="border: 1px solid #ccc; padding: 10px; border-radius: 5px; text-align: center;">
	<img src="data:image/png;base64,{img_str}" alt="Molecule" style="max-width: 200px;">
	<p><strong>pIC50:</strong> {pic50:.2f}</p>
	<p style="font-size: 10px; word-break: break-all;">{safe_smiles}</p>
	</div>
	'''
	        html_parts.append(mol_html)
	    html_parts.append('</div>')
	    return ''.join(html_parts)

	def calculate_fingerprints(current_state, fingerprint_type, progress=gr.Progress()):
	    """Compute PaDEL fingerprints for the cleaned data (generator).

	    Yields 5-tuples (status message, DataFrame or None, download-button
	    update, molecule-grid update or None, state). On success the merged
	    frame is stored in the state under 'fingerprint_data' and the chosen
	    type under 'fingerprint_type'.

	    Args:
	        current_state: state dict; must hold 'cleaned_data' from Step 1.
	        fingerprint_type: key into fp_config selecting the descriptor XML.
	        progress: Gradio progress tracker.

	    Raises:
	        gr.Error: on missing input, an unknown fingerprint type, or any
	            failure during the calculation.
	    """
	    import tempfile  # local import: scratch files live in a per-call directory

	    input_df = current_state.get('cleaned_data')
	    if input_df is None or input_df.empty:
	        raise gr.Error("No cleaned data found. Please complete Step 1.")
	    if not fingerprint_type:
	        raise gr.Error("Please select a fingerprint type.")
	    progress(0, desc="Starting...")
	    yield "🧪 Starting fingerprint calculation...", None, gr.update(visible=False), None, current_state
	    try:
	        # A private temporary directory per call keeps concurrent sessions
	        # from overwriting each other's scratch files, and it is removed
	        # automatically when the block exits.
	        # NOTE(review): confirm padelpy copes with the temp directory path on
	        # the deployment platform (e.g. no whitespace in the path).
	        with tempfile.TemporaryDirectory(prefix="padel_fp_") as work_dir:
	            smi_file = os.path.join(work_dir, 'molecule.smi')
	            output_csv = os.path.join(work_dir, 'fingerprints.csv')

	            # The SMILES string doubles as the molecule name so the PaDEL
	            # output can be merged back on 'canonical_smiles'.
	            input_df[['canonical_smiles', 'canonical_smiles']].to_csv(smi_file, sep='\t', index=False, header=False)

	            descriptortypes = fp_config.get(fingerprint_type)
	            if not descriptortypes:
	                raise gr.Error(f"Descriptor XML for '{fingerprint_type}' not found.")

	            progress(0.3, desc="⚗️ Running PaDEL...")
	            yield "⚗️ Running PaDEL...", None, gr.update(visible=False), None, current_state
	            padeldescriptor(mol_dir=smi_file, d_file=output_csv, descriptortypes=descriptortypes, detectaromaticity=True, standardizenitro=True, standardizetautomers=True, threads=-1, removesalt=True, log=False, fingerprints=True)
	            if not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0:
	                raise gr.Error("PaDEL failed to produce an output file. Check molecule validity.")

	            progress(0.7, desc="📊 Processing results...")
	            yield "📊 Processing results...", None, gr.update(visible=False), None, current_state
	            df_X = pd.read_csv(output_csv).rename(columns={'Name': 'canonical_smiles'})

	        # 'inner' keeps only molecules for which fingerprints were produced.
	        final_df = pd.merge(input_df[['canonical_smiles', 'pIC50']], df_X, on='canonical_smiles', how='inner')

	        current_state['fingerprint_data'] = final_df
	        current_state['fingerprint_type'] = fingerprint_type
	        progress(0.9, desc="🖼️ Generating molecule grid...")
	        mols_html = create_molecule_grid_html(final_df)
	        success_msg = f"✅ Success! Generated {len(df_X.columns) -1} descriptors for {len(final_df)} molecules."
	        progress(1, desc="Completed!")
	        yield success_msg, final_df, gr.update(visible=True), gr.update(value=mols_html, visible=True), current_state
	    except gr.Error:
	        # Already a user-facing message; do not wrap it a second time.
	        raise
	    except Exception as e:
	        raise gr.Error(f"Calculation failed: {e}") from e

	# ==============================================================================
	# === STEP 3: MODEL TRAINING & PREDICTION FUNCTIONS ===
	# ==============================================================================
	class ModelPlotter:
	    """Produce diagnostic plots for fitted models against a held-out test set."""

	    def __init__(self, models: dict, X_test: pd.DataFrame, y_test: pd.Series):
	        self._models = models
	        self._X_test = X_test
	        self._y_test = y_test

	    def plot_validation(self, model_name: str):
	        """Return a 2x2 figure of validation plots for *model_name*.

	        Panels: actual vs. predicted, residuals vs. predicted, residual
	        histogram with KDE, and a normal Q-Q plot of the residuals.

	        Raises:
	            ValueError: if *model_name* is not among the stored models.
	        """
	        if model_name not in self._models:
	            raise ValueError(f"Model '{model_name}' not found.")
	        y_true = self._y_test
	        y_pred = self._models[model_name].predict(self._X_test)
	        residuals = y_true - y_pred

	        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
	        fig.suptitle(f'Model Validation Plots for {model_name}', fontsize=16, y=1.02)
	        ax_fit, ax_resid = axes[0, 0], axes[0, 1]
	        ax_hist, ax_qq = axes[1, 0], axes[1, 1]

	        sns.scatterplot(x=y_true, y=y_pred, ax=ax_fit, alpha=0.6)
	        ax_fit.set_title('Actual vs. Predicted')
	        ax_fit.set_xlabel('Actual pIC50')
	        ax_fit.set_ylabel('Predicted pIC50')
	        # Identity line spanning the joint range of actual and predicted values.
	        lims = [min(y_true.min(), y_pred.min()), max(y_true.max(), y_pred.max())]
	        ax_fit.plot(lims, lims, 'r--', alpha=0.75, zorder=0)

	        sns.scatterplot(x=y_pred, y=residuals, ax=ax_resid, alpha=0.6)
	        ax_resid.axhline(y=0, color='r', linestyle='--')
	        ax_resid.set_title('Residuals vs. Predicted')
	        ax_resid.set_xlabel('Predicted pIC50')
	        ax_resid.set_ylabel('Residuals')

	        sns.histplot(residuals, kde=True, ax=ax_hist)
	        ax_hist.set_title('Distribution of Residuals')

	        stats.probplot(residuals, dist="norm", plot=ax_qq)
	        ax_qq.set_title('Normal Q-Q Plot')

	        plt.tight_layout()
	        return fig

	    def plot_feature_importance(self, model_name: str, top_n: int = 7):
	        """Return a bar-chart figure of the *top_n* most important features.

	        Importance comes from `feature_importances_` when the model has it,
	        otherwise from the absolute value of `coef_`. Returns None when the
	        model exposes neither attribute.

	        Raises:
	            ValueError: if *model_name* is not among the stored models.
	        """
	        if model_name not in self._models:
	            raise ValueError(f"Model '{model_name}' not found.")
	        model = self._models[model_name]
	        if hasattr(model, 'feature_importances_'):
	            importances = model.feature_importances_
	        elif hasattr(model, 'coef_'):
	            importances = np.abs(model.coef_)
	        else:
	            return None

	        ranking = pd.DataFrame({'Feature': self._X_test.columns, 'Importance': importances})
	        top_features = ranking.sort_values(by='Importance', ascending=False).head(top_n)

	        plt.figure(figsize=(10, top_n * 0.5))
	        sns.barplot(x='Importance', y='Feature', data=top_features, palette='viridis', orient='h')
	        plt.title(f'Top {top_n} Features for {model_name}')
	        plt.tight_layout()
	        return plt.gcf()

	def run_regression_suite(df: pd.DataFrame, progress=gr.Progress()):
	    """Train and score a fixed suite of regressors on *df* (generator).

	    The 'pIC50' column is the target; every other column except
	    'canonical_smiles' is a feature. Data is split 80/20 with
	    random_state=42, features with variance at or below 0.1 are removed,
	    and each model is fitted on the training part and scored on the test
	    part (R², MAE, RMSE, elapsed seconds).

	    Yields 3-tuples (status message, ModelRunResult or None, dropdown or
	    None). Only the final yield carries the result object and a gr.Dropdown
	    listing the models ranked by R².
	    """
	    progress(0, desc="Splitting data...")
	    yield "Splitting data (80/20 train/test split)...", None, None
	    X = df.drop(columns=['pIC50', 'canonical_smiles'], errors='ignore')
	    y = df['pIC50']
	    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

	    progress(0.1, desc="Selecting features...")
	    yield "Performing feature selection (removing low variance)...", None, None
	    selector = VarianceThreshold(threshold=0.1)
	    train_matrix = selector.fit_transform(X_train)
	    # The selector is fitted on the training split only; the same column
	    # mask is then applied to the test split.
	    kept_columns = X.columns[selector.get_support()]
	    X_train = pd.DataFrame(train_matrix, columns=kept_columns, index=X_train.index)
	    X_test = pd.DataFrame(selector.transform(X_test), columns=kept_columns, index=X_test.index)
	    selected_features = X_train.columns.tolist()

	    model_defs = [
	        ('Linear Regression', LinearRegression()),
	        ('Ridge', Ridge(random_state=42)),
	        ('Lasso', Lasso(random_state=42)),
	        ('Random Forest', RandomForestRegressor(random_state=42, n_jobs=-1)),
	        ('Gradient Boosting', GradientBoostingRegressor(random_state=42)),
	    ]
	    if _has_extra_libs:
	        model_defs.extend([
	            ('XGBoost', xgb.XGBRegressor(random_state=42, n_jobs=-1, verbosity=0)),
	            ('LightGBM', lgb.LGBMRegressor(random_state=42, n_jobs=-1, verbosity=-1)),
	            ('CatBoost', cb.CatBoostRegressor(random_state=42, verbose=0)),
	        ])

	    results_list = []
	    trained_models = {}
	    total = len(model_defs)
	    for position, (name, model) in enumerate(model_defs):
	        progress(0.2 + (position / total) * 0.8, desc=f"Training {name}...")
	        yield f"Training {position+1}/{total}: {name}...", None, None
	        start_time = time.time()
	        model.fit(X_train, y_train)
	        y_pred = model.predict(X_test)
	        results_list.append({
	            'Model': name,
	            'R²': r2_score(y_test, y_pred),
	            'MAE': mean_absolute_error(y_test, y_pred),
	            'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
	            'Time (s)': f"{time.time() - start_time:.2f}",
	        })
	        trained_models[name] = model

	    results_df = pd.DataFrame(results_list).sort_values(by='R²', ascending=False).reset_index(drop=True)
	    plotter = ModelPlotter(trained_models, X_test, y_test)
	    model_run_results = ModelRunResult(results_df, plotter, trained_models, selected_features)

	    ranked_names = results_df['Model'].tolist()
	    yield "✅ Model training & evaluation complete.", model_run_results, gr.Dropdown(choices=ranked_names, interactive=True)

	# Replacement for mols2grid.display in Step 3
	def create_prediction_grid_html(df, smiles_col='canonical_smiles', pred_col='predicted_pIC50', max_mols=20):
	    """Render the first *max_mols* rows of *df* as an HTML grid of predictions.

	    Each card shows a 200x200 PNG drawing (embedded as base64), the value
	    of *pred_col* with two decimals, and the SMILES text.

	    Args:
	        df: DataFrame containing *smiles_col* and *pred_col*.
	        smiles_col: name of the column holding SMILES strings.
	        pred_col: name of the column holding predicted pIC50 values.
	        max_mols: number of leading rows considered.

	    Returns:
	        One HTML string. Rows with a missing prediction, a non-string
	        SMILES, or a SMILES RDKit cannot parse are skipped.
	    """
	    import html  # local import: only needed to escape text placed in markup

	    html_parts = ['<div style="display: flex; flex-wrap: wrap; gap: 10px;">']
	    for _, row in df.head(max_mols).iterrows():
	        smiles = row[smiles_col]
	        pred_pic50 = row[pred_col]
	        if pd.isna(pred_pic50):
	            continue
	        # Missing cells come through as NaN/None rather than text; skip them
	        # instead of handing a non-string to the SMILES parser.
	        if not isinstance(smiles, str):
	            continue
	        mol = Chem.MolFromSmiles(smiles)
	        if not mol:
	            continue
	        # Generate molecule image and embed it as base64 PNG
	        img = Draw.MolToImage(mol, size=(200, 200))
	        buffered = BytesIO()
	        img.save(buffered, format="PNG")
	        img_str = base64.b64encode(buffered.getvalue()).decode()
	        # The SMILES text originates from the input data, not from this
	        # module: escape it before embedding it in markup.
	        safe_smiles = html.escape(smiles)
	        mol_html = f'''
	<div style="border: 1px solid #ccc; padding: 10px; border-radius: 5px; text-align: center;">
	<img src="data:image/png;base64,{img_str}" alt="Molecule" style="max-width: 200px;">
	<p><strong>Predicted pIC50:</strong> {pred_pic50:.2f}</p>
	<p style="font-size: 10px; word-break: break-all;">{safe_smiles}</p>
	</div>
	'''
	        html_parts.append(mol_html)
	    html_parts.append('</div>')
	    return ''.join(html_parts)

	def predict_on_upload(uploaded_file, model_name, current_state, progress=gr.Progress()):
	    """Predict pIC50 for molecules in an uploaded CSV (generator).

	    The CSV must contain a 'canonical_smiles' column. Fingerprints are
	    computed with the fingerprint type stored in the state, aligned to the
	    features selected during training (missing columns are filled with 0),
	    and passed to the chosen trained model.

	    Yields 3-tuples (status message, results DataFrame or None, HTML grid
	    or None); only the final yield carries the results.

	    Args:
	        uploaded_file: the uploaded CSV, as a path or an object with `.name`.
	        model_name: key of a model trained in Step 3.
	        current_state: state dict holding 'model_results' and
	            'fingerprint_type'.
	        progress: Gradio progress tracker.

	    Raises:
	        gr.Error: on missing inputs, an unknown model or fingerprint type,
	            a CSV without 'canonical_smiles', or a failed PaDEL run.
	    """
	    import tempfile  # local import: scratch files live in a per-call directory

	    if not uploaded_file:
	        raise gr.Error("Please upload a file.")
	    if not model_name:
	        raise gr.Error("Please select a trained model.")
	    model_run_results = current_state.get('model_results')
	    fingerprint_type = current_state.get('fingerprint_type')
	    if not model_run_results or not fingerprint_type:
	        raise gr.Error("Please run Steps 2 and 3 first.")

	    model = model_run_results.models.get(model_name)
	    selected_features = model_run_results.selected_features
	    if model is None:
	        raise gr.Error(f"Model '{model_name}' not found.")

	    descriptortypes = fp_config.get(fingerprint_type)
	    if not descriptortypes:
	        raise gr.Error(f"Descriptor XML for '{fingerprint_type}' not found.")

	    # NOTE(review): depending on the Gradio version / File component type the
	    # upload may arrive as a plain path string or as an object exposing
	    # `.name`; accept both.
	    upload_path = getattr(uploaded_file, 'name', uploaded_file)

	    # A private temporary directory per call keeps concurrent sessions from
	    # overwriting each other's scratch files; it is removed on exit.
	    with tempfile.TemporaryDirectory(prefix="padel_predict_") as work_dir:
	        smi_file = os.path.join(work_dir, 'predict.smi')
	        output_csv = os.path.join(work_dir, 'predict_fp.csv')

	        progress(0, desc="Reading & processing new molecules...")
	        yield "Reading uploaded file...", None, None
	        df_new = pd.read_csv(upload_path)
	        if 'canonical_smiles' not in df_new.columns:
	            raise gr.Error("CSV must contain a 'canonical_smiles' column.")
	        # The row position serves as the molecule name handed to PaDEL so the
	        # fingerprints can be joined back to the uploaded rows.
	        df_new = df_new.reset_index().rename(columns={'index': 'mol_id'})

	        padel_input = pd.DataFrame({'smiles': df_new['canonical_smiles'], 'name': df_new['mol_id']})
	        padel_input.to_csv(smi_file, sep='\t', index=False, header=False)

	        progress(0.3, desc="Calculating fingerprints...")
	        yield "Calculating fingerprints for new molecules...", None, None
	        # standardizetautomers=True matches the options used when the training
	        # fingerprints were computed, so both sets are generated identically.
	        padeldescriptor(mol_dir=smi_file, d_file=output_csv, descriptortypes=descriptortypes, detectaromaticity=True, standardizenitro=True, standardizetautomers=True, threads=-1, removesalt=True, log=False, fingerprints=True)
	        if not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0:
	            raise gr.Error("PaDEL calculation failed for the uploaded molecules.")

	        progress(0.7, desc="Aligning features and predicting...")
	        yield "Aligning features and predicting...", None, None
	        df_fp = pd.read_csv(output_csv).rename(columns={'Name': 'mol_id'})

	    X_new = df_fp.set_index('mol_id')
	    # Same columns, same order as during training; absent bits become 0.
	    X_new_aligned = X_new.reindex(columns=selected_features, fill_value=0)

	    predictions = model.predict(X_new_aligned)

	    results_subset = pd.DataFrame({'mol_id': X_new_aligned.index, 'predicted_pIC50': predictions})
	    # Left join keeps uploaded rows without fingerprints (prediction is NaN).
	    df_results = pd.merge(df_new, results_subset, on='mol_id', how='left')

	    progress(0.9, desc="Generating visualization...")
	    yield "Generating visualization...", None, None

	    # Replacement for mols2grid.display in Step 3
	    df_grid_view = df_results.dropna(subset=['predicted_pIC50']).copy()
	    mols_html = "<h3>No molecules with successful predictions to display.</h3>"
	    if not df_grid_view.empty:
	        mols_html = create_prediction_grid_html(df_grid_view)

	    progress(1, desc="Complete!")
	    yield "✅ Prediction complete.", df_results[['canonical_smiles', 'predicted_pIC50']], mols_html
	# ==============================================================================
	# === GRADIO INTERFACE ===
	# ==============================================================================
	# Top-level Gradio layout: a three-tab workflow (data collection & EDA,
	# feature engineering, model training & prediction). The component variables
	# created here are referenced by the event handlers and listeners below.
	with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="sky"), title="Comprehensive Drug Discovery Workflow") as demo:
	gr.Markdown("# 🧪 Comprehensive Drug Discovery Workflow")
	gr.Markdown("A 3-step application to fetch, analyze, and model chemical bioactivity data.")
	# Shared state dict; the handlers store 'cleaned_data', 'fingerprint_data',
	# 'fingerprint_type' and 'model_results' in it.
	app_state = gr.State({})
	with gr.Tabs():
	# ---- Step 1: target search, data processing and EDA outputs ----
	with gr.Tab("Step 1: Data Collection & EDA"):
	gr.Markdown("## Fetch Bioactivity Data from ChEMBL and Perform Exploratory Analysis")
	gr.Markdown(
	"This app allows you to fetch bioactivity data, perform exploratory data analysis, "
	"engineer molecular features, and train machine learning models for drug discovery. "
	"For an efficient example, use 'coronavirus' as the target query and select 'CHEMBL3927'."
	)
	with gr.Row():
	query_input = gr.Textbox(label="Target Query", placeholder="e.g., acetylcholinesterase, BRAF kinase", scale=3)
	fetch_btn = gr.Button("Fetch Targets", variant="primary", scale=1)
	status_step1_fetch = gr.Textbox(label="Status", interactive=False)
	target_id_table = gr.Dataframe(label="Available Targets", interactive=False, headers=["target_chembl_id", "pref_name", "organism"])
	with gr.Row():
	selected_target_dropdown = gr.Dropdown(label="Select Target ChEMBL ID", interactive=True, scale=3)
	# Starts disabled; enabled by a listener once a target ID is selected.
	process_btn = gr.Button("Process Data & Run EDA", variant="primary", scale=1, interactive=False)
	status_step1_process = gr.Textbox(label="Status", interactive=False)
	gr.Markdown("### Filtered Data & Analysis")
	bioactivity_class_selector = gr.CheckboxGroup(["active", "inactive", "intermediate"], label="Filter by Bioactivity Class", value=["active", "inactive", "intermediate"])
	df_output_s1 = gr.Dataframe(label="Cleaned Bioactivity Data")
	# One sub-tab per EDA view; each holds a plot and (where applicable) the
	# matching Mann-Whitney results table.
	with gr.Tabs():
	with gr.Tab("Chemical Space Overview"):
	with gr.Row():
	freq_plot_output = gr.Plot(label="Frequency of Bioactivity Classes")
	scatter_plot_output = gr.Plot(label="Scatter Plot: MW vs LogP")
	with gr.Tab("pIC50 Analysis"):
	with gr.Row():
	pic50_plot_output = gr.Plot(label="pIC50 Box Plot")
	pic50_stats_output = gr.Dataframe(label="Mann-Whitney U Test Results for pIC50")
	with gr.Tab("Molecular Weight Analysis"):
	with gr.Row():
	mw_plot_output = gr.Plot(label="MW Box Plot")
	mw_stats_output = gr.Dataframe(label="Mann-Whitney U Test Results for MW")
	with gr.Tab("LogP Analysis"):
	with gr.Row():
	logp_plot_output = gr.Plot(label="LogP Box Plot")
	logp_stats_output = gr.Dataframe(label="Mann-Whitney U Test Results for LogP")
	with gr.Tab("H-Bond Donor/Acceptor Analysis"):
	with gr.Row():
	hdonors_plot_output = gr.Plot(label="H-Donors Box Plot")
	hacceptors_plot_output = gr.Plot(label="H-Acceptors Box Plot")
	with gr.Row():
	hdonors_stats_output = gr.Dataframe(label="Stats for H-Donors")
	hacceptors_stats_output = gr.Dataframe(label="Stats for H-Acceptors")
	# ---- Step 2: fingerprint selection, results table, download, grid ----
	with gr.Tab("Step 2: Feature Engineering"):
	gr.Markdown("## Calculate Molecular Fingerprints using PaDEL")
	with gr.Row():
	# Choices come from the descriptor XML files discovered at import time;
	# 'PubChem' is preselected when it is among them.
	fingerprint_dropdown = gr.Dropdown(choices=FP_list, value='PubChem' if 'PubChem' in FP_list else None, label="Select Fingerprint Method", scale=3)
	calculate_fp_btn = gr.Button("Calculate Fingerprints", variant="primary", scale=1)
	status_step2 = gr.Textbox(label="Status", interactive=False)
	output_df_s2 = gr.Dataframe(label="Final Processed Data", wrap=True)
	# Hidden until a fingerprint calculation has succeeded.
	download_s2 = gr.DownloadButton("Download Feature Data (CSV)", variant="secondary", visible=False)
	mols_grid_s2 = gr.HTML(label="Interactive Molecule Viewer")
	# ---- Step 3: model training/evaluation and prediction on uploads ----
	with gr.Tab("Step 3: Model Training & Prediction"):
	gr.Markdown("## Train Regression Models and Predict pIC50")
	with gr.Tabs():
	with gr.Tab("Model Training & Evaluation"):
	train_models_btn = gr.Button("Train All Models", variant="primary")
	status_step3_train = gr.Textbox(label="Status", interactive=False)
	model_results_df = gr.DataFrame(label="Ranked Model Results", interactive=False)
	with gr.Row():
	# Starts non-interactive; the training handler replaces it with an
	# interactive dropdown listing the trained models.
	model_selector_s3 = gr.Dropdown(label="Select Model to Analyze", interactive=False)
	feature_count_s3 = gr.Number(label="Top Features to Show", value=7, minimum=3, maximum=20, step=1)
	with gr.Tabs():
	with gr.Tab("Validation Plots"): validation_plot_s3 = gr.Plot(label="Model Validation Plots")
	with gr.Tab("Feature Importance"): feature_plot_s3 = gr.Plot(label="Top Feature Importances")
	with gr.Tab("Predict on New Data"):
	gr.Markdown("Upload a CSV with a `canonical_smiles` column to predict pIC50.")
	with gr.Row():
	upload_predict_file = gr.File(label="Upload CSV for Prediction", file_types=[".csv"])
	predict_btn_s3 = gr.Button("Run Prediction", variant="primary")
	status_step3_predict = gr.Textbox(label="Status", interactive=False)
	prediction_results_df = gr.DataFrame(label="Prediction Results")
	prediction_mols_grid = gr.HTML(label="Interactive Molecular Grid of Predictions")
	# --- EVENT HANDLERS ---
	def enable_process_button(target_id):
	    """Return an update making the target component interactive iff *target_id* is truthy."""
	    has_target = bool(target_id)
	    return gr.update(interactive=has_target)
	def process_and_analyze_wrapper(target_id, selected_classes, current_state, progress=gr.Progress()):
	    """Fetch, clean and analyze the data of *target_id* (generator).

	    Yields dicts keyed by output component: first the fetch status, then
	    the cleaned table with its status, and finally the class-filtered
	    table, all EDA plots/tables, the EDA status and the updated state
	    (which now holds the cleaned data under 'cleaned_data').

	    Raises:
	        gr.Error: if *target_id* is empty, or from the called steps.
	    """
	    if not target_id:
	        raise gr.Error("Please select a target ChEMBL ID first.")

	    progress(0, desc="Fetching data...")
	    raw_data, fetch_msg = get_bioactivity_data(target_id)
	    yield {status_step1_process: gr.update(value=fetch_msg)}

	    progress(0.3, desc="Cleaning data...")
	    processed_data, clean_msg = clean_and_process_data(raw_data)
	    yield {df_output_s1: processed_data, status_step1_process: gr.update(value=clean_msg)}
	    current_state['cleaned_data'] = processed_data

	    progress(0.6, desc="Running EDA...")
	    plots_and_stats = run_eda_analysis(processed_data, selected_classes)
	    eda_msg = plots_and_stats[-1]
	    progress(1, desc="Done!")

	    if processed_data.empty:
	        filtered_data = pd.DataFrame()
	    else:
	        filtered_data = processed_data[processed_data.bioactivity_class.isin(selected_classes)]

	    output_components = [
	        df_output_s1,
	        freq_plot_output, scatter_plot_output,
	        pic50_plot_output, pic50_stats_output,
	        mw_plot_output, mw_stats_output,
	        logp_plot_output, logp_stats_output,
	        hdonors_plot_output, hdonors_stats_output,
	        hacceptors_plot_output, hacceptors_stats_output,
	        status_step1_process, app_state,
	    ]
	    outputs = [filtered_data, *plots_and_stats[:-1], eda_msg, current_state]
	    yield dict(zip(output_components, outputs))
	def update_analysis_on_filter_change(selected_classes, current_state):
	    """Recompute the EDA outputs after the bioactivity-class filter changes.

	    Returns a 14-tuple matching the 14 output components of the listener
	    bound to this handler: the filtered table, the 12 plots/statistics
	    tables produced by run_eda_analysis, and a status message.

	    Args:
	        selected_classes: bioactivity classes currently ticked.
	        current_state: state dict; 'cleaned_data' is read from it.
	    """
	    cleaned_data = current_state.get('cleaned_data')
	    if cleaned_data is None or cleaned_data.empty:
	        # 1 table + 12 plot/stat slots + 1 status = 14 values. Returning
	        # fewer values than there are output components makes the event fail.
	        return (pd.DataFrame(),) + (None,) * 12 + ("No data available.",)
	    plots_and_stats = run_eda_analysis(cleaned_data, selected_classes)
	    msg = plots_and_stats[-1]
	    filtered_data = cleaned_data[cleaned_data.bioactivity_class.isin(selected_classes)]
	    return (filtered_data,) + plots_and_stats[:-1] + (msg,)
	def handle_model_training(current_state, progress=gr.Progress(track_tqdm=True)):
	    """Run the regression suite on the stored fingerprint data (generator).

	    Relays every update from run_regression_suite as a 4-tuple
	    (status message, results table or None, dropdown update, state). When
	    an update carries a result object it is saved in the state under
	    'model_results' before being relayed.

	    Raises:
	        gr.Error: if the state holds no (or empty) 'fingerprint_data'.
	    """
	    fingerprint_data = current_state.get('fingerprint_data')
	    if fingerprint_data is None or fingerprint_data.empty:
	        raise gr.Error("No feature data. Please complete Step 2.")

	    updates = run_regression_suite(fingerprint_data, progress=progress)
	    for status_msg, model_results, model_choices_update in updates:
	        if model_results:
	            current_state['model_results'] = model_results
	            results_table = model_results.dataframe
	        else:
	            results_table = None
	        yield status_msg, results_table, model_choices_update, current_state
	def save_dataframe_as_csv(df):
	    """Write *df* to 'feature_engineered_data.csv' and return it as a gr.File.

	    Returns None without writing anything when *df* is None or empty.
	    """
	    nothing_to_save = df is None or df.empty
	    if nothing_to_save:
	        return None
	    filename = "feature_engineered_data.csv"
	    df.to_csv(filename, index=False)
	    return gr.File(value=filename, visible=True)
	def update_analysis_plots(model_name, feature_count, current_state):
	    """Return (validation figure, feature-importance figure) for a model.

	    Returns (None, None) when the state holds no 'model_results' or no
	    model name is given. *feature_count* is converted to int before use.
	    """
	    model_results = current_state.get('model_results')
	    if not (model_results and model_name):
	        return None, None
	    plotter = model_results.plotter
	    validation_fig = plotter.plot_validation(model_name)
	    feature_fig = plotter.plot_feature_importance(model_name, int(feature_count))
	    # Release pyplot's references; the figures are still returned.
	    plt.close('all')
	    return validation_fig, feature_fig

	# --- EVENT WIRING ---
	# Step 1: search targets, enable the process button once a target is chosen,
	# then fetch/clean/analyze on click and re-run the EDA when the class filter
	# changes.
	fetch_btn.click(fn=get_target_chembl_id, inputs=query_input, outputs=[target_id_table, selected_target_dropdown, status_step1_fetch], show_progress="minimal")
	selected_target_dropdown.change(fn=enable_process_button, inputs=selected_target_dropdown, outputs=process_btn, show_progress="hidden")
	# 15 outputs: table, 12 plots/stat tables, status, and the shared state.
	process_btn.click(fn=process_and_analyze_wrapper, inputs=[selected_target_dropdown, bioactivity_class_selector, app_state], outputs=[df_output_s1, freq_plot_output, scatter_plot_output, pic50_plot_output, pic50_stats_output, mw_plot_output, mw_stats_output, logp_plot_output, logp_stats_output, hdonors_plot_output, hdonors_stats_output, hacceptors_plot_output, hacceptors_stats_output, status_step1_process, app_state])
	# 14 outputs: same as above without the shared state.
	bioactivity_class_selector.change(fn=update_analysis_on_filter_change, inputs=[bioactivity_class_selector, app_state], outputs=[df_output_s1, freq_plot_output, scatter_plot_output, pic50_plot_output, pic50_stats_output, mw_plot_output, mw_stats_output, logp_plot_output, logp_stats_output, hdonors_plot_output, hdonors_stats_output, hacceptors_plot_output, hacceptors_stats_output, status_step1_process], show_progress="minimal")
	# Step 2: compute fingerprints; the handler also updates the shared state.
	calculate_fp_btn.click(fn=calculate_fingerprints, inputs=[app_state, fingerprint_dropdown], outputs=[status_step2, output_df_s2, download_s2, mols_grid_s2, app_state])
	# The download handler reads the fingerprint table from the shared state
	# (key 'fingerprint_data') rather than from a displayed component.
	@download_s2.click(inputs=app_state, outputs=download_s2, show_progress="hidden")
	def download_handler(current_state):
	df_to_download = current_state.get('fingerprint_data')
	return save_dataframe_as_csv(df_to_download)
	# Step 3: train models, refresh the analysis plots whenever the selected
	# model or the feature count changes, and predict on an uploaded CSV.
	train_models_btn.click(fn=handle_model_training, inputs=[app_state], outputs=[status_step3_train, model_results_df, model_selector_s3, app_state])
	for listener in [model_selector_s3.change, feature_count_s3.change]: listener(fn=update_analysis_plots, inputs=[model_selector_s3, feature_count_s3, app_state], outputs=[validation_plot_s3, feature_plot_s3], show_progress="minimal")
	predict_btn_s3.click(fn=predict_on_upload, inputs=[upload_predict_file, model_selector_s3, app_state], outputs=[status_step3_predict, prediction_results_df, prediction_mols_grid])

	# Launch the Gradio app (with debug=True) only when run as a script.
	if __name__ == "__main__":
	demo.launch(debug=True)