|
|
|
|
|
|
|
|
|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
import requests |
|
import io |
|
import re |
|
from PIL import Image |
|
import base64 |
|
|
|
|
|
from rdkit import Chem |
|
from rdkit.Chem import Draw, AllChem, Descriptors |
|
from Bio import SeqIO |
|
|
|
|
|
from sklearn.ensemble import RandomForestClassifier |
|
from sklearn.model_selection import train_test_split |
|
|
|
|
|
import py3Dmol |
|
|
|
|
|
import warnings |
|
warnings.filterwarnings('ignore') |
|
|
|
|
|
# Page-wide configuration: browser-tab title and icon, wide layout, and a
# sidebar that starts collapsed.
st.set_page_config(
    page_title="AI Drug Discovery Pipeline",
    page_icon="🔬",  # restored from a mis-encoded byte sequence
    layout="wide",
    initial_sidebar_state="collapsed",
)
|
|
|
|
|
def apply_custom_styling():
    """Inject the app's dark-theme stylesheet (font, tabs, buttons) into the page."""
    # The <style> tag is kept at column 0 so the markdown renderer always
    # treats the whole thing as a raw HTML block.
    css_block = """
<style>
@import url('https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap');

html, body, [class*="st-"] {
    font-family: 'Roboto', sans-serif;
}

.stApp {
    background-color: rgb(28, 28, 28);
    color: white;
}

/* Tab styles */
.stTabs [data-baseweb="tab-list"] {
    gap: 24px;
}

.stTabs [data-baseweb="tab"] {
    height: 50px;
    white-space: pre-wrap;
    background: none;
    border-radius: 0px;
    border-bottom: 2px solid #333;
    padding: 10px 4px;
    color: #AAA;
}

.stTabs [data-baseweb="tab"]:hover {
    background: #222;
    color: #FFF;
}

.stTabs [aria-selected="true"] {
    border-bottom: 2px solid #00A0FF; /* Highlight color for active tab */
    color: #FFF;
}

/* Button styles */
.stButton>button {
    border-color: #00A0FF;
    color: #00A0FF;
}

.stButton>button:hover {
    border-color: #FFF;
    color: #FFF;
    background-color: #00A0FF;
}

</style>
"""
    # Raw HTML has to be explicitly allowed for the <style> element to be emitted.
    st.markdown(css_block, unsafe_allow_html=True)


# Apply the stylesheet once per script run, before any content is drawn.
apply_custom_styling()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@st.cache_data(show_spinner="Fetching PDB structure...")
def _download_pdb_text(pdb_id: str) -> str:
    """Download the PDB file for *pdb_id* from RCSB and return its text.

    Only successful downloads return (and are therefore cached); any failure
    is raised so that a transient error is never memoised.

    Raises:
        requests.HTTPError: the server answered with a status other than 200.
        requests.RequestException: any other transport-level failure.
    """
    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    response = requests.get(url, timeout=20)
    if response.status_code != 200:
        raise requests.HTTPError(
            f"unexpected status {response.status_code}", response=response
        )
    return response.text


def fetch_pdb_structure(pdb_id: str):
    """
    Fetches a PDB file and returns its content.

    Args:
        pdb_id: PDB identifier; surrounding whitespace is ignored.

    Returns:
        A ``(pdb_text, log)`` tuple. ``pdb_text`` is ``None`` when the
        structure could not be fetched; ``log`` is always a newline-terminated
        status message.
    """
    pdb_id = (pdb_id or "").strip()
    if not pdb_id:
        return None, "⚠️ No PDB ID provided. Please enter a PDB ID and try again.\n"

    try:
        pdb_text = _download_pdb_text(pdb_id)
    except requests.HTTPError as err:
        # `Response` is falsy for error statuses, so compare against None explicitly.
        status = err.response.status_code if err.response is not None else "unknown"
        return None, (
            f"⚠️ Failed to fetch PDB file for {pdb_id} (Status: {status}). "
            "Please check the PDB ID and try again.\n"
        )
    except Exception as e:
        # Best-effort boundary: report the problem in the log instead of
        # crashing the page.
        return None, f"❌ An error occurred while fetching PDB data: {e}\n"

    return pdb_text, f"✅ Successfully fetched PDB data for {pdb_id}.\n"
|
|
|
@st.cache_data(show_spinner="Fetching FASTA sequence...")
def _download_fasta_text(protein_id: str) -> str:
    """Download the FASTA text for *protein_id* from NCBI E-utilities.

    Only successful downloads return (and are therefore cached); failures are
    raised so they are retried on the next run.

    Raises:
        requests.HTTPError: the server answered with a status other than 200.
        requests.RequestException: any other transport-level failure.
    """
    response = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
        # Passing the query as params lets requests URL-encode the identifier.
        params={
            "db": "protein",
            "id": protein_id,
            "rettype": "fasta",
            "retmode": "text",
        },
        timeout=20,
    )
    if response.status_code != 200:
        raise requests.HTTPError(
            f"unexpected status {response.status_code}", response=response
        )
    return response.text


def fetch_fasta_sequence(protein_id: str):
    """
    Fetches a protein's FASTA sequence from NCBI.

    Args:
        protein_id: protein accession; surrounding whitespace is ignored.

    Returns:
        A newline-terminated log string: a short summary of the parsed record
        (id, description, length, first 60 residues) on success, otherwise a
        message describing the failure.
    """
    protein_id = (protein_id or "").strip()
    if not protein_id:
        return "⚠️ No protein ID provided.\n"

    try:
        fasta_text = _download_fasta_text(protein_id)
        # SeqIO.read expects exactly one record and raises otherwise; that is
        # reported through the generic handler below.
        parsed_fasta = SeqIO.read(io.StringIO(fasta_text), "fasta")
    except requests.HTTPError as err:
        # `Response` is falsy for error statuses, so compare against None explicitly.
        status = err.response.status_code if err.response is not None else "unknown"
        return f"⚠️ Failed to fetch FASTA file (Status: {status}).\n"
    except Exception as e:
        return f"❌ An error occurred while fetching FASTA data: {e}\n"

    return "".join([
        f"✅ Successfully fetched FASTA sequence for {protein_id}.\n\n",
        "--- Protein Sequence Information ---\n",
        f"ID: {parsed_fasta.id}\n",
        f"Description: {parsed_fasta.description}\n",
        f"Sequence Length: {len(parsed_fasta.seq)}\n",
        f"Sequence Preview: {parsed_fasta.seq[:60]}...\n",
    ])
|
|
|
def visualize_protein_3d(pdb_data: str, title="Protein 3D Structure"):
    """
    Generates an interactive 3D protein visualization using py3Dmol.

    Args:
        pdb_data: PDB file content; falsy input short-circuits with a message.
        title: label used only in the returned log message.

    Returns:
        ``(html, log)`` where ``html`` is the embeddable viewer markup, or
        ``None`` when no view could be produced. ``log`` is newline-terminated
        so callers can concatenate it with other log entries.
    """
    if not pdb_data:
        return None, "Cannot generate 3D view: No PDB data provided.\n"

    try:
        viewer = py3Dmol.view(width='100%', height=600)
        viewer.setBackgroundColor('#1C1C1C')
        viewer.addModel(pdb_data, "pdb")
        viewer.setStyle({'cartoon': {'color': 'spectrum', 'thickness': 0.8}})
        viewer.addSurface(py3Dmol.VDW, {'opacity': 0.3, 'color': 'lightblue'})
        viewer.zoomTo()
        # NOTE: _make_html() is a private py3Dmol method; it is used here to
        # obtain markup that can be embedded in a Streamlit HTML component.
        html = viewer._make_html()
    except Exception as e:
        # Best-effort: surface the failure in the log rather than crash the page.
        return None, f"❌ 3D visualization error: {e}\n"

    return html, f"✅ Generated 3D visualization for {title}.\n"
|
|
|
def create_sample_molecules():
    """
    Provide the default SMILES strings offered for the initial analysis.

    A fresh list is built on every call, so callers may mutate the result.
    """
    default_smiles = (
        "CC(=O)N[C@@H]1[C@@H](N)C=C(C(=O)O)O[C@H]1[C@H](O)[C@H](O)CO",
        "CC(C)C[C@H](NC(=O)C)C(=O)N[C@@H]1[C@@H](O)C=C(C(=O)O)O[C@H]1[C@H](O)[C@H](O)CO",
        "CC(C)CCCCCCCCCCCCCCCCCCCCCCCCCCCCC(=O)O",
        "CCO",
    )
    return list(default_smiles)
|
|
|
def calculate_molecular_properties(smiles_list: list):
    """
    Calculates key physicochemical properties for a list of molecules using RDKit.

    Args:
        smiles_list: SMILES strings to analyse.

    Returns:
        ``(df, log)``. ``df`` has one row per parseable SMILES with the columns
        Molecule, SMILES, MW, LogP, HBD, HBA, TPSA and RotBonds, rounded to two
        decimals; the columns are present even when no row is. Molecule names
        are numbered by input position, so skipped entries leave gaps. ``log``
        lists every skipped SMILES followed by a one-line summary.
    """
    columns = ['Molecule', 'SMILES', 'MW', 'LogP', 'HBD', 'HBA', 'TPSA', 'RotBonds']
    rows = []
    log_lines = []

    for position, smiles in enumerate(smiles_list, start=1):
        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            log_lines.append(f"⚠️ Invalid SMILES string skipped: {smiles}\n")
            continue
        rows.append({
            'Molecule': f'Compound_{position}',
            'SMILES': smiles,
            'MW': Descriptors.MolWt(mol),
            'LogP': Descriptors.MolLogP(mol),
            'HBD': Descriptors.NumHDonors(mol),
            'HBA': Descriptors.NumHAcceptors(mol),
            'TPSA': Descriptors.TPSA(mol),
            'RotBonds': Descriptors.NumRotatableBonds(mol),
        })

    # Passing `columns` keeps the schema stable when every SMILES was invalid.
    df = pd.DataFrame(rows, columns=columns).round(2)
    log_lines.append(f"✅ Calculated properties for {len(df)} valid molecules.\n")
    return df, "".join(log_lines)
|
|
|
def assess_drug_likeness(df: pd.DataFrame):
    """
    Assesses drug-likeness based on Lipinski's Rule of Five.

    A molecule passes a rule when MW <= 500, LogP <= 5, HBD <= 5 and
    HBA <= 10 respectively; it is flagged drug-like when it breaks at most one.

    Args:
        df: properties table containing the columns MW, LogP, HBD and HBA.

    Returns:
        ``(analysis_df, display_df, log)``:
        * ``analysis_df`` - a copy of ``df`` plus the boolean columns MW_OK,
          LogP_OK, HBD_OK, HBA_OK, the integer ``Lipinski_Violations`` and a
          boolean ``Drug_Like`` (suitable for plotting);
        * ``display_df`` - a copy of ``df`` plus ``Lipinski_Violations`` and a
          human-readable ``Drug_Like`` label;
        * ``log`` - a newline-terminated status message.
        For an empty ``df`` both frames are empty.
    """
    if df.empty:
        return pd.DataFrame(), pd.DataFrame(), "Cannot assess drug-likeness: No properties data.\n"

    analysis_df = df.copy()
    analysis_df['MW_OK'] = analysis_df['MW'] <= 500
    analysis_df['LogP_OK'] = analysis_df['LogP'] <= 5
    analysis_df['HBD_OK'] = analysis_df['HBD'] <= 5
    analysis_df['HBA_OK'] = analysis_df['HBA'] <= 10

    # A violation is a failed check, so count the False entries per row.
    rule_columns = ['MW_OK', 'LogP_OK', 'HBD_OK', 'HBA_OK']
    analysis_df['Lipinski_Violations'] = (~analysis_df[rule_columns]).sum(axis=1)
    analysis_df['Drug_Like'] = analysis_df['Lipinski_Violations'] <= 1

    display_df = df.copy()
    display_df['Lipinski_Violations'] = analysis_df['Lipinski_Violations']
    display_df['Drug_Like'] = analysis_df['Drug_Like'].map({True: '✅ Yes', False: '❌ No'})

    log = "✅ Assessed drug-likeness using Lipinski's Rule of Five.\n"
    return analysis_df, display_df, log
|
|
|
|
|
def plot_properties_dashboard(df: pd.DataFrame):
    """
    Creates a 2x2 dashboard of molecular property visualizations.

    Panels: MW vs LogP, HBD vs HBA, TPSA vs rotatable bonds (points coloured
    green/red by the ``Drug_Like`` flag) and a pie chart of the flag's
    distribution.

    Args:
        df: analysis table with the columns MW, LogP, HBD, HBA, TPSA, RotBonds
            and a boolean ``Drug_Like`` column.

    Returns:
        ``(fig, log)``; ``fig`` is ``None`` when the input is empty, lacks
        ``Drug_Like`` or that column is not boolean. ``log`` is
        newline-terminated.

    Side effects:
        Switches matplotlib to the 'dark_background' style for the process.
    """
    if df.empty or 'Drug_Like' not in df.columns:
        return None, "Cannot plot: No analysis data or 'Drug_Like' column missing.\n"

    if df['Drug_Like'].dtype != bool:
        return None, f"Cannot plot: 'Drug_Like' column must be boolean, but it is {df['Drug_Like'].dtype}.\n"

    plt.style.use('dark_background')
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    fig.suptitle("Molecular Properties Analysis", fontsize=16)

    # Transparent figure/axes backgrounds so the page background shows through.
    fig.patch.set_facecolor('none')
    for ax in axes.flat:
        ax.set_facecolor('none')

    point_colors = df['Drug_Like'].map({True: 'green', False: 'red'})

    # (axis, x column, y column, title, x label, y label,
    #  optional (x threshold, label), optional (y threshold, label))
    scatter_panels = [
        (axes[0, 0], 'MW', 'LogP', 'Molecular Weight vs LogP',
         'Molecular Weight (Da)', 'LogP', (500, 'MW < 500'), (5, 'LogP < 5')),
        (axes[0, 1], 'HBD', 'HBA', 'Hydrogen Bonding Properties',
         'Hydrogen Bond Donors', 'Hydrogen Bond Acceptors', (5, 'HBD < 5'), (10, 'HBA < 10')),
        (axes[1, 0], 'TPSA', 'RotBonds', 'TPSA vs Flexibility',
         'Topological Polar Surface Area (Å²)', 'Rotatable Bonds', None, None),
    ]
    for ax, x_col, y_col, title, x_label, y_label, x_rule, y_rule in scatter_panels:
        ax.scatter(df[x_col], df[y_col], c=point_colors, s=80, alpha=0.7)
        ax.set_title(title)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        if x_rule is not None:
            ax.axvline(x_rule[0], color='r', linestyle='--', alpha=0.6, label=x_rule[1])
        if y_rule is not None:
            ax.axhline(y_rule[0], color='r', linestyle='--', alpha=0.6, label=y_rule[1])
        if x_rule is not None or y_rule is not None:
            ax.legend()

    drug_like_counts = df['Drug_Like'].value_counts()
    labels = ['Drug-like' if flag else 'Non-drug-like' for flag in drug_like_counts.index]
    colors = ['green' if flag else 'red' for flag in drug_like_counts.index]
    axes[1, 1].pie(drug_like_counts.values, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
    axes[1, 1].set_title('Drug-likeness Distribution')

    # Leave room at the top for the suptitle.
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    return fig, "✅ Generated properties dashboard.\n"
|
|
|
|
|
def get_phase2_molecules():
    """Return the Phase 2 candidates as a ``{display name: SMILES}`` dict (new dict per call)."""
    candidates = [
        ('Oseltamivir (Tamiflu)', "CCC(CC)O[C@H]1[C@H]([C@@H]([C@H](C=C1C(=O)OCC)N)N)NC(=O)C"),
        ('Zanamivir (Relenza)', "C[C@H](N)C(=O)N[C@H]1[C@@H](O)C=C(O[C@H]1[C@@H](O)[C@H](O)CO)C(O)=O"),
        ('Aspirin', "CC(=O)OC1=CC=CC=C1C(=O)O"),
        ('Ibuprofen', "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O"),
    ]
    return dict(candidates)
|
|
|
def simulate_virtual_screening(smiles_dict: dict):
    """Assign each molecule a reproducible pseudo-random binding score and rank them.

    The scores are simulated, not docked: they are drawn uniformly from
    [2.0, 9.8) with a fixed seed, so the same input always yields the same
    table.

    Args:
        smiles_dict: mapping of molecule name to SMILES string.

    Returns:
        ``(df, log)``. ``df`` has the columns Molecule, SMILES,
        Predicted_Binding_Affinity (rounded to 2 decimals) and Ranking
        (1 = highest score), sorted by descending score. An empty input gives
        an empty frame with the same columns.
    """
    # A private legacy generator yields the same stream as
    # np.random.seed(42) + np.random.uniform without reseeding the global RNG.
    rng = np.random.RandomState(42)
    scores = rng.uniform(2.0, 9.8, len(smiles_dict))

    results = [
        {'Molecule': name, 'SMILES': smiles, 'Predicted_Binding_Affinity': round(score, 2)}
        for (name, smiles), score in zip(smiles_dict.items(), scores)
    ]
    # Explicit columns keep sort_values working when there are no rows.
    df = pd.DataFrame(results, columns=['Molecule', 'SMILES', 'Predicted_Binding_Affinity'])
    df = df.sort_values('Predicted_Binding_Affinity', ascending=False).reset_index(drop=True)
    df['Ranking'] = df.index + 1
    return df, f"✅ Simulated virtual screening for {len(df)} molecules.\n"
|
|
|
def predict_admet_properties(smiles_dict: dict):
    """Build an ADMET-style table: RDKit descriptors plus simulated endpoints.

    MW, LogP, HBD, HBA and the Lipinski violation count are computed with
    RDKit. Solubility and toxicity risk are pseudo-random placeholders, seeded
    by the molecule's position in ``smiles_dict`` so they are reproducible.

    Args:
        smiles_dict: mapping of molecule name to SMILES string.

    Returns:
        ``(df, log)``; molecules whose SMILES cannot be parsed are left out of
        ``df`` and reported in ``log``.
    """
    columns = ['Molecule', 'MW', 'LogP', 'HBD', 'HBA',
               'Solubility (logS)', 'Toxicity Risk', 'Lipinski Violations']
    admet_data = []
    log_lines = []

    for i, (name, smiles) in enumerate(smiles_dict.items()):
        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            log_lines.append(f"⚠️ Invalid SMILES string skipped: {smiles}\n")
            continue

        mw = Descriptors.MolWt(mol)
        logp = Descriptors.MolLogP(mol)
        hbd = Descriptors.NumHDonors(mol)
        hba = Descriptors.NumHAcceptors(mol)

        # A private legacy generator reproduces np.random.seed(42 + i) without
        # touching the global RNG; draw order (solubility, then toxicity) matters.
        rng = np.random.RandomState(42 + i)
        solubility = round(rng.uniform(-4, -1), 2)
        toxicity_risk = round(rng.uniform(0.05, 0.4), 3)

        admet_data.append({
            'Molecule': name,
            'MW': round(mw, 2),
            'LogP': round(logp, 2),
            'HBD': hbd,
            'HBA': hba,
            'Solubility (logS)': solubility,
            'Toxicity Risk': toxicity_risk,
            'Lipinski Violations': sum([mw > 500, logp > 5, hbd > 5, hba > 10]),
        })

    # Explicit columns keep the schema when nothing could be parsed.
    df = pd.DataFrame(admet_data, columns=columns)
    log_lines.append(f"✅ Predicted ADMET properties for {len(df)} molecules.\n")
    return df, "".join(log_lines)
|
|
|
|
|
|
|
def _whiten_svg_for_dark_theme(svg: str) -> str:
    """Return *svg* with <text> fills forced to white and every black colour turned white."""
    # Replace an explicit (double-quoted) fill on <text> tags with a white one...
    svg = re.sub(r'<text([^>]*?)\s+fill="[^"]*"([^>]*?)>', r'<text\1\2 fill="white">', svg)
    # ...and give every remaining <text> tag a white fill attribute.
    svg = re.sub(r'<text([^>]*?)(?<!fill="white")>', r'<text\1 fill="white">', svg)
    svg = re.sub(r'fill="white"\s+fill="white"', 'fill="white"', svg)

    # Word-bounded patterns: only genuine black values are rewritten. Plain
    # prefix replacement of "#000" would also corrupt longer hex colours that
    # merely start with it (e.g. #0000FF would become #FFF0FF).
    black_to_white = (
        (r'\bblack\b', 'white'),
        (r'#000000\b', '#FFFFFF'),
        (r'#000\b', '#FFF'),
        (r'rgb\(0,\s*0,\s*0\)', 'rgb(255,255,255)'),
    )
    for pattern, replacement in black_to_white:
        svg = re.sub(pattern, replacement, svg)
    return svg


def _draw_molecule_svg(mol) -> str:
    """Render *mol* as a 400x300 SVG recoloured for a dark background."""
    drawer = Draw.rdMolDraw2D.MolDraw2DSVG(400, 300)
    opts = drawer.drawOptions()
    opts.clearBackground = False
    opts.addStereoAnnotation = True
    opts.baseFontSize = 0.8
    opts.circleAtoms = False
    # NOTE(review): the colour attributes below may not be real MolDrawOptions
    # properties (RDKit documents set*Colour() methods) - confirm. They are
    # kept as in the original; the dark-theme look is enforced by the SVG
    # post-processing afterwards.
    opts.highlightColour = (1, 0.5, 0)
    opts.backgroundColour = (0.11, 0.11, 0.11)
    opts.symbolColour = (1, 1, 1)
    opts.defaultColour = (1, 1, 1)
    try:
        opts.annotationColour = (1, 1, 1)
    except Exception:
        # Deliberately best-effort: the option is optional for rendering.
        pass

    drawer.DrawMolecule(mol)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText().replace('svg:', '')
    return _whiten_svg_for_dark_theme(svg)


def _build_molecule_3d_html(mol):
    """Return py3Dmol HTML for an embedded, MMFF-optimised conformer, or None if embedding fails."""
    mol_3d = Chem.AddHs(mol)
    # EmbedMolecule reports failure through a negative return value; optimising
    # a molecule without a conformer would raise and lose the 2D view as well.
    if AllChem.EmbedMolecule(mol_3d, randomSeed=42) < 0:
        return None
    AllChem.MMFFOptimizeMolecule(mol_3d)
    sdf_data = Chem.MolToMolBlock(mol_3d)

    viewer = py3Dmol.view(width=400, height=300)
    viewer.setBackgroundColor('#1C1C1C')
    viewer.addModel(sdf_data, "sdf")
    viewer.setStyle({'stick': {}, 'sphere': {'scale': 0.25}})
    viewer.zoomTo()
    # NOTE: _make_html() is a private py3Dmol method used to get embeddable markup.
    return viewer._make_html()


def visualize_molecule_2d_3d(smiles: str, name: str):
    """Generates a side-by-side 2D SVG and 3D py3Dmol HTML view for a single molecule.

    Args:
        smiles: SMILES string of the molecule.
        name: display name used in the panel headings and the log.

    Returns:
        ``(html, log)``. ``html`` is always a string: the two-panel card on
        success, or a short ``<p>`` message when the SMILES is invalid or
        rendering fails. When only the 3D embedding fails, the 2D panel is
        still shown and the 3D panel carries a notice. ``log`` is
        newline-terminated.
    """
    from html import escape  # local import: the name is only needed here

    safe_name = escape(str(name))
    try:
        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            return f"<p>Invalid SMILES for {safe_name}</p>", f"❌ Invalid SMILES for {name}\n"

        svg_2d = _draw_molecule_svg(mol)
        svg_2d = f'<div style="background-color: #1C1C1C; padding: 10px; border-radius: 5px;">{svg_2d}</div>'

        html_3d = _build_molecule_3d_html(mol)
        if html_3d is None:
            html_3d = '<p style="color: white;">3D coordinates could not be generated.</p>'
            log = f"⚠️ Could not generate 3D coordinates for {name}; showing the 2D structure only.\n"
        else:
            log = f"✅ Generated 2D/3D view for {name}.\n"

        combined_html = f"""
<div style="display: flex; flex-direction: row; align-items: center; justify-content: space-around; border: 1px solid #444; border-radius: 10px; padding: 10px; margin-bottom: 10px; background-color: #2b2b2b;">
    <div style="text-align: center;">
        <h4 style="color: white; font-family: 'Roboto', sans-serif;">{safe_name} (2D Structure)</h4>
        {svg_2d}
    </div>
    <div style="text-align: center;">
        <h4 style="color: white; font-family: 'Roboto', sans-serif;">{safe_name} (3D Interactive)</h4>
        {html_3d}
    </div>
</div>
"""
        return combined_html, log
    except Exception as e:
        # Best-effort boundary: one broken molecule must not take down the gallery.
        return (
            f"<p>Error visualizing {safe_name}: {escape(str(e))}</p>",
            f"❌ Error visualizing {name}: {e}\n",
        )
|
|
|
def visualize_protein_ligand_interaction(pdb_data: str, pdb_id: str, ligand_resn='G39'):
    """Visualizes a protein-ligand binding site using py3Dmol.

    The protein is drawn as a spectrum-coloured cartoon under a translucent
    surface, the ligand as green sticks/spheres, and everything else within
    4 Å of the ligand as orange sticks. The camera zooms to the ligand.

    Args:
        pdb_data: PDB file content; falsy input short-circuits with a message.
        pdb_id: identifier used only in the log message.
        ligand_resn: residue name of the ligand inside the PDB file.

    Returns:
        ``(html, log)``; ``html`` is ``None`` when no view could be produced.
    """
    if not pdb_data:
        return None, "Cannot generate view: No PDB data provided."

    ligand_sel = {'resn': ligand_resn}
    # The ligand lies within 4 Å of itself, so it is excluded explicitly;
    # otherwise the orange stick style would overwrite its green one.
    neighbours_sel = {'within': {'distance': 4, 'sel': ligand_sel}, 'not': ligand_sel}

    try:
        viewer = py3Dmol.view(width='100%', height=700)
        viewer.setBackgroundColor('#1C1C1C')
        viewer.addModel(pdb_data, "pdb")
        viewer.setStyle({'cartoon': {'color': 'spectrum', 'thickness': 0.8}})
        viewer.addSurface(py3Dmol.VDW, {'opacity': 0.2, 'color': 'lightblue'})
        viewer.addStyle(
            ligand_sel,
            {'stick': {'colorscheme': 'greenCarbon', 'radius': 0.3},
             'sphere': {'scale': 0.4, 'colorscheme': 'greenCarbon'}},
        )
        viewer.addStyle(neighbours_sel, {'stick': {'colorscheme': 'orangeCarbon', 'radius': 0.2}})
        viewer.zoomTo(ligand_sel)
        # NOTE: _make_html() is a private py3Dmol method used to get embeddable markup.
        html = viewer._make_html()
    except Exception as e:
        # Best-effort: report the failure in the log rather than crash the page.
        return None, f"❌ Protein-ligand visualization error: {e}"

    log = (f"✅ Generated protein-ligand interaction view for PDB {pdb_id}.\n"
           f"🟢 Green: Ligand ({ligand_resn})\n"
           f"🟠 Orange: Residues within 4Å of ligand\n")
    return html, log
|
|
|
|
|
def get_phase3_molecules():
    """Return the Phase 3 compounds as a ``{name: SMILES}`` dict (new dict per call)."""
    compounds = [
        ('Oseltamivir', 'CCC(CC)O[C@H]1[C@H]([C@@H]([C@H](C=C1C(=O)OCC)N)N)NC(=O)C'),
        ('Aspirin', 'CC(=O)OC1=CC=CC=C1C(=O)O'),
        ('Remdesivir', 'CCC(CC)COC(=O)[C@@H](C)N[P@](=O)(OC[C@@H]1O[C@](C#N)([C@H]([C@@H]1O)O)C2=CC=C3N2N=CN=C3N)OC4=CC=CC=C4'),
        ('Penicillin G', 'CC1([C@@H](N2[C@H](S1)[C@@H](C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C'),
    ]
    return dict(compounds)
|
|
|
def calculate_comprehensive_properties(smiles_dict: dict):
    """Compute an extended RDKit property table with a Lipinski verdict per compound.

    Args:
        smiles_dict: mapping of compound name to SMILES string.

    Returns:
        ``(df, log)``. ``df`` has the columns Compound, Molecular_Weight, LogP,
        HBD, HBA, TPSA, Rotatable_Bonds, Aromatic_Rings, Lipinski_Violations
        and Drug_Like (a Yes/No label; Yes means at most one violation), with
        numeric values rounded to two decimals. Compounds whose SMILES cannot
        be parsed are left out and reported in ``log``.
    """
    columns = ['Compound', 'Molecular_Weight', 'LogP', 'HBD', 'HBA', 'TPSA',
               'Rotatable_Bonds', 'Aromatic_Rings', 'Lipinski_Violations', 'Drug_Like']
    analysis = []
    log_lines = []

    for name, smiles in smiles_dict.items():
        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            log_lines.append(f"⚠️ Invalid SMILES string skipped: {smiles}\n")
            continue

        mw = Descriptors.MolWt(mol)
        logp = Descriptors.MolLogP(mol)
        hbd = Descriptors.NumHDonors(mol)
        hba = Descriptors.NumHAcceptors(mol)
        violations = sum([mw > 500, logp > 5, hbd > 5, hba > 10])

        analysis.append({
            'Compound': name,
            'Molecular_Weight': mw,
            'LogP': logp,
            'HBD': hbd,
            'HBA': hba,
            'TPSA': Descriptors.TPSA(mol),
            'Rotatable_Bonds': Descriptors.NumRotatableBonds(mol),
            'Aromatic_Rings': Descriptors.NumAromaticRings(mol),
            'Lipinski_Violations': violations,
            'Drug_Like': '✅ Yes' if violations <= 1 else '❌ No',
        })

    # Explicit columns keep the schema when nothing could be parsed.
    df = pd.DataFrame(analysis, columns=columns).round(2)
    log_lines.append(f"✅ Calculated comprehensive properties for {len(df)} compounds.\n")
    return df, "".join(log_lines)
|
|
|
def predict_toxicity(properties_df: pd.DataFrame):
    """Score compounds with a random forest trained on synthetic data (simulation only).

    On every call a 500-row synthetic training set is generated with a fixed
    seed. A row is labelled toxic when a score built from "MW > 550",
    "|LogP| > 4.5" and a small random term exceeds 0.5. A 50-tree forest is
    fitted on it and applied to ``properties_df``.

    Args:
        properties_df: table with the columns Compound, Molecular_Weight, LogP,
            TPSA, Rotatable_Bonds and Aromatic_Rings.

    Returns:
        ``(results_df, log)``. ``results_df`` has the columns Compound,
        Toxicity_Probability (3 decimals) and Predicted_Risk (LOW below 0.3,
        MODERATE below 0.7, otherwise HIGH). An empty input yields an empty
        frame.
    """
    if properties_df.empty:
        return pd.DataFrame(), "Cannot predict toxicity: No properties data.\n"

    # A private legacy generator reproduces the np.random.seed(42) stream
    # without reseeding the global RNG; the draw order below must not change.
    rng = np.random.RandomState(42)
    n_compounds = 500
    training_data = pd.DataFrame({
        'molecular_weight': rng.normal(400, 100, n_compounds),
        'logp': rng.normal(2.5, 1.5, n_compounds),
        'tpsa': rng.normal(80, 30, n_compounds),
        'rotatable_bonds': rng.randint(0, 15, n_compounds),
        'aromatic_rings': rng.randint(0, 5, n_compounds),
    })
    toxicity_score = (
        (training_data['molecular_weight'] > 550) * 0.4
        + (abs(training_data['logp']) > 4.5) * 0.4
        + rng.random_sample(n_compounds) * 0.2
    )
    training_data['toxic'] = (toxicity_score > 0.5).astype(int)

    features = ['molecular_weight', 'logp', 'tpsa', 'rotatable_bonds', 'aromatic_rings']
    rf_model = RandomForestClassifier(n_estimators=50, random_state=42)
    rf_model.fit(training_data[features], training_data['toxic'])

    # Rename into a new frame so the feature names match the training data
    # without assigning to .columns on a selection of the caller's frame.
    source_columns = ['Molecular_Weight', 'LogP', 'TPSA', 'Rotatable_Bonds', 'Aromatic_Rings']
    X_pred = properties_df[source_columns].rename(columns=dict(zip(source_columns, features)))
    toxicity_prob = rf_model.predict_proba(X_pred)[:, 1]

    results_df = properties_df[['Compound']].copy()
    results_df['Toxicity_Probability'] = np.round(toxicity_prob, 3)
    results_df['Predicted_Risk'] = [
        "🟢 LOW" if p < 0.3 else "🟡 MODERATE" if p < 0.7 else "🔴 HIGH"
        for p in toxicity_prob
    ]
    return results_df, "✅ Predicted toxicity using a pre-trained simulation model.\n"
|
|
|
|
|
def get_regulatory_summary():
    """Return the static AI/ML documentation summary shown in Phase 4.

    Returns:
        ``(df, log)`` where ``df`` has the columns Component and Description
        (four rows) and ``log`` is a newline-terminated status message, so it
        can be concatenated with other log entries.
    """
    rows = [
        ('Data Governance',
         'Data sourced from ChEMBL, PDB, GISAID. Bias assessed via geographic distribution analysis.'),
        ('Model Architecture',
         'Graph Convolutional Network (Target ID), Random Forest (ADMET), K-Means (Patient Stratification).'),
        ('Model Validation',
         'ADMET Model validated with AUC-ROC > 0.85 on an independent test set.'),
        ('Interpretability',
         'SHAP used for patient stratification model outputs.'),
    ]
    summary_df = pd.DataFrame(rows, columns=['Component', 'Description'])
    return summary_df, "✅ Generated AI/ML documentation summary.\n"
|
|
|
def simulate_rwd_analysis(adverse_event_text):
    """Combine 100 simulated adverse-event reports with user-entered ones and chart them.

    Args:
        adverse_event_text: comma-separated event names entered by the user;
            entries are trimmed and lower-cased, blanks are dropped. ``None``
            is treated as no input.

    Returns:
        ``(counts_df, fig, log)``: a table with the columns Event and Count
        (most frequent first), a horizontal bar chart of the same data and a
        newline-terminated status message.

    Side effects:
        Switches matplotlib to the 'dark_background' style for the process.
    """
    # A private legacy generator reproduces the np.random.seed(42) stream
    # without reseeding the global RNG.
    rng = np.random.RandomState(42)
    base_events = list(rng.choice(
        ['headache', 'nausea', 'fatigue', 'dizziness', 'rash', 'fever'],
        100,
        p=[0.25, 0.2, 0.15, 0.15, 0.15, 0.1],
    ))
    user_events = [
        event.strip().lower()
        for event in (adverse_event_text or "").split(',')
        if event.strip()
    ]
    all_events = base_events + user_events
    event_counts = pd.Series(all_events).value_counts()
    log = f"✅ Analyzed {len(all_events)} simulated adverse event reports.\n"

    plt.style.use('dark_background')
    fig_bar, ax_bar = plt.subplots(figsize=(10, 6))
    # Transparent backgrounds so the page background shows through.
    fig_bar.patch.set_facecolor('none')
    ax_bar.set_facecolor('none')

    sns.barplot(x=event_counts.values, y=event_counts.index, palette='viridis', ax=ax_bar, orient='h')
    ax_bar.set_title('Simulated Adverse Event Frequencies')
    ax_bar.set_xlabel('Number of Reports')
    ax_bar.set_ylabel('Adverse Event')
    plt.tight_layout()

    # Name both columns explicitly: the default column names produced by
    # value_counts().reset_index() differ between pandas versions.
    counts_df = event_counts.rename_axis('Event').reset_index(name='Count')
    return counts_df, fig_bar, log
|
|
|
def get_ethical_framework():
    """Return the static four-pillar ethical framework table shown in Phase 4.

    Returns:
        ``(df, log)`` where ``df`` has the columns Pillar and Description
        (four rows) and ``log`` is a newline-terminated status message, so it
        can be concatenated with other log entries.
    """
    rows = [
        ('1. Beneficence & Non-Maleficence',
         'AI should help patients and do no harm. Requires rigorous validation and safety monitoring.'),
        ('2. Justice & Fairness',
         'AI must not create or worsen health disparities. Requires bias detection and mitigation.'),
        ('3. Transparency & Explainability',
         'Clinical decisions influenced by AI must be understandable. Requires interpretable models.'),
        ('4. Accountability & Governance',
         'Clear lines of responsibility for AI systems must be established. Requires human oversight.'),
    ]
    framework_df = pd.DataFrame(rows, columns=['Pillar', 'Description'])
    return framework_df, "✅ Generated ethical framework summary.\n"
|
|
|
|
|
|
|
|
|
# ---- Page header --------------------------------------------------------
st.title("🔬 AI-Powered Drug Discovery Pipeline")
st.markdown("""
Welcome to the AI Drug Discovery Pipeline Demonstrator. This application integrates the four major phases of drug development,
showcasing how AI and computational tools can accelerate the process from target identification to post-market surveillance.
Navigate through the tabs below to explore each phase.
""")

# ---- Session state ------------------------------------------------------
# Each phase keeps a log string and a results dict in session state so its
# output survives the script reruns triggered by widget interaction.
for _phase in range(1, 5):
    _log_key = f"log_p{_phase}"
    _results_key = f"results_p{_phase}"
    if _log_key not in st.session_state:
        st.session_state[_log_key] = f"Phase {_phase} logs will appear here."
    if _results_key not in st.session_state:
        st.session_state[_results_key] = {}

# ---- Top-level navigation -----------------------------------------------
tab1, tab2, tab3, tab4 = st.tabs([
    "Phase 1: Discovery & Target ID",
    "Phase 2: Lead Generation & Optimization",
    "Phase 3: Preclinical Development",
    "Phase 4: Implementation & Post-Market"
])
|
|
|
|
|
# ---- Phase 1: target identification and initial molecule analysis -------
with tab1:
    st.header("🧬 Step 1: Target Identification and Initial Analysis")
    st.markdown("Fetch protein data from public databases and perform a high-level analysis of potential drug-like molecules.")

    with st.form(key="phase1_form"):
        st.subheader("Analysis Controls")
        col1, col2 = st.columns(2)
        with col1:
            pdb_id_input = st.text_input("Enter PDB ID", value="3B7E", key="p1_pdb")
            protein_id_input = st.text_input("Enter Protein ID (for FASTA)", value="ACF54602.1", key="p1_protein")
        with col2:
            smiles_input_p1 = st.text_area("Enter SMILES strings (one per line)", value="\n".join(create_sample_molecules()), height=150, key="p1_smiles")

        # NOTE(review): the button emoji was unrecoverable from the
        # mis-encoded source; the rocket is a best guess - confirm.
        run_phase1_btn = st.form_submit_button("🚀 Run Phase 1 Analysis", use_container_width=True)

    if run_phase1_btn:
        log_parts = ["--- Starting Phase 1 Analysis ---"]

        pdb_data, log_pdb_fetch = fetch_pdb_structure(pdb_id_input)
        log_parts.append(log_pdb_fetch)

        fasta_log = fetch_fasta_sequence(protein_id_input)
        log_parts.append(fasta_log)

        protein_view_html, log_3d_viz = visualize_protein_3d(pdb_data, pdb_id_input)
        log_parts.append(log_3d_viz)

        smiles_list = [s.strip() for s in smiles_input_p1.split('\n') if s.strip()]
        props_df, log_props = calculate_molecular_properties(smiles_list)
        log_parts.append(log_props)

        analysis_df, display_df, log_lipinski = assess_drug_likeness(props_df)
        log_parts.append(log_lipinski)

        props_plot, log_plot = plot_properties_dashboard(analysis_df)
        log_parts.append(log_plot)

        # Some helpers end their message with a newline and some do not;
        # normalising here keeps every entry on its own line.
        full_log = "\n".join(part.rstrip("\n") for part in log_parts)
        full_log += "\n\n--- Phase 1 Analysis Complete ---"
        st.session_state.log_p1 = full_log

        lipinski_cols = ['Molecule', 'MW', 'LogP', 'HBD', 'HBA', 'Lipinski_Violations', 'Drug_Like']
        lipinski_subset_df = display_df[lipinski_cols] if not display_df.empty else pd.DataFrame(columns=lipinski_cols)

        st.session_state.results_p1 = {
            'protein_view_html': protein_view_html,
            'fasta_log': fasta_log,
            'lipinski_subset_df': lipinski_subset_df,
            'props_df': props_df,
            'props_plot': props_plot
        }

    st.text_area("Status & Logs", st.session_state.log_p1, height=200, key="log_p1_area")

    if st.session_state.results_p1:
        res1 = st.session_state.results_p1
        p1_tabs = st.tabs(["Protein Information", "Molecule Analysis", "Analysis Plots"])

        with p1_tabs[0]:
            st.subheader("Protein 3D Structure (Interactive)")
            if res1.get('protein_view_html'):
                st.components.v1.html(res1['protein_view_html'], height=600, scrolling=False)
            else:
                st.warning("No 3D structure available. Please check the logs for more details.")
            st.subheader("FASTA Sequence Information")
            # A non-empty but hidden label avoids Streamlit's empty-label
            # warning while keeping the layout unchanged.
            st.text_area(
                "FASTA Sequence Information",
                res1.get('fasta_log', 'No data'),
                height=200,
                key="fasta_info_area",
                label_visibility="collapsed",
            )

        with p1_tabs[1]:
            st.subheader("Drug-Likeness Assessment (Lipinski's Rule of Five)")
            st.dataframe(res1.get('lipinski_subset_df', pd.DataFrame()), use_container_width=True, hide_index=True)
            st.subheader("Calculated Molecular Properties")
            st.dataframe(res1.get('props_df', pd.DataFrame()), use_container_width=True, hide_index=True)

        with p1_tabs[2]:
            st.subheader("Molecular Properties Dashboard")
            if res1.get('props_plot'):
                st.pyplot(res1['props_plot'])
            else:
                st.warning("Could not generate plots. Please check the logs for more details.")
|
|
|
|
|
# ---- Phase 2: virtual screening, ADMET and visualisation ----------------
with tab2:
    # NOTE(review): the header and button emoji were unrecoverable from the
    # mis-encoded source; the pill and rocket are best guesses - confirm.
    st.header("💊 Step 2: Virtual Screening and ADMET Prediction")
    st.markdown("Screen candidate molecules against the target, predict their ADMET properties, and visualize the top candidates.")

    with st.form(key="phase2_form"):
        st.subheader("Analysis Controls")
        col1, col2 = st.columns(2)
        with col1:
            phase2_pdb_id_input = st.text_input("Enter PDB ID for Interaction View", value="3B7E", key="p2_pdb")
        with col2:
            phase2_ligand_resn = st.text_input("Ligand Residue Name (in PDB)", value="G39", key="p2_ligand")
        run_phase2_btn = st.form_submit_button("🚀 Run Phase 2 Analysis", use_container_width=True)

    if run_phase2_btn:
        log_parts = ["--- Starting Phase 2 Analysis ---"]
        smiles_dict = get_phase2_molecules()

        screening_df, log_screening = simulate_virtual_screening(smiles_dict)
        log_parts.append(log_screening)

        admet_df, log_admet = predict_admet_properties(smiles_dict)
        log_parts.append(log_admet)

        # One HTML card per molecule, concatenated into a single component body.
        html_blocks = []
        for name, smiles in smiles_dict.items():
            html_block, log_mol_viz = visualize_molecule_2d_3d(smiles, name)
            html_blocks.append(html_block)
            log_parts.append(log_mol_viz)
        combined_viz_html = "".join(html_blocks)

        pdb_data, log_pdb_fetch_2 = fetch_pdb_structure(phase2_pdb_id_input)
        log_parts.append(log_pdb_fetch_2)

        interaction_html, log_interaction = visualize_protein_ligand_interaction(pdb_data, phase2_pdb_id_input, phase2_ligand_resn)
        log_parts.append(log_interaction)

        # Some helpers end their message with a newline and some do not;
        # normalising here keeps every entry on its own line.
        full_log = "\n".join(part.rstrip("\n") for part in log_parts)
        full_log += "\n\n--- Phase 2 Analysis Complete ---"
        st.session_state.log_p2 = full_log

        st.session_state.results_p2 = {
            'screening_df': screening_df,
            'admet_df': admet_df,
            'combined_viz_html': combined_viz_html,
            'interaction_html': interaction_html
        }

    st.text_area("Status & Logs", st.session_state.log_p2, height=200, key="log_p2_area")

    if st.session_state.results_p2:
        res2 = st.session_state.results_p2
        p2_tabs = st.tabs(["Virtual Screening & ADMET", "Molecule Visualization (2D & 3D)", "Protein-Ligand Interaction"])

        with p2_tabs[0]:
            col1, col2 = st.columns(2)
            with col1:
                st.subheader("Virtual Screening Results (Simulated)")
                st.dataframe(res2.get('screening_df', pd.DataFrame()), use_container_width=True, hide_index=True)
            with col2:
                st.subheader("ADMET Properties Prediction")
                st.dataframe(res2.get('admet_df', pd.DataFrame()), use_container_width=True, hide_index=True)

        with p2_tabs[1]:
            st.subheader("Interactive 2D and 3D views of candidate molecules.")
            if res2.get('combined_viz_html'):
                st.components.v1.html(res2.get('combined_viz_html'), height=700, scrolling=True)
            else:
                st.warning("No molecule views available. Please check the logs for more details.")

        with p2_tabs[2]:
            st.subheader("Detailed view of the top candidate binding to the protein.")
            if res2.get('interaction_html'):
                st.components.v1.html(res2.get('interaction_html'), height=700, scrolling=False)
            else:
                st.warning("No interaction view available. Please check the logs for more details.")
|
|
|
|
|
|
|
# ---- Phase 3: in-depth candidate analysis and simulated toxicity --------
with tab3:
    st.header("🧪 Step 3: In-Depth Candidate Analysis and Toxicity Prediction")
    st.markdown("Perform a comprehensive analysis of the most promising lead compounds and use a simulated AI model to predict toxicity risk.")

    with st.form(key="phase3_form"):
        st.subheader("Analysis Controls")
        # NOTE(review): the button emoji was unrecoverable from the
        # mis-encoded source; the rocket is a best guess - confirm.
        run_phase3_btn = st.form_submit_button("🚀 Run Phase 3 Analysis", use_container_width=True)

    if run_phase3_btn:
        log_parts = ["--- Starting Phase 3 Analysis ---"]
        smiles_dict = get_phase3_molecules()

        comp_props_df, log_comp_props = calculate_comprehensive_properties(smiles_dict)
        log_parts.append(log_comp_props)

        tox_df, log_tox = predict_toxicity(comp_props_df)
        log_parts.append(log_tox)

        # One HTML card per compound, concatenated into a single component body.
        html_blocks = []
        for name, smiles in smiles_dict.items():
            html_block, log_mol_viz_p3 = visualize_molecule_2d_3d(smiles, name)
            html_blocks.append(html_block)
            log_parts.append(log_mol_viz_p3)
        combined_viz_html = "".join(html_blocks)

        # Some helpers end their message with a newline and some do not;
        # normalising here keeps every entry on its own line.
        full_log = "\n".join(part.rstrip("\n") for part in log_parts)
        full_log += "\n\n--- Phase 3 Analysis Complete ---"
        st.session_state.log_p3 = full_log

        st.session_state.results_p3 = {
            'comp_props_df': comp_props_df,
            'tox_df': tox_df,
            'combined_viz_html': combined_viz_html
        }

    st.text_area("Status & Logs", st.session_state.log_p3, height=200, key="log_p3_area")

    if st.session_state.results_p3:
        res3 = st.session_state.results_p3
        p3_tabs = st.tabs(["Comprehensive Properties & Toxicity", "Molecule Visualization (3D Gallery)"])

        with p3_tabs[0]:
            st.subheader("Comprehensive Molecular Properties & AI-Powered Toxicity Prediction (Simulated)")
            col1, col2 = st.columns(2)
            with col1:
                st.dataframe(res3.get('comp_props_df', pd.DataFrame()), use_container_width=True, hide_index=True)
            with col2:
                st.dataframe(res3.get('tox_df', pd.DataFrame()), use_container_width=True, hide_index=True)

        with p3_tabs[1]:
            st.subheader("Interactive 3D gallery of the compounds under analysis.")
            if res3.get('combined_viz_html'):
                st.components.v1.html(res3.get('combined_viz_html'), height=1000, scrolling=True)
            else:
                st.warning("No molecule views available. Please check the logs for more details.")
|
|
|
|
|
|
|
# ---- Phase 4: regulatory summary and simulated pharmacovigilance --------
with tab4:
    # NOTE(review): the header and button emoji were unrecoverable from the
    # mis-encoded source; the chart and rocket are best guesses - confirm.
    st.header("📈 Step 4: Regulatory Submission and Pharmacovigilance")
    st.markdown("Explore summaries of the documentation needed for regulatory approval and simulate how AI can monitor real-world data for adverse events.")

    with st.form(key="phase4_form"):
        st.subheader("Analysis Controls")
        rwd_input = st.text_area("Enter new adverse events (comma-separated)", value="severe allergic reaction, joint pain, severe allergic reaction", height=100, key="p4_rwd")
        run_phase4_btn = st.form_submit_button("🚀 Run Phase 4 Analysis", use_container_width=True)

    if run_phase4_btn:
        log_parts = ["--- Starting Phase 4 Analysis ---"]

        reg_df, log_reg = get_regulatory_summary()
        log_parts.append(log_reg)

        eth_df, log_eth = get_ethical_framework()
        log_parts.append(log_eth)

        rwd_df, plot_bar, log_rwd = simulate_rwd_analysis(rwd_input)
        log_parts.append(log_rwd)

        # Some helpers end their message with a newline and some do not;
        # normalising here keeps every entry on its own line.
        full_log = "\n".join(part.rstrip("\n") for part in log_parts)
        full_log += "\n\n--- Phase 4 Analysis Complete ---"
        st.session_state.log_p4 = full_log

        st.session_state.results_p4 = {
            'rwd_df': rwd_df,
            'plot_bar': plot_bar,
            'reg_df': reg_df,
            'eth_df': eth_df
        }

    st.text_area("Status & Logs", st.session_state.log_p4, height=200, key="log_p4_area")

    if st.session_state.results_p4:
        res4 = st.session_state.results_p4
        p4_tabs = st.tabs(["Pharmacovigilance Analysis", "Regulatory & Ethical Frameworks"])

        with p4_tabs[0]:
            st.subheader("Simulated Adverse Event Analysis")
            if res4.get('plot_bar'):
                st.pyplot(res4['plot_bar'])
            st.dataframe(res4.get('rwd_df', pd.DataFrame()), use_container_width=True, hide_index=True)

        with p4_tabs[1]:
            col1, col2 = st.columns(2)
            with col1:
                st.subheader("AI/ML Documentation Summary for Submission")
                st.dataframe(res4.get('reg_df', pd.DataFrame()), use_container_width=True, hide_index=True)
            with col2:
                st.subheader("Ethical Framework for AI in Healthcare")
                st.dataframe(res4.get('eth_df', pd.DataFrame()), use_container_width=True, hide_index=True)
|
|