alidenewade committed on
Commit
ed20d10
·
verified ·
1 Parent(s): 1818804

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +361 -462
app.py CHANGED
@@ -1,12 +1,8 @@
1
- # app.py
2
  # AI-Powered Drug Discovery Pipeline Streamlit Application
3
  # This script integrates four phases of drug discovery into a single, interactive web app.
4
-
5
  import streamlit as st
6
  import pandas as pd
7
  import numpy as np
8
- import matplotlib.pyplot as plt
9
- import seaborn as sns
10
  import requests
11
  import io
12
  import re
@@ -41,16 +37,15 @@ st.set_page_config(
41
  page_title="AI Drug Discovery Pipeline",
42
  page_icon="🔬",
43
  layout="wide",
44
- initial_sidebar_state="collapsed", # Sidebar is removed, but this is good practice
45
  )
46
 
47
- # Custom CSS for a professional, minimalist look
48
  def apply_custom_styling():
49
  st.markdown(
50
  """
51
  <style>
52
  @import url('https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap');
53
-
54
  html, body, [class*="st-"] {
55
  font-family: 'Roboto', sans-serif;
56
  }
@@ -92,6 +87,11 @@ def apply_custom_styling():
92
  color: #FFF;
93
  background-color: #00A0FF;
94
  }
 
 
 
 
 
95
  </style>
96
  """,
97
  unsafe_allow_html=True
@@ -189,7 +189,7 @@ def calculate_molecular_properties(smiles_dict: dict):
189
  mol = Chem.MolFromSmiles(smiles)
190
  if mol:
191
  props = {
192
- 'Molecule': name, # Use the provided name
193
  'SMILES': smiles,
194
  'MW': Descriptors.MolWt(mol),
195
  'LogP': Descriptors.MolLogP(mol),
@@ -214,7 +214,6 @@ def assess_drug_likeness(df: pd.DataFrame):
214
  if df.empty:
215
  return pd.DataFrame(), pd.DataFrame(), "Cannot assess drug-likeness: No properties data."
216
 
217
- # Create a copy for analysis to avoid modifying the original dataframe
218
  analysis_df = df.copy()
219
  analysis_df['MW_OK'] = analysis_df['MW'] <= 500
220
  analysis_df['LogP_OK'] = analysis_df['LogP'] <= 5
@@ -222,17 +221,14 @@ def assess_drug_likeness(df: pd.DataFrame):
222
  analysis_df['HBA_OK'] = analysis_df['HBA'] <= 10
223
  analysis_df['Lipinski_Violations'] = (~analysis_df[['MW_OK', 'LogP_OK', 'HBD_OK', 'HBA_OK']]).sum(axis=1)
224
 
225
- # This boolean column is for the plotting function
226
  analysis_df['Drug_Like'] = analysis_df['Lipinski_Violations'] <= 1
227
 
228
- # Create a separate DataFrame for display purposes with emojis
229
  display_df = df.copy()
230
  display_df['Lipinski_Violations'] = analysis_df['Lipinski_Violations']
231
  display_df['Drug_Like'] = analysis_df['Drug_Like'].apply(lambda x: '✅ Yes' if x else '❌ No')
232
 
233
  log = "✅ Assessed drug-likeness using Lipinski's Rule of Five.\n"
234
 
235
- # Return both the analysis_df (for plotting) and display_df (for tables)
236
  return analysis_df, display_df, log
237
 
238
 
@@ -245,36 +241,22 @@ def plot_properties_dashboard(df: pd.DataFrame):
245
  if df['Drug_Like'].dtype != bool:
246
  return None, f"Cannot plot: 'Drug_Like' column must be boolean, but it is {df['Drug_Like'].dtype}."
247
 
248
- # Prepare data
249
  df['Category'] = df['Drug_Like'].apply(lambda x: 'Drug-Like' if x else 'Non-Drug-Like')
250
  source = ColumnDataSource(df)
251
 
252
- # Professional color palette
253
- colors = ['#00D4AA', '#FF6B6B'] # Teal for drug-like, coral for non-drug-like
254
  color_mapper = factor_cmap('Category', palette=colors, factors=["Drug-Like", "Non-Drug-Like"])
255
 
256
- # Enhanced hover tooltip for scatter plots
257
  scatter_hover = HoverTool(tooltips=[
258
- ("Compound", "@Molecule"),
259
- ("MW", "@MW{0.0} Da"),
260
- ("LogP", "@LogP{0.00}"),
261
- ("HBD", "@HBD"),
262
- ("HBA", "@HBA"),
263
- ("TPSA", "@TPSA{0.0} Γ…Β²"),
264
- ("Category", "@Category")
265
  ])
266
 
267
- # Common plot configuration - responsive plots with a 1:1 aspect ratio
268
  plot_config = {
269
- 'sizing_mode': 'scale_width',
270
- 'aspect_ratio': 1, # Enforce a square aspect ratio for the data area
271
- 'background_fill_color': None,
272
- 'border_fill_color': None,
273
- 'outline_line_color': '#333333',
274
- 'min_border_left': 50,
275
- 'min_border_right': 50,
276
- 'min_border_top': 50,
277
- 'min_border_bottom': 50
278
  }
279
 
280
  def style_plot(p, x_label, y_label, title):
@@ -300,146 +282,61 @@ def plot_properties_dashboard(df: pd.DataFrame):
300
  p.legend.background_fill_alpha = 0.8
301
  p.legend.border_line_color = '#444444'
302
  p.legend.label_text_color = '#FFFFFF'
303
- p.legend.label_text_font_size = '10pt'
304
  p.legend.click_policy = "mute"
305
- p.legend.glyph_height = 15
306
- p.legend.spacing = 5
307
-
308
  return p
309
 
310
- # Plot 1: MW vs LogP with Lipinski guidelines
311
  p1 = figure(title="Molecular Weight vs LogP", tools=[scatter_hover, 'pan,wheel_zoom,box_zoom,reset,save'], **plot_config)
312
  p1.scatter('MW', 'LogP', source=source, legend_group='Category',
313
  color=color_mapper, size=12, alpha=0.8, line_color='white', line_width=0.5)
314
-
315
- # Add Lipinski rule lines
316
- p1.line([500, 500], [df['LogP'].min()-0.5, df['LogP'].max()+0.5],
317
- line_dash="dashed", line_color="#FFD700", line_width=2, alpha=0.7, legend_label="MW ≀ 500")
318
- p1.line([df['MW'].min()-50, df['MW'].max()+50], [5, 5],
319
- line_dash="dashed", line_color="#FFD700", line_width=2, alpha=0.7, legend_label="LogP ≀ 5")
320
-
321
  style_plot(p1, "Molecular Weight (Da)", "LogP", "Lipinski Rule: MW vs LogP")
322
 
323
- # Plot 2: HBD vs HBA
324
  p2 = figure(title="Hydrogen Bonding Profile", tools=[scatter_hover, 'pan,wheel_zoom,box_zoom,reset,save'], **plot_config)
325
- p2.scatter('HBD', 'HBA', source=source, legend_group='Category',
326
- color=color_mapper, size=12, alpha=0.8, line_color='white', line_width=0.5)
327
-
328
- # Add Lipinski rule lines
329
- p2.line([5, 5], [df['HBA'].min()-1, df['HBA'].max()+1],
330
- line_dash="dashed", line_color="#FFD700", line_width=2, alpha=0.7, legend_label="HBD ≀ 5")
331
- p2.line([df['HBD'].min()-1, df['HBD'].max()+1], [10, 10],
332
- line_dash="dashed", line_color="#FFD700", line_width=2, alpha=0.7, legend_label="HBA ≀ 10")
333
-
334
  style_plot(p2, "Hydrogen Bond Donors", "Hydrogen Bond Acceptors", "Lipinski Rule: Hydrogen Bonding")
335
 
336
- # Plot 3: TPSA vs Rotatable Bonds with guidelines
337
  p3 = figure(title="Molecular Flexibility & Polarity", tools=[scatter_hover, 'pan,wheel_zoom,box_zoom,reset,save'], **plot_config)
338
- p3.scatter('TPSA', 'RotBonds', source=source, legend_group='Category',
339
- color=color_mapper, size=12, alpha=0.8, line_color='white', line_width=0.5)
340
-
341
- # Add permeability guideline lines
342
- p3.line([140, 140], [df['RotBonds'].min()-1, df['RotBonds'].max()+1],
343
- line_dash="dashed", line_color="#FFD700", line_width=2, alpha=0.7, legend_label="TPSA ≀ 140")
344
- p3.line([df['TPSA'].min()-10, df['TPSA'].max()+10], [10, 10],
345
- line_dash="dashed", line_color="#FFD700", line_width=2, alpha=0.7, legend_label="RotBonds ≀ 10")
346
-
347
  style_plot(p3, "Topological Polar Surface Area (Å²)", "Rotatable Bonds", "Drug Permeability Indicators")
348
-
349
- # Plot 4: Enhanced Donut Chart
350
- # --- MODIFICATION ---
351
- # Configure donut plot separately as it doesn't need all scatter tools
352
  p4_config = plot_config.copy()
353
- p4_config['tools'] = "hover" # Only need hover for the donut
354
  p4_config.update({'x_range': (-1.0, 1.0), 'y_range': (-1.0, 1.0)})
355
  p4 = figure(title="Drug-Likeness Distribution", **p4_config)
356
-
357
- # Calculate percentages and create donut chart
358
  counts = df['Category'].value_counts()
359
- total = counts.sum()
360
- data = pd.DataFrame({
361
- 'category': counts.index,
362
- 'value': counts.values,
363
- 'percentage': (counts.values / total * 100), # Keep full precision for hover
364
- 'angle': counts.values / total * 2 * pi,
365
- 'color': [colors[0] if cat == 'Drug-Like' else colors[1] for cat in counts.index]
366
- })
367
-
368
- # Calculate start and end angles for each wedge
369
- data['start_angle'] = 0
370
- data['end_angle'] = 0
371
- cumulative_angle = 0
372
- for i in range(len(data)):
373
- data.iloc[i, data.columns.get_loc('start_angle')] = cumulative_angle
374
- cumulative_angle += data.iloc[i]['angle']
375
- data.iloc[i, data.columns.get_loc('end_angle')] = cumulative_angle
376
-
377
- donut_source = ColumnDataSource(data)
378
-
379
- # Create donut using annular wedges (outer ring) - sized to fit within boundaries
380
  wedge_renderer = p4.annular_wedge(x=0, y=0, inner_radius=0.25, outer_radius=0.45,
381
- start_angle='start_angle', end_angle='end_angle',
382
  line_color="white", line_width=3, fill_color='color',
383
- legend_field='category', source=donut_source)
384
-
385
- # Add percentage text to each slice
386
- for i, row in data.iterrows():
387
- # Calculate middle angle for text positioning
388
- mid_angle = (row['start_angle'] + row['end_angle']) / 2
389
- # Position text at middle radius of the annular wedge
390
- text_radius = 0.35
391
- x_pos = text_radius * cos(mid_angle)
392
- y_pos = text_radius * sin(mid_angle)
393
-
394
- p4.text([x_pos], [y_pos], text=[f"{row['percentage']:.1f}%"],
395
- text_align="center", text_baseline="middle",
396
- text_color="white", text_font_size="11pt", text_font_style="bold")
397
-
398
- # Add center text
399
- p4.text([0], [0], text=[f"{len(df)}\nCompounds"],
400
- text_align="center", text_baseline="middle",
401
- text_color="white", text_font_size="14pt", text_font_style="bold")
402
-
403
- # --- MODIFICATION ---
404
- # Custom hover for donut with detailed info
405
- donut_hover = HoverTool(
406
- tooltips=[
407
- ("Category", "@category"),
408
- ("Count", "@value"),
409
- ("Percentage", "@percentage{0.1f}%")
410
- ],
411
- renderers=[wedge_renderer] # Attach hover tool specifically to the wedge glyph
412
- )
413
  p4.add_tools(donut_hover)
414
-
415
  style_plot(p4, "", "", "Compound Classification")
416
  p4.axis.visible = False
417
  p4.grid.visible = False
418
 
419
- # Create responsive grid layout
420
- grid = gridplot([[p1, p2], [p3, p4]], sizing_mode='scale_width',
421
- toolbar_location='right', merge_tools=True)
422
-
423
  return grid, "βœ… Generated enhanced molecular properties dashboard."
424
 
425
  # ===== Phase 2 Functions =====
426
  def get_phase2_molecules():
427
  """Returns an expanded list of common drugs with corrected SMILES."""
428
  return {
429
- 'Paracetamol': 'CC(=O)Nc1ccc(O)cc1',
430
- 'Ibuprofen': 'CC(C)Cc1ccc(C(C)C(=O)O)cc1',
431
- 'Aspirin': 'CC(=O)Oc1ccccc1C(=O)O',
432
- 'Naproxen': 'C[C@H](C(=O)O)c1ccc2cc(OC)ccc2c1',
433
- 'Diazepam': 'CN1C(=O)CN=C(c2ccccc2)c2cc(Cl)ccc12',
434
- 'Metformin': 'CN(C)C(=N)N=C(N)N',
435
- 'Loratadine': 'CCOC(=O)N1CCC(C(c2ccc(Cl)cc2)c2ccccn2)CC1',
436
- 'Morphine': 'C[N@]1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@H](O)C=C[C@H]3[C@H]1C5',
437
- 'Cetirizine': 'O=C(O)COCCOc1ccc(cc1)C(c1ccccc1)N1CCN(CC1)CCO',
438
- 'Fluoxetine': 'CNCCC(c1ccccc1)Oc1ccc(C(F)(F)F)cc1',
439
- 'Amoxicillin': 'C[C@@]1([C@H](N2[C@H](S1)[C@@H](C2=O)NC(=O)[C@@H](N)c3ccc(O)cc3)C(=O)O)C',
440
- 'Atorvastatin': 'CC(C)c1c(C(=O)Nc2ccccc2)c(-c2ccccc2)c(c1)c1ccc(F)cc1',
441
- 'Ciprofloxacin': 'O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc12',
442
- 'Warfarin': 'O=C(c1ccccc1)C(c1oc2ccccc2c1=O)C',
443
  'Furosemide': 'O=C(O)c1cc(Cl)c(NC2CO2)c(c1)S(=O)(=O)N',
444
  }
445
 
@@ -468,8 +365,6 @@ def predict_admet_properties(smiles_dict: dict):
468
  log += f"βœ… Predicted ADMET properties for {len(df)} molecules.\n"
469
  return df, log
470
 
471
- # --- MODIFIED FUNCTION ---
472
- # This is the updated function to correctly render 2D molecules on a dark background.
473
  def visualize_molecule_2d_3d(smiles: str, name: str):
474
  """Generates a side-by-side 2D SVG and 3D py3Dmol HTML view for a single molecule."""
475
  log = ""
@@ -575,27 +470,41 @@ def visualize_molecule_2d_3d(smiles: str, name: str):
575
  return combined_html, log
576
  except Exception as e:
577
  return f"<p>Error visualizing {name}: {e}</p>", f"❌ Error visualizing {name}: {e}"
578
-
579
- def visualize_protein_ligand_interaction(pdb_data: str, pdb_id: str, ligand_resn='G39'):
580
- """Visualizes a protein-ligand binding site using py3Dmol."""
581
- if not pdb_data: return None, "Cannot generate view: No PDB data provided."
 
 
 
 
582
  try:
583
- viewer = py3Dmol.view(width='100%', height=700)
584
  viewer.setBackgroundColor('#1C1C1C')
 
 
585
  viewer.addModel(pdb_data, "pdb")
586
- viewer.setStyle({'cartoon': {'color': 'spectrum', 'thickness': 0.8}})
587
- viewer.addSurface(py3Dmol.VDW, {'opacity': 0.2, 'color': 'lightblue'})
588
- viewer.addStyle({'resn': ligand_resn}, {'stick': {'colorscheme': 'greenCarbon', 'radius': 0.3}, 'sphere': {'scale': 0.4, 'colorscheme': 'greenCarbon'}})
589
- viewer.addStyle({'within': {'distance': 4, 'sel': {'resn': ligand_resn}}}, {'stick': {'colorscheme': 'orangeCarbon', 'radius': 0.2}})
590
- viewer.zoomTo({'resn': ligand_resn})
 
 
 
 
 
 
 
 
 
591
  html = viewer._make_html()
592
- log = (f"βœ… Generated protein-ligand interaction view for PDB {pdb_id}.\n"
593
- f"🟒 Green: Ligand ({ligand_resn})\n"
594
- f"🟠 Orange: Residues within 4Γ… of ligand\n")
595
  return html, log
 
596
  except Exception as e:
597
- return None, f"❌ Protein-ligand visualization error: {e}"
598
-
599
  # ===== Phase 3 Functions =====
600
  def get_phase3_molecules():
601
  return {
@@ -654,368 +563,358 @@ def get_regulatory_summary():
654
  return pd.DataFrame(summary), "βœ… Generated AI/ML documentation summary."
655
 
656
  def simulate_rwd_analysis(adverse_event_text):
 
 
 
657
  np.random.seed(42)
658
- base_events = list(np.random.choice(['headache', 'nausea', 'fatigue', 'dizziness', 'rash', 'fever'], 100, p=[0.25, 0.2, 0.15, 0.15, 0.15, 0.1]))
659
- user_events = [e.strip().lower() for e in adverse_event_text.split(',') if e.strip()]
660
- all_events = base_events + user_events
661
- event_counts = pd.Series(all_events).value_counts()
662
- log = f"βœ… Analyzed {len(all_events)} simulated adverse event reports.\n"
 
 
663
 
664
- plt.style.use('dark_background')
665
- fig_bar, ax_bar = plt.subplots(figsize=(10, 6))
666
 
667
- fig_bar.patch.set_facecolor('none')
668
- ax_bar.set_facecolor('none')
669
 
670
- sns.barplot(x=event_counts.values, y=event_counts.index, palette='viridis', ax=ax_bar, orient='h')
671
- ax_bar.set_title('Simulated Adverse Event Frequencies')
672
- ax_bar.set_xlabel('Number of Reports')
673
- ax_bar.set_ylabel('Adverse Event')
674
 
675
- plt.tight_layout()
676
 
677
- return event_counts.reset_index().rename(columns={'index': 'Event', 0: 'Count'}), fig_bar, log
 
 
678
 
679
- def get_ethical_framework():
680
- framework = {'Pillar': ['1. Beneficence & Non-Maleficence', '2. Justice & Fairness', '3. Transparency & Explainability', '4. Accountability & Governance'],
681
- 'Description': ['AI should help patients and do no harm. Requires rigorous validation and safety monitoring.',
682
- 'AI must not create or worsen health disparities. Requires bias detection and mitigation.',
683
- 'Clinical decisions influenced by AI must be understandable. Requires interpretable models.',
684
- 'Clear lines of responsibility for AI systems must be established. Requires human oversight.']}
685
- return pd.DataFrame(framework), "βœ… Generated ethical framework summary."
686
 
 
 
 
 
 
687
 
688
- # --- 3. Streamlit Interface Definition ---
 
 
 
689
 
690
- st.title("πŸ”¬ AI-Powered Drug Discovery Pipeline")
691
- st.markdown("""
692
- Welcome to the AI Drug Discovery Pipeline Demonstrator. This application integrates the four major phases of drug development,
693
- showcasing how AI and computational tools can accelerate the process from target identification to post-market surveillance.
694
- Navigate through the tabs below to explore each phase.
695
- """)
696
-
697
- # Initialize session state for logs and results
698
- if 'log_p1' not in st.session_state: st.session_state.log_p1 = "Phase 1 logs will appear here."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
699
  if 'results_p1' not in st.session_state: st.session_state.results_p1 = {}
700
- if 'log_p2' not in st.session_state: st.session_state.log_p2 = "Phase 2 logs will appear here."
701
  if 'results_p2' not in st.session_state: st.session_state.results_p2 = {}
702
- if 'log_p3' not in st.session_state: st.session_state.log_p3 = "Phase 3 logs will appear here."
703
  if 'results_p3' not in st.session_state: st.session_state.results_p3 = {}
704
- if 'log_p4' not in st.session_state: st.session_state.log_p4 = "Phase 4 logs will appear here."
705
  if 'results_p4' not in st.session_state: st.session_state.results_p4 = {}
706
 
 
 
 
 
 
707
  tab1, tab2, tab3, tab4 = st.tabs([
708
- "Phase 1: Discovery & Target ID",
709
- "Phase 2: Lead Generation & Optimization",
710
- "Phase 3: Preclinical Development",
711
- "Phase 4: Implementation & Post-Market"
712
  ])
713
 
714
- # ===== TAB 1: DISCOVERY & TARGET IDENTIFICATION =====
715
  with tab1:
716
- st.header("🧬 Step 1: Target Identification and Initial Analysis")
717
- st.markdown("Fetch protein data from public databases and perform a high-level analysis of potential drug-like molecules.")
718
-
719
- with st.form(key="phase1_form"):
720
- st.subheader("Analysis Controls")
721
- col1, col2 = st.columns(2)
722
- with col1:
723
- pdb_id_input = st.text_input("Enter PDB ID", value="3B7E", key="p1_pdb")
724
- protein_id_input = st.text_input("Enter Protein ID (for FASTA)", value="ACF54602.1", key="p1_protein")
725
- with col2:
726
- default_molecules_p1 = create_sample_molecules()
727
- default_molecules_text_p1 = "\n".join([f"{name}:{smiles}" for name, smiles in default_molecules_p1.items()])
728
- molecules_input_p1 = st.text_area(
729
- "Molecules (Name:SMILES, one per line)",
730
- value=default_molecules_text_p1,
731
- height=150,
732
- key="p1_molecules"
733
- )
734
-
735
- run_phase1_btn = st.form_submit_button("πŸš€ Run Phase 1 Analysis", use_container_width=True)
736
-
737
- if run_phase1_btn:
738
- full_log = "--- Starting Phase 1 Analysis ---\n"
739
-
740
- # Parse molecules from the text area
741
- smiles_dict_p1 = {}
742
- if molecules_input_p1.strip():
743
- try:
744
- for line in molecules_input_p1.strip().split('\n'):
745
- cleaned_line = line.replace('\xa0', ' ').strip()
746
- if ':' in cleaned_line:
747
- name, smiles = cleaned_line.split(':', 1)
748
- smiles_dict_p1[name.strip()] = smiles.strip()
749
- if smiles_dict_p1:
750
- full_log += f"βœ… Successfully parsed {len(smiles_dict_p1)} molecules from input.\n"
751
- else:
752
- full_log += "⚠️ Could not parse any molecules. Please check the format (e.g., 'Aspirin:CC...').\n"
753
- except Exception as e:
754
- full_log += f"❌ Error parsing molecules list: {e}\n"
755
- smiles_dict_p1 = {}
756
- else:
757
- full_log += "⚠️ Molecule input is empty. No analysis to perform.\n"
758
-
759
- if smiles_dict_p1:
760
- pdb_data, log_pdb_fetch = fetch_pdb_structure(pdb_id_input)
761
- full_log += log_pdb_fetch
762
- fasta_log = fetch_fasta_sequence(protein_id_input)
763
- full_log += fasta_log
764
- protein_view_html, log_3d_viz = visualize_protein_3d(pdb_data, pdb_id_input)
765
- full_log += log_3d_viz
766
 
767
- props_df, log_props = calculate_molecular_properties(smiles_dict_p1)
 
768
  full_log += log_props
769
 
770
- analysis_df, display_df, log_lipinski = assess_drug_likeness(props_df)
771
- full_log += log_lipinski
772
 
773
- props_plot, log_plot = plot_properties_dashboard(analysis_df) # This now calls the Bokeh function
774
- full_log += log_plot
 
 
 
 
 
 
775
 
776
- lipinski_cols = ['Molecule', 'MW', 'LogP', 'HBD', 'HBA', 'Lipinski_Violations', 'Drug_Like']
777
- lipinski_subset_df = display_df[lipinski_cols] if not display_df.empty else pd.DataFrame(columns=lipinski_cols)
778
-
779
  st.session_state.results_p1 = {
780
- 'protein_view_html': protein_view_html,
781
- 'fasta_log': fasta_log,
782
- 'lipinski_subset_df': lipinski_subset_df,
783
- 'props_df': props_df,
784
- 'props_plot': props_plot
785
  }
786
- else:
787
- st.session_state.results_p1 = {}
788
-
789
- full_log += "\n--- Phase 1 Analysis Complete ---"
790
- st.session_state.log_p1 = full_log
791
 
792
  st.text_area("Status & Logs", st.session_state.log_p1, height=200, key="log_p1_area")
793
-
794
- if st.session_state.results_p1:
 
 
 
795
  res1 = st.session_state.results_p1
796
- p1_tabs = st.tabs(["Analysis Plots", "Molecule Analysis", "Protein Information"])
 
797
  with p1_tabs[0]:
798
- st.subheader("Molecular Properties Dashboard")
799
- if res1.get('props_plot'):
800
- # Use st.bokeh_chart for Bokeh figures
801
- st.bokeh_chart(res1['props_plot'], use_container_width=True)
802
  else:
803
- st.warning("Could not generate plots. Please check the logs for more details.")
 
804
  with p1_tabs[1]:
805
- st.subheader("Drug-Likeness Assessment (Lipinski's Rule of Five)")
806
- st.dataframe(res1.get('lipinski_subset_df', pd.DataFrame()), use_container_width=True, hide_index=True)
807
- st.subheader("Calculated Molecular Properties")
808
- st.dataframe(res1.get('props_df', pd.DataFrame()), use_container_width=True, hide_index=True)
809
- with p1_tabs[2]:
810
- st.subheader("Protein 3D Structure (Interactive)")
811
- if res1.get('protein_view_html'):
812
- st.components.v1.html(res1['protein_view_html'], height=600, scrolling=False)
813
- st.subheader("FASTA Sequence Information")
814
- st.text_area("", res1.get('fasta_log', 'No data'), height=200, key="fasta_info_area")
815
-
816
- # ===== TAB 2: LEAD GENERATION & OPTIMIZATION =====
817
  with tab2:
818
- st.header("πŸ’Š Step 2: Virtual Screening and ADMET Prediction")
819
- st.markdown("Screen candidate molecules against the target, predict their ADMET properties, and visualize the top candidates.")
820
-
821
- with st.form(key="phase2_form"):
822
- st.subheader("Analysis Controls")
823
- col1, col2 = st.columns(2)
824
- with col1:
825
- phase2_pdb_id_input = st.text_input("Enter PDB ID for Interaction View", value="3B7E", key="p2_pdb")
826
- phase2_ligand_resn = st.text_input("Ligand Residue Name (in PDB)", value="G39", key="p2_ligand")
827
- with col2:
828
- default_molecules_dict = get_phase2_molecules()
829
- default_molecules_text = "\n".join([f"{name}:{smiles}" for name, smiles in default_molecules_dict.items()])
 
 
 
 
 
 
830
 
831
- molecules_input = st.text_area(
832
- "Molecules (Name:SMILES, one per line)",
833
- value=default_molecules_text,
834
- height=250,
835
- key="p2_molecules"
836
- )
837
-
838
- run_phase2_btn = st.form_submit_button("πŸš€ Run Phase 2 Analysis", use_container_width=True)
839
-
840
- if run_phase2_btn:
841
- full_log = "--- Starting Phase 2 Analysis ---\n"
842
-
843
- smiles_dict = {}
844
- if molecules_input.strip():
845
- try:
846
- for line in molecules_input.strip().split('\n'):
847
- cleaned_line = line.replace('\xa0', ' ').strip()
848
- if ':' in cleaned_line:
849
- name, smiles = cleaned_line.split(':', 1)
850
- smiles_dict[name.strip()] = smiles.strip()
851
- if smiles_dict:
852
- full_log += f"βœ… Successfully parsed {len(smiles_dict)} molecules from input.\n"
853
- else:
854
- full_log += "⚠️ Could not parse any molecules. Please check the format (e.g., 'Aspirin:CC(=O)OC1=CC=CC=C1C(=O)O').\n"
855
- except Exception as e:
856
- full_log += f"❌ Error parsing molecules list: {e}\n"
857
- smiles_dict = {}
858
- else:
859
- full_log += "⚠️ Molecule input is empty. No analysis to perform.\n"
860
-
861
- if smiles_dict:
862
- screening_df, log_screening = simulate_virtual_screening(smiles_dict)
863
- full_log += log_screening
864
- admet_df, log_admet = predict_admet_properties(smiles_dict)
865
  full_log += log_admet
866
 
867
- combined_viz_html = ""
868
- log_viz = ""
869
- for name, smiles in smiles_dict.items():
870
- html_block, log_mol_viz = visualize_molecule_2d_3d(smiles, name)
871
- combined_viz_html += html_block
872
- log_viz += log_mol_viz
873
- full_log += log_viz
874
-
875
- pdb_data, log_pdb_fetch_2 = fetch_pdb_structure(phase2_pdb_id_input)
876
- full_log += log_pdb_fetch_2
877
- interaction_html, log_interaction = visualize_protein_ligand_interaction(pdb_data, phase2_pdb_id_input, phase2_ligand_resn)
878
- full_log += log_interaction
879
 
 
 
880
  st.session_state.results_p2 = {
881
- 'screening_df': screening_df,
882
- 'admet_df': admet_df,
883
- 'combined_viz_html': combined_viz_html,
884
- 'interaction_html': interaction_html,
885
- 'molecules_used': smiles_dict
886
  }
887
- else:
888
- st.session_state.results_p2 = {}
889
-
890
- full_log += "\n--- Phase 2 Analysis Complete ---"
891
- st.session_state.log_p2 = full_log
892
-
893
  st.text_area("Status & Logs", st.session_state.log_p2, height=200, key="log_p2_area")
894
 
895
- if st.session_state.results_p2:
 
 
 
896
  res2 = st.session_state.results_p2
897
- p2_tabs = st.tabs(["Virtual Screening & ADMET", "Molecule Visualization (2D & 3D)", "Protein-Ligand Interaction"])
 
898
  with p2_tabs[0]:
899
- col1, col2 = st.columns(2)
900
- with col1:
901
- st.subheader("Virtual Screening Results (Simulated)")
902
- st.dataframe(res2.get('screening_df', pd.DataFrame()), use_container_width=True, hide_index=True)
903
- with col2:
904
- st.subheader("ADMET Properties Prediction")
905
- st.dataframe(res2.get('admet_df', pd.DataFrame()), use_container_width=True, hide_index=True)
906
  with p2_tabs[1]:
907
- molecules_used = res2.get('molecules_used', {})
908
- if molecules_used:
909
- st.subheader(f"Interactive 2D and 3D views of {len(molecules_used)} candidate molecules")
910
- st.info(f"Currently visualizing: {', '.join(molecules_used.keys())}")
911
  else:
912
- st.subheader("Interactive 2D and 3D views of candidate molecules")
913
-
914
- if res2.get('combined_viz_html'):
915
- st.components.v1.html(res2.get('combined_viz_html'), height=len(molecules_used) * 400 + 100, scrolling=True)
916
- else:
917
- st.warning("No molecule visualizations available. Please run the analysis first.")
918
- with p2_tabs[2]:
919
- st.subheader("Detailed view of the top candidate binding to the protein.")
920
- if res2.get('interaction_html'):
921
- st.components.v1.html(res2.get('interaction_html'), height=700, scrolling=False)
922
- else:
923
- st.warning("No protein-ligand interaction view available. Please run the analysis first.")
924
 
925
- # ===== TAB 3: PRECLINICAL DEVELOPMENT =====
926
  with tab3:
927
- st.header("πŸ§ͺ Step 3: In-Depth Candidate Analysis and Toxicity Prediction")
928
- st.markdown("Perform a comprehensive analysis of the most promising lead compounds and use a simulated AI model to predict toxicity risk.")
929
-
930
- with st.form(key="phase3_form"):
931
- st.subheader("Analysis Controls")
932
- run_phase3_btn = st.form_submit_button("πŸš€ Run Phase 3 Analysis", use_container_width=True)
933
-
934
- if run_phase3_btn:
935
- full_log = "--- Starting Phase 3 Analysis ---\n"
936
- smiles_dict = get_phase3_molecules()
937
- comp_props_df, log_comp_props = calculate_comprehensive_properties(smiles_dict)
938
- full_log += log_comp_props
939
- tox_df, log_tox = predict_toxicity(comp_props_df)
940
- full_log += log_tox
941
- combined_viz_html = ""
942
- log_viz_p3 = ""
943
- for name, smiles in smiles_dict.items():
944
- html_block, log_mol_viz_p3 = visualize_molecule_2d_3d(smiles, name)
945
- combined_viz_html += html_block
946
- log_viz_p3 += log_mol_viz_p3
947
- full_log += log_viz_p3
948
- full_log += "\n--- Phase 3 Analysis Complete ---"
949
- st.session_state.log_p3 = full_log
950
- st.session_state.results_p3 = {
951
- 'comp_props_df': comp_props_df,
952
- 'tox_df': tox_df,
953
- 'combined_viz_html': combined_viz_html
954
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
955
 
956
  st.text_area("Status & Logs", st.session_state.log_p3, height=200, key="log_p3_area")
957
 
958
- if st.session_state.results_p3:
 
 
 
959
  res3 = st.session_state.results_p3
960
- p3_tabs = st.tabs(["Comprehensive Properties & Toxicity", "Molecule Visualization (3D Gallery)"])
961
- with p3_tabs[0]:
962
- st.subheader("Comprehensive Molecular Properties & AI-Powered Toxicity Prediction (Simulated)")
963
- col1, col2 = st.columns(2)
964
- with col1:
965
- st.dataframe(res3.get('comp_props_df', pd.DataFrame()), use_container_width=True, hide_index=True)
966
- with col2:
967
- st.dataframe(res3.get('tox_df', pd.DataFrame()), use_container_width=True, hide_index=True)
968
- with p3_tabs[1]:
969
- st.subheader("Interactive 3D gallery of the compounds under analysis.")
970
- if res3.get('combined_viz_html'):
971
- st.components.v1.html(res3.get('combined_viz_html'), height=1000, scrolling=True)
972
-
973
-
974
- # ===== TAB 4: POST-MARKET SURVEILLANCE =====
975
- with tab4:
976
- st.header("πŸ“ˆ Step 4: Regulatory Submission and Pharmacovigilance")
977
- st.markdown("Explore summaries of the documentation needed for regulatory approval and simulate how AI can monitor real-world data for adverse events.")
978
-
979
- with st.form(key="phase4_form"):
980
- st.subheader("Analysis Controls")
981
- rwd_input = st.text_area("Enter new adverse events (comma-separated)", value="severe allergic reaction, joint pain, severe allergic reaction", height=100, key="p4_rwd")
982
- run_phase4_btn = st.form_submit_button("πŸš€ Run Phase 4 Analysis", use_container_width=True)
983
-
984
- if run_phase4_btn:
985
- full_log = "--- Starting Phase 4 Analysis ---\n"
986
- reg_df, log_reg = get_regulatory_summary()
987
- full_log += log_reg
988
- eth_df, log_eth = get_ethical_framework()
989
- full_log += log_eth
990
-
991
- rwd_df, plot_bar, log_rwd = simulate_rwd_analysis(rwd_input)
992
- full_log += log_rwd
993
- full_log += "\n--- Phase 4 Analysis Complete ---"
994
- st.session_state.log_p4 = full_log
995
 
996
- st.session_state.results_p4 = {
997
- 'rwd_df': rwd_df,
998
- 'plot_bar': plot_bar,
999
- 'reg_df': reg_df,
1000
- 'eth_df': eth_df
1001
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1002
 
1003
  st.text_area("Status & Logs", st.session_state.log_p4, height=200, key="log_p4_area")
1004
 
1005
- if st.session_state.results_p4:
 
 
 
1006
  res4 = st.session_state.results_p4
1007
  p4_tabs = st.tabs(["Pharmacovigilance Analysis", "Regulatory & Ethical Frameworks"])
 
1008
  with p4_tabs[0]:
1009
  st.subheader("Simulated Adverse Event Analysis")
1010
  if res4.get('plot_bar'):
1011
- st.pyplot(res4['plot_bar'])
1012
  st.dataframe(res4.get('rwd_df', pd.DataFrame()), use_container_width=True, hide_index=True)
1013
 
1014
  with p4_tabs[1]:
1015
- col1, col2 = st.columns(2)
1016
- with col1:
1017
- st.subheader("AI/ML Documentation Summary for Submission")
1018
- st.dataframe(res4.get('reg_df', pd.DataFrame()), use_container_width=True, hide_index=True)
1019
- with col2:
1020
- st.subheader("Ethical Framework for AI in Healthcare")
1021
- st.dataframe(res4.get('eth_df', pd.DataFrame()), use_container_width=True, hide_index=True)
 
 
1
  # AI-Powered Drug Discovery Pipeline Streamlit Application
2
  # This script integrates four phases of drug discovery into a single, interactive web app.
 
3
  import streamlit as st
4
  import pandas as pd
5
  import numpy as np
 
 
6
  import requests
7
  import io
8
  import re
 
37
  page_title="AI Drug Discovery Pipeline",
38
  page_icon="🔬",
39
  layout="wide",
40
+ initial_sidebar_state="collapsed",
41
  )
42
 
43
+ # Custom CSS for a professional, dark theme
44
  def apply_custom_styling():
45
  st.markdown(
46
  """
47
  <style>
48
  @import url('https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap');
 
49
  html, body, [class*="st-"] {
50
  font-family: 'Roboto', sans-serif;
51
  }
 
87
  color: #FFF;
88
  background-color: #00A0FF;
89
  }
90
+
91
+ /* Ensure headers are white */
92
+ h1, h2, h3, h4, h5, h6 {
93
+ color: white !important;
94
+ }
95
  </style>
96
  """,
97
  unsafe_allow_html=True
 
189
  mol = Chem.MolFromSmiles(smiles)
190
  if mol:
191
  props = {
192
+ 'Molecule': name,
193
  'SMILES': smiles,
194
  'MW': Descriptors.MolWt(mol),
195
  'LogP': Descriptors.MolLogP(mol),
 
214
  if df.empty:
215
  return pd.DataFrame(), pd.DataFrame(), "Cannot assess drug-likeness: No properties data."
216
 
 
217
  analysis_df = df.copy()
218
  analysis_df['MW_OK'] = analysis_df['MW'] <= 500
219
  analysis_df['LogP_OK'] = analysis_df['LogP'] <= 5
 
221
  analysis_df['HBA_OK'] = analysis_df['HBA'] <= 10
222
  analysis_df['Lipinski_Violations'] = (~analysis_df[['MW_OK', 'LogP_OK', 'HBD_OK', 'HBA_OK']]).sum(axis=1)
223
 
 
224
  analysis_df['Drug_Like'] = analysis_df['Lipinski_Violations'] <= 1
225
 
 
226
  display_df = df.copy()
227
  display_df['Lipinski_Violations'] = analysis_df['Lipinski_Violations']
228
  display_df['Drug_Like'] = analysis_df['Drug_Like'].apply(lambda x: 'βœ… Yes' if x else '❌ No')
229
 
230
  log = "βœ… Assessed drug-likeness using Lipinski's Rule of Five.\n"
231
 
 
232
  return analysis_df, display_df, log
233
 
234
 
 
241
  if df['Drug_Like'].dtype != bool:
242
  return None, f"Cannot plot: 'Drug_Like' column must be boolean, but it is {df['Drug_Like'].dtype}."
243
 
 
244
  df['Category'] = df['Drug_Like'].apply(lambda x: 'Drug-Like' if x else 'Non-Drug-Like')
245
  source = ColumnDataSource(df)
246
 
247
+ colors = ['#00D4AA', '#FF6B6B']
 
248
  color_mapper = factor_cmap('Category', palette=colors, factors=["Drug-Like", "Non-Drug-Like"])
249
 
 
250
  scatter_hover = HoverTool(tooltips=[
251
+ ("Compound", "@Molecule"), ("MW", "@MW{0.0} Da"), ("LogP", "@LogP{0.00}"),
252
+ ("HBD", "@HBD"), ("HBA", "@HBA"), ("TPSA", "@TPSA{0.0} Γ…Β²"), ("Category", "@Category")
 
 
 
 
 
253
  ])
254
 
 
255
  plot_config = {
256
+ 'sizing_mode': 'scale_width', 'aspect_ratio': 1,
257
+ 'background_fill_color': None, 'border_fill_color': None,
258
+ 'outline_line_color': '#333333', 'min_border_left': 50,
259
+ 'min_border_right': 50, 'min_border_top': 50, 'min_border_bottom': 50
 
 
 
 
 
260
  }
261
 
262
  def style_plot(p, x_label, y_label, title):
 
282
  p.legend.background_fill_alpha = 0.8
283
  p.legend.border_line_color = '#444444'
284
  p.legend.label_text_color = '#FFFFFF'
 
285
  p.legend.click_policy = "mute"
 
 
 
286
  return p
287
 
 
288
  p1 = figure(title="Molecular Weight vs LogP", tools=[scatter_hover, 'pan,wheel_zoom,box_zoom,reset,save'], **plot_config)
289
  p1.scatter('MW', 'LogP', source=source, legend_group='Category',
290
  color=color_mapper, size=12, alpha=0.8, line_color='white', line_width=0.5)
291
+ p1.line([500, 500], [df['LogP'].min()-0.5, df['LogP'].max()+0.5], line_dash="dashed", line_color="#FFD700", line_width=2, alpha=0.7, legend_label="MW ≀ 500")
292
+ p1.line([df['MW'].min()-50, df['MW'].max()+50], [5, 5], line_dash="dashed", line_color="#FFD700", line_width=2, alpha=0.7, legend_label="LogP ≀ 5")
 
 
 
 
 
293
  style_plot(p1, "Molecular Weight (Da)", "LogP", "Lipinski Rule: MW vs LogP")
294
 
 
295
  p2 = figure(title="Hydrogen Bonding Profile", tools=[scatter_hover, 'pan,wheel_zoom,box_zoom,reset,save'], **plot_config)
296
+ p2.scatter('HBD', 'HBA', source=source, legend_group='Category', color=color_mapper, size=12, alpha=0.8, line_color='white', line_width=0.5)
297
+ p2.line([5, 5], [df['HBA'].min()-1, df['HBA'].max()+1], line_dash="dashed", line_color="#FFD700", line_width=2, alpha=0.7, legend_label="HBD ≀ 5")
298
+ p2.line([df['HBD'].min()-1, df['HBD'].max()+1], [10, 10], line_dash="dashed", line_color="#FFD700", line_width=2, alpha=0.7, legend_label="HBA ≀ 10")
 
 
 
 
 
 
299
  style_plot(p2, "Hydrogen Bond Donors", "Hydrogen Bond Acceptors", "Lipinski Rule: Hydrogen Bonding")
300
 
 
301
  p3 = figure(title="Molecular Flexibility & Polarity", tools=[scatter_hover, 'pan,wheel_zoom,box_zoom,reset,save'], **plot_config)
302
+ p3.scatter('TPSA', 'RotBonds', source=source, legend_group='Category', color=color_mapper, size=12, alpha=0.8, line_color='white', line_width=0.5)
303
+ p3.line([140, 140], [df['RotBonds'].min()-1, df['RotBonds'].max()+1], line_dash="dashed", line_color="#FFD700", line_width=2, alpha=0.7, legend_label="TPSA ≀ 140")
304
+ p3.line([df['TPSA'].min()-10, df['TPSA'].max()+10], [10, 10], line_dash="dashed", line_color="#FFD700", line_width=2, alpha=0.7, legend_label="RotBonds ≀ 10")
 
 
 
 
 
 
305
  style_plot(p3, "Topological Polar Surface Area (Γ…Β²)", "Rotatable Bonds", "Drug Permeability Indicators")
306
+
 
 
 
307
  p4_config = plot_config.copy()
308
+ p4_config['tools'] = "hover"
309
  p4_config.update({'x_range': (-1.0, 1.0), 'y_range': (-1.0, 1.0)})
310
  p4 = figure(title="Drug-Likeness Distribution", **p4_config)
 
 
311
  counts = df['Category'].value_counts()
312
+ data = pd.DataFrame({'category': counts.index, 'value': counts.values})
313
+ data['angle'] = data['value']/data['value'].sum() * 2*pi
314
+ data['color'] = [colors[0] if cat == 'Drug-Like' else colors[1] for cat in counts.index]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
  wedge_renderer = p4.annular_wedge(x=0, y=0, inner_radius=0.25, outer_radius=0.45,
316
+ start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
317
  line_color="white", line_width=3, fill_color='color',
318
+ legend_field='category', source=data)
319
+ p4.text([0], [0], text=[f"{len(df)}\nCompounds"], text_align="center", text_baseline="middle", text_color="white", text_font_size="14pt", text_font_style="bold")
320
+ donut_hover = HoverTool(tooltips=[("Category", "@category"), ("Count", "@value")], renderers=[wedge_renderer])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
  p4.add_tools(donut_hover)
 
322
  style_plot(p4, "", "", "Compound Classification")
323
  p4.axis.visible = False
324
  p4.grid.visible = False
325
 
326
+ grid = gridplot([[p1, p2], [p3, p4]], sizing_mode='scale_width', toolbar_location='right', merge_tools=True)
 
 
 
327
  return grid, "βœ… Generated enhanced molecular properties dashboard."
328
 
329
  # ===== Phase 2 Functions =====
330
def get_phase2_molecules():
    """Return the Phase 2 screening library as a ``{name: SMILES}`` dict.

    A fresh dict is built on every call, so callers may mutate the result
    without affecting later calls. Keys appear in the order listed below.
    """
    # NOTE(review): SMILES strings are reproduced verbatim from the original
    # table; they have not been re-validated against a reference database.
    library = (
        ('Paracetamol', 'CC(=O)Nc1ccc(O)cc1'),
        ('Ibuprofen', 'CC(C)Cc1ccc(C(C)C(=O)O)cc1'),
        ('Aspirin', 'CC(=O)Oc1ccccc1C(=O)O'),
        ('Naproxen', 'C[C@H](C(=O)O)c1ccc2cc(OC)ccc2c1'),
        ('Diazepam', 'CN1C(=O)CN=C(c2ccccc2)c2cc(Cl)ccc12'),
        ('Metformin', 'CN(C)C(=N)N=C(N)N'),
        ('Loratadine', 'CCOC(=O)N1CCC(C(c2ccc(Cl)cc2)c2ccccn2)CC1'),
        ('Morphine', 'C[N@]1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@H](O)C=C[C@H]3[C@H]1C5'),
        ('Cetirizine', 'O=C(O)COCCOc1ccc(cc1)C(c1ccccc1)N1CCN(CC1)CCO'),
        ('Fluoxetine', 'CNCCC(c1ccccc1)Oc1ccc(C(F)(F)F)cc1'),
        ('Amoxicillin', 'C[C@@]1([C@H](N2[C@H](S1)[C@@H](C2=O)NC(=O)[C@@H](N)c3ccc(O)cc3)C(=O)O)C'),
        ('Atorvastatin', 'CC(C)c1c(C(=O)Nc2ccccc2)c(-c2ccccc2)c(c1)c1ccc(F)cc1'),
        ('Ciprofloxacin', 'O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc12'),
        ('Warfarin', 'O=C(c1ccccc1)C(c1oc2ccccc2c1=O)C'),
        ('Furosemide', 'O=C(O)c1cc(Cl)c(NC2CO2)c(c1)S(=O)(=O)N'),
    )
    return dict(library)
342
 
 
365
  log += f"βœ… Predicted ADMET properties for {len(df)} molecules.\n"
366
  return df, log
367
 
 
 
368
  def visualize_molecule_2d_3d(smiles: str, name: str):
369
  """Generates a side-by-side 2D SVG and 3D py3Dmol HTML view for a single molecule."""
370
  log = ""
 
470
  return combined_html, log
471
  except Exception as e:
472
  return f"<p>Error visualizing {name}: {e}</p>", f"❌ Error visualizing {name}: {e}"
473
+
474
def visualize_protein_ligand_interaction(pdb_data: str, pdb_id: str, ligand_resn: str):
    """Build a py3Dmol view of a protein with an optional highlighted ligand.

    Args:
        pdb_data: PDB-format text of the structure to display.
        pdb_id: Identifier of the structure; used only in the log message.
        ligand_resn: Residue name of the ligand to highlight. May be empty or
            None, in which case only the protein cartoon is rendered.

    Returns:
        A ``(html, log)`` tuple. ``html`` is the viewer markup, or None when
        no PDB data was supplied or rendering failed; ``log`` is a status line.
    """
    if not pdb_data:
        return None, "Cannot generate interaction view: No PDB data provided."

    # Free-text input can carry stray whitespace, which would make the
    # residue-name selection silently match nothing.
    ligand = (ligand_resn or "").strip()

    try:
        viewer = py3Dmol.view(width='100%', height=650)
        viewer.setBackgroundColor('#1C1C1C')

        # Add the protein structure and draw it as a translucent cartoon.
        viewer.addModel(pdb_data, "pdb")
        viewer.setStyle({'cartoon': {'color': 'lightblue', 'opacity': 0.8}})

        if ligand:
            # Ligand as sticks plus small spheres, wrapped in a faint VDW
            # surface, then centre the camera on it.
            viewer.addStyle({'resn': ligand}, {'stick': {'colorscheme': 'greenCarbon', 'radius': 0.2}})
            viewer.addStyle({'resn': ligand}, {'sphere': {'scale': 0.3, 'colorscheme': 'greenCarbon'}})
            viewer.addSurface(py3Dmol.VDW, {'opacity': 0.2, 'color': 'white'}, {'resn': ligand})
            viewer.zoomTo({'resn': ligand})
            log = f"✅ Generated protein-ligand interaction view for {pdb_id} with ligand {ligand}."
        else:
            # No ligand requested: frame the whole structure and say so.
            viewer.zoomTo({})
            log = f"✅ Generated protein view for {pdb_id} (no ligand specified)."

        # NOTE(review): _make_html is a private py3Dmol method; kept because
        # the rest of the app embeds viewers through st.components.v1.html.
        html = viewer._make_html()
        return html, log

    except Exception as e:
        # UI boundary: report the failure in the log instead of crashing the page.
        return None, f"❌ Interaction visualization error: {e}"
507
+
508
  # ===== Phase 3 Functions =====
509
  def get_phase3_molecules():
510
  return {
 
563
  return pd.DataFrame(summary), "βœ… Generated AI/ML documentation summary."
564
 
565
def simulate_rwd_analysis(adverse_event_text):
    """Tally simulated adverse-event reports and chart the most frequent ones.

    Args:
        adverse_event_text: Free text whose alphabetic words of three or more
            letters are added to the simulated reports. None or an empty
            string contributes no extra reports.

    Returns:
        A ``(results_df, plot, log)`` tuple: a DataFrame with columns
        ``Adverse_Event`` and ``Frequency`` (at most 10 rows), a Bokeh
        horizontal bar chart of that frame, and a status line.
    """
    results_df, total_reports = _tally_adverse_events(adverse_event_text)
    log = f"✅ Analyzed {total_reports} total event reports. Identified {len(results_df)} unique adverse events for plotting.\n"
    return results_df, _adverse_event_bar_chart(results_df), log


def _tally_adverse_events(adverse_event_text):
    """Return the top-10 event frequency table and the total report count."""
    # A private RandomState gives the same draw as seeding the global RNG
    # with 42, without resetting NumPy's global random state on every call.
    rng = np.random.RandomState(42)
    base_events = list(rng.choice(
        ['headache', 'nausea', 'fatigue', 'dizziness', 'rash', 'fever'],
        100,
        p=[0.25, 0.2, 0.15, 0.15, 0.1, 0.15]
    ))

    # Treat a missing report text as "no extra reports" rather than crashing.
    text = adverse_event_text or ""
    # NOTE(review): every word of 3+ letters counts as an event term, so filler
    # words in the report text can appear in the chart; confirm this is intended.
    user_terms = [word.lower() for word in re.findall(r'\b[a-zA-Z]{3,}\b', text)]

    all_events = base_events + user_terms

    events_df = pd.DataFrame(all_events, columns=['Adverse_Event'])
    event_counts = events_df['Adverse_Event'].value_counts().nlargest(10).sort_values(ascending=False)

    results_df = event_counts.reset_index()
    results_df.columns = ['Adverse_Event', 'Frequency']
    return results_df, len(all_events)


def _adverse_event_bar_chart(results_df):
    """Build the dark-themed horizontal bar chart for the frequency table."""
    source = ColumnDataSource(results_df)
    # Categorical y-range is reversed so the most frequent event is on top.
    y_range = results_df['Adverse_Event'].tolist()[::-1]

    hover = HoverTool(tooltips=[("Event", "@Adverse_Event"), ("Frequency", "@Frequency")])

    p = figure(
        y_range=y_range, height=450, title="Top 10 Reported Adverse Events",
        sizing_mode='stretch_width', tools="pan,wheel_zoom,box_zoom,reset,save",
    )
    p.add_tools(hover)

    p.hbar(
        y='Adverse_Event', right='Frequency', source=source, height=0.7,
        color='#00A0FF', line_color='white', legend_label="Event Frequency"
    )

    # Style the plot for a dark theme
    p.background_fill_color = "#1C1C1C"
    p.border_fill_color = "#1C1C1C"
    p.outline_line_color = '#333333'
    p.title.text_color = "white"
    p.title.text_font_size = '16pt'
    p.title.align = "center"
    p.xaxis.axis_label = "Frequency Count"
    p.yaxis.axis_label = "Adverse Event"
    p.axis.axis_label_text_color = "#CCCCCC"
    p.axis.axis_label_text_font_size = "12pt"
    p.axis.major_label_text_color = "#AAAAAA"
    p.axis.major_label_text_font_size = "10pt"
    p.grid.grid_line_alpha = 0.3
    p.grid.grid_line_color = "#444444"
    p.x_range.start = 0
    p.legend.location = "top_right"
    p.legend.background_fill_color = "#2A2A2A"
    p.legend.background_fill_alpha = 0.7
    p.legend.border_line_color = "#444444"
    p.legend.label_text_color = "white"
    return p
628
+
629
def get_ethical_framework():
    """Return the ethical-AI framework table and a status message.

    Returns:
        A ``(framework_df, log)`` tuple. ``framework_df`` has one row per
        principle with columns ``Principle`` and ``Implementation Strategy``.
    """
    # Each principle is kept next to its strategy so the two cannot drift
    # out of alignment when the table is edited.
    rows = [
        ('Beneficence',
         'AI models prioritize patient outcomes and clinical efficacy.'),
        ('Non-maleficence',
         'Toxicity prediction and pharmacovigilance models aim to minimize patient harm.'),
        ('Fairness',
         'Algorithms are audited for demographic bias in training data and predictions.'),
        ('Transparency',
         'Model cards and SHAP values are provided for key decision-making processes.'),
    ]
    framework = {
        'Principle': [principle for principle, _ in rows],
        'Implementation Strategy': [strategy for _, strategy in rows],
    }
    return pd.DataFrame(framework), "✅ Generated Ethical AI Framework summary."
638
+
639
# --- 3. Streamlit UI Layout ---

# Seed session state on the first run only; later reruns keep whatever the
# phase handlers have stored.
if 'active_tab' not in st.session_state:
    st.session_state.active_tab = "Phase 1: Target Identification"
for _phase in ("p1", "p2", "p3", "p4"):
    if f"log_{_phase}" not in st.session_state:
        st.session_state[f"log_{_phase}"] = "Status logs will appear here."
for _phase in ("p1", "p2", "p3", "p4"):
    if f"results_{_phase}" not in st.session_state:
        # A fresh dict per phase so the results never alias each other.
        st.session_state[f"results_{_phase}"] = {}

# --- Header ---
st.title("🔬 AI-Powered Drug Discovery Pipeline")
st.markdown("An integrated application demonstrating a four-phase computational drug discovery workflow.")

# --- Main Tabs for Each Phase ---
tab1, tab2, tab3, tab4 = st.tabs([
    "**Phase 1:** Target Identification",
    "**Phase 2:** Hit Discovery & ADMET",
    "**Phase 3:** Lead Optimization",
    "**Phase 4:** Pre-clinical & RWE"
])
663
 
664
# --- Phase 1: Target Identification ---
# Collects a PDB ID, a protein ID and a compound selection, runs the Phase 1
# helpers on demand, and renders the results kept in st.session_state.
with tab1:
    st.header("Phase 1: Target Identification & Initial Analysis")
    st.markdown("""
    In this initial phase, we identify and analyze a biological target (e.g., a protein) implicated in a disease.
    We fetch its 3D structure and sequence data, then evaluate a set of initial compounds for their drug-like properties.
    """)

    st.subheader("Inputs & Controls")

    pdb_id_input = st.text_input("Enter PDB ID (e.g., 2HU4 for Neuraminidase)", "2HU4")
    protein_id_input = st.text_input("Enter NCBI Protein ID (e.g., P03468 for Neuraminidase)", "P03468")

    st.markdown("---")
    st.write("**Analyze Sample Compounds:**")
    sample_molecules = create_sample_molecules()
    selected_molecules = st.multiselect(
        "Select from known drugs:",
        options=list(sample_molecules.keys()),
        default=["Oseltamivir", "Aspirin"]
    )

    if st.button("🚀 Run Phase 1 Analysis", key="run_p1"):
        with st.spinner("Fetching data and calculating properties..."):
            full_log = "--- Phase 1 Analysis Started ---\n"

            pdb_data, log_pdb = fetch_pdb_structure(pdb_id_input)
            full_log += log_pdb
            # NOTE(review): the return value is appended to the log as-is;
            # confirm fetch_fasta_sequence returns a plain string.
            log_fasta = fetch_fasta_sequence(protein_id_input)
            full_log += log_fasta

            smiles_to_analyze = {name: sample_molecules[name] for name in selected_molecules}
            properties_df, log_props = calculate_molecular_properties(smiles_to_analyze)
            full_log += log_props

            analysis_df, display_df, log_likeness = assess_drug_likeness(properties_df)
            full_log += log_likeness

            protein_view_html, log_3d = visualize_protein_3d(pdb_data, title=f"PDB: {pdb_id_input}")
            full_log += log_3d

            dashboard_plot, log_dash = plot_properties_dashboard(analysis_df)
            full_log += log_dash

            full_log += "\n--- Phase 1 Analysis Complete ---"
            st.session_state.log_p1 = full_log

            st.session_state.results_p1 = {
                # Remember which structure was analysed so the results header
                # does not follow later edits to the text box.
                'pdb_id': pdb_id_input,
                'pdb_data': pdb_data,
                'protein_view': protein_view_html,
                'properties_df': display_df,
                'dashboard': dashboard_plot
            }

    st.text_area("Status & Logs", st.session_state.log_p1, height=200, key="log_p1_area")

    st.subheader("Results")
    if not st.session_state.results_p1:
        st.info("Click 'Run Phase 1 Analysis' to generate and display results.")
    else:
        res1 = st.session_state.results_p1
        p1_tabs = st.tabs(["Protein Structure", "Compound Properties Dashboard"])

        with p1_tabs[0]:
            st.subheader(f"3D Structure for PDB ID: {res1.get('pdb_id', pdb_id_input)}")
            if res1.get('protein_view'):
                st.components.v1.html(res1['protein_view'], height=600, scrolling=False)
            else:
                st.warning("Could not display 3D structure. Check PDB ID and logs.")

        with p1_tabs[1]:
            st.subheader("Physicochemical Properties Analysis")
            if res1.get('dashboard') is not None:
                st.bokeh_chart(res1['dashboard'], use_container_width=True)
            st.dataframe(res1.get('properties_df', pd.DataFrame()), use_container_width=True, hide_index=True)
739
+
740
+
741
# --- Phase 2: Hit Discovery & ADMET ---
# Runs the simulated screening and ADMET helpers over the Phase 2 library and
# shows the merged table plus a protein-ligand view for the chosen structure.
with tab2:
    st.header("Phase 2: Virtual Screening & Early ADMET")
    st.markdown("""
    This phase simulates a virtual screening process to identify 'hits' from a larger library of compounds.
    We predict their binding affinity to the target and assess their basic ADMET (Absorption, Distribution,
    Metabolism, Excretion, Toxicity) profiles.
    """)

    st.subheader("Inputs & Controls")

    p2_molecules = get_phase2_molecules()
    st.info(f"A library of {len(p2_molecules)} compounds is ready for screening.")

    p2_pdb_id = st.text_input("Enter PDB ID for Interaction (e.g., 2HU4)", "2HU4", key="p2_pdb")
    p2_ligand_resn = st.text_input("Ligand Residue Name in PDB (e.g., G39 for Oseltamivir)", "G39", key="p2_ligand")

    if st.button("🚀 Run Phase 2 Analysis", key="run_p2"):
        with st.spinner("Running virtual screening and ADMET predictions..."):
            full_log = "--- Phase 2 Analysis Started ---\n"

            screening_df, log_screen = simulate_virtual_screening(p2_molecules)
            full_log += log_screen
            admet_df, log_admet = predict_admet_properties(p2_molecules)
            full_log += log_admet

            merged_df = pd.merge(screening_df, admet_df, on="Molecule")

            pdb_data, log_pdb_p2 = fetch_pdb_structure(p2_pdb_id)
            full_log += log_pdb_p2

            interaction_view, log_interact = visualize_protein_ligand_interaction(pdb_data, p2_pdb_id, p2_ligand_resn)
            full_log += log_interact

            full_log += "\n--- Phase 2 Analysis Complete ---"
            st.session_state.log_p2 = full_log
            st.session_state.results_p2 = {
                # Keep the inputs that produced this view so the results
                # header stays accurate if the text boxes are edited later.
                'pdb_id': p2_pdb_id,
                'ligand_resn': p2_ligand_resn,
                'merged_df': merged_df,
                'interaction_view': interaction_view
            }

    st.text_area("Status & Logs", st.session_state.log_p2, height=200, key="log_p2_area")

    st.subheader("Results")
    if not st.session_state.results_p2:
        st.info("Click 'Run Phase 2 Analysis' to generate and display results.")
    else:
        res2 = st.session_state.results_p2
        p2_tabs = st.tabs(["Screening & ADMET Results", "Protein-Ligand Interaction"])

        with p2_tabs[0]:
            st.subheader("Virtual Screening & Early ADMET Predictions")
            st.dataframe(res2.get('merged_df', pd.DataFrame()), use_container_width=True, hide_index=True)

        with p2_tabs[1]:
            shown_pdb_id = res2.get('pdb_id', p2_pdb_id)
            shown_ligand = res2.get('ligand_resn', p2_ligand_resn)
            st.subheader(f"Simulated Interaction for PDB {shown_pdb_id} with Ligand {shown_ligand}")
            if res2.get('interaction_view'):
                st.components.v1.html(res2['interaction_view'], height=700, scrolling=False)
            else:
                st.warning("Could not display interaction view. Check inputs and logs.")
 
 
 
 
 
 
 
 
 
 
 
801
 
802
# --- Phase 3: Lead Optimization ---
# Computes detailed properties and a simulated toxicity prediction for the
# selected lead compounds and renders a 2D/3D view of each one.
with tab3:
    st.header("Phase 3: Lead Compound Optimization")
    st.markdown("""
    In lead optimization, promising 'hit' compounds are refined to improve their efficacy and safety.
    Here, we analyze a few selected lead candidates, perform more detailed property calculations,
    and predict their toxicity risk using a simulated machine learning model.
    """)

    st.subheader("Inputs & Controls")

    p3_molecules = get_phase3_molecules()
    selected_leads = st.multiselect(
        "Select lead compounds to optimize:",
        options=list(p3_molecules.keys()),
        default=['Oseltamivir', 'Remdesivir']
    )

    if st.button("🚀 Run Phase 3 Analysis", key="run_p3"):
        if not selected_leads:
            # With nothing selected the property tables are empty, and the
            # merge on "Compound" below has no key column to join on.
            st.warning("Select at least one lead compound before running the analysis.")
        else:
            with st.spinner("Analyzing lead compounds and predicting toxicity..."):
                full_log = "--- Phase 3 Analysis Started ---\n"

                smiles_to_analyze_p3 = {name: p3_molecules[name] for name in selected_leads}

                comp_props_df, log_comp = calculate_comprehensive_properties(smiles_to_analyze_p3)
                full_log += log_comp

                toxicity_df, log_tox = predict_toxicity(comp_props_df)
                full_log += log_tox

                final_df = pd.merge(comp_props_df, toxicity_df, on="Compound")

                visuals = {}
                for name, smiles in smiles_to_analyze_p3.items():
                    html_view, log_vis = visualize_molecule_2d_3d(smiles, name)
                    visuals[name] = html_view
                    full_log += log_vis

                full_log += "\n--- Phase 3 Analysis Complete ---"
                st.session_state.log_p3 = full_log
                st.session_state.results_p3 = {
                    'final_df': final_df,
                    'visuals': visuals
                }

    st.text_area("Status & Logs", st.session_state.log_p3, height=200, key="log_p3_area")

    st.subheader("Results")
    if not st.session_state.results_p3:
        st.info("Click 'Run Phase 3 Analysis' to generate and display results.")
    else:
        res3 = st.session_state.results_p3
        st.subheader("Lead Compound Analysis & Toxicity Prediction")
        st.dataframe(res3.get('final_df', pd.DataFrame()), use_container_width=True, hide_index=True)

        st.subheader("2D & 3D Molecular Structures")
        for visual_html in res3.get('visuals', {}).values():
            st.components.v1.html(visual_html, height=430, scrolling=False)
860
+
861
+
862
# --- Phase 4: Pre-clinical & RWE ---
# Analyses free-text adverse-event reports and shows the regulatory and
# ethical summary tables alongside the resulting frequency chart.
with tab4:
    st.header("Phase 4: Simulated Pre-clinical & Real-World Evidence (RWE)")
    st.markdown("""
    This final phase simulates post-market analysis. We analyze text data for adverse events (pharmacovigilance)
    and present documentation related to the AI models and ethical frameworks that would be required for regulatory submission.
    """)

    st.subheader("Inputs & Controls")

    rwd_input = st.text_area(
        "Enter simulated adverse event report text:",
        "Patient reports include instances of headache, severe nausea, and occasional skin rash. Some noted dizziness after taking the medication.",
        height=150
    )

    if st.button("🚀 Run Phase 4 Analysis", key="run_p4"):
        with st.spinner("Analyzing real-world data and generating reports..."):
            full_log = "--- Phase 4 Analysis Started ---\n"

            reg_df, log_reg = get_regulatory_summary()
            full_log += log_reg

            eth_df, log_eth = get_ethical_framework()
            full_log += log_eth

            rwd_df, plot_bar, log_rwd = simulate_rwd_analysis(rwd_input)
            full_log += log_rwd
            full_log += "\n--- Phase 4 Analysis Complete ---"
            st.session_state.log_p4 = full_log

            st.session_state.results_p4 = {
                'rwd_df': rwd_df,
                'plot_bar': plot_bar,
                'reg_df': reg_df,
                'eth_df': eth_df
            }

    st.text_area("Status & Logs", st.session_state.log_p4, height=200, key="log_p4_area")

    st.subheader("Results")
    if not st.session_state.results_p4:
        st.info("Click 'Run Phase 4 Analysis' to generate and display results.")
    else:
        res4 = st.session_state.results_p4
        p4_tabs = st.tabs(["Pharmacovigilance Analysis", "Regulatory & Ethical Frameworks"])

        with p4_tabs[0]:
            st.subheader("Simulated Adverse Event Analysis")
            # Presence check only: the stored value is a Bokeh figure.
            if res4.get('plot_bar') is not None:
                st.bokeh_chart(res4['plot_bar'], use_container_width=True)
            st.dataframe(res4.get('rwd_df', pd.DataFrame()), use_container_width=True, hide_index=True)

        with p4_tabs[1]:
            st.subheader("AI/ML Model Regulatory Summary")
            st.dataframe(res4.get('reg_df', pd.DataFrame()), use_container_width=True, hide_index=True)

            st.subheader("Ethical AI Framework")
            st.dataframe(res4.get('eth_df', pd.DataFrame()), use_container_width=True, hide_index=True)