Spaces:

alidenewade
/

mol-lang-lab

Sleeping

App Files Files Community

alidenewade committed on Jun 23

Commit

98e9d9e

verified ·

1 Parent(s): 87445f0

Update app.py

Browse files

Files changed (1) hide show

app.py +229 -177

app.py CHANGED Viewed

@@ -1,73 +1,137 @@
-# app.py
 import streamlit as st
 import torch
-from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline
 from rdkit import Chem
-from rdkit.Chem import Draw, AllChem
 from rdkit.Chem.Draw import MolToImage
 import pandas as pd
 import io
 import base64
 import logging
 import py3Dmol
-# Set up logging to monitor effects
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # --- Optimized Model Loading ---
 @st.cache_resource
 def load_optimized_models():
-    """Load models for CPU directly, bypassing quantization and GPU checks."""
-    device = "cpu" # Force device to CPU
-    torch_dtype = torch.float32 # Force full precision for CPU
     logger.info(f"Loading models on device: {device} with dtype: {torch_dtype}")
     # Model names
     model_name = "seyonec/PubChem10M_SMILES_BPE_450k"
-    # Load tokenizer
     fill_mask_tokenizer = AutoTokenizer.from_pretrained(model_name)
-    # Load model with standard settings for CPU
     try:
         fill_mask_model = AutoModelForMaskedLM.from_pretrained(
             model_name,
-            torch_dtype=torch_dtype,
-            device_map=None # No device mapping for plain CPU
         )
         # Set model to evaluation mode for inference
         fill_mask_model.eval()
-        # Create pipeline for CPU
         fill_mask_pipeline = pipeline(
             'fill-mask',
             model=fill_mask_model,
             tokenizer=fill_mask_tokenizer,
-            device=-1 # -1 means CPU
         )
-        logger.info("Models loaded successfully for CPU.")
         return fill_mask_tokenizer, fill_mask_model, fill_mask_pipeline
     except Exception as e:
-        logger.error(f"Error loading models on CPU: {e}")
-        st.error(f"Failed to load language model. Please try again. Error: {e}")
-        # Re-raise or handle as appropriate for app startup
-        raise # Critical error, app cannot proceed
-# Load models with optimizations
-fill_mask_tokenizer, fill_mask_model, fill_mask_pipeline = load_optimized_models()
-# --- Memory Management Utilities (now mostly a placeholder for CPU) ---
 def clear_gpu_cache():
-    """Placeholder for GPU cache clearing. Not effective on CPU."""
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
-# --- Helper Functions from Notebook (adapted) ---
 def get_mol(smiles):
     """Converts SMILES to RDKit Mol object and Kekulizes it."""
     mol = Chem.MolFromSmiles(smiles)
@@ -109,50 +173,66 @@ def get_image_with_highlight(mol, atomset=None, size=(300, 300)):
                      highlightAtomColors={i: highlight_color for i in valid_atomset} if valid_atomset else {})
     return img
def mol_to_sdf_string(mol):
    """Return a MOL block with 3D coordinates for *mol*.

    Args:
        mol: An RDKit ``Mol``, or ``None``.

    Returns:
        The MOL block text, or ``None`` when *mol* is ``None`` or no 3D
        conformer could be generated for it.
    """
    if mol is None:
        return None
    # Embed on a copy so the caller's molecule is not modified in place.
    mol_3d = Chem.Mol(mol)
    try:
        # EmbedMolecule signals failure by returning -1 instead of raising;
        # optimizing without a conformer would then raise.
        if AllChem.EmbedMolecule(mol_3d, AllChem.ETKDG()) == -1:
            logger.warning("3D embedding failed; no SDF generated.")
            return None
        AllChem.UFFOptimizeMolecule(mol_3d)
    except (ValueError, RuntimeError) as exc:
        logger.warning("3D coordinate generation failed: %s", exc)
        return None
    return Chem.MolToMolBlock(mol_3d)
def render_mol_3d(sdf_string, width=300, height=300):
    """Render a molecule as a self-contained py3Dmol HTML snippet.

    Args:
        sdf_string: MOL/SDF text of the molecule, or ``None``.
        width: Viewer width in pixels.
        height: Viewer height in pixels.

    Returns:
        HTML markup for embedding (e.g. via ``st.components.v1.html``), or an
        empty string when *sdf_string* is ``None``.
    """
    if sdf_string is None:
        return ""
    viewer = py3Dmol.view(width=width, height=height)
    viewer.setBackgroundColor('#1C1C1C')
    viewer.addModel(sdf_string, 'sdf')
    viewer.setStyle({'stick': {}})  # display as sticks
    viewer.zoomTo()
    # py3Dmol forwards unknown attribute names to JavaScript, so a call such
    # as ``viewer.to_html()`` only queues a bogus JS command and returns the
    # view object. ``_make_html`` is the method that produces the markup.
    return viewer._make_html()
 # --- Streamlit Interface Functions ---
 def predict_and_visualize_masked_smiles(smiles_mask, substructure_smarts_highlight="CC=CC"):
     """
     Predicts masked tokens in a SMILES string, shows scores, and visualizes molecules.
-    Returns 5 image paths and a status message.
     """
     if fill_mask_tokenizer.mask_token not in smiles_mask:
         st.error("Error: Input SMILES must contain a mask token (e.g., <mask>).")
-        return pd.DataFrame(), [None]*5, [None]*5, "Error: Input SMILES must contain a mask token (e.g., <mask>)."
     try:
         with torch.no_grad():
             predictions = fill_mask_pipeline(smiles_mask, top_k=10)
     except Exception as e:
         clear_gpu_cache()
         st.error(f"Error during prediction: {str(e)}")
-        return pd.DataFrame(), [None]*5, [None]*5, f"Error during prediction: {str(e)}"
     results_data = []
-    image_2d_list = []
-    image_3d_list = []
     valid_predictions_count = 0
     for pred in predictions:
@@ -165,157 +245,129 @@ def predict_and_visualize_masked_smiles(smiles_mask, substructure_smarts_highlig
         mol = get_mol(predicted_smiles)
         if mol:
             results_data.append({"Predicted SMILES": predicted_smiles, "Score": f"{score:.4f}"})
             atom_matches_indices = []
             if substructure_smarts_highlight:
                 matches = find_matches_one(mol, substructure_smarts_highlight)
                 if matches:
-                    atom_matches_indices = list(matches[0]) # Highlight first match
-            img_2d = get_image_with_highlight(mol, atomset=atom_matches_indices)
-            image_2d_list.append(img_2d)
-            # For 3D, we need an SDF string
-            sdf_string = mol_to_sdf_string(mol)
-            img_3d_html = render_mol_3d(sdf_string, width=300, height=300)
-            image_3d_list.append(img_3d_html)
-            valid_predictions_count += 1
-    # Pad image lists if fewer than 5 valid predictions
-    while len(image_2d_list) < 5:
-        image_2d_list.append(None)
-        image_3d_list.append(None)
-    df_results = pd.DataFrame(results_data)
     clear_gpu_cache()
-    status_message = "Prediction successful." if valid_predictions_count > 0 else "No valid molecules found for top predictions."
-    return df_results, image_2d_list, image_3d_list, status_message
def display_molecule_with_3d(smiles_string):
    """Build the 2D image and 3D viewer markup for one SMILES string.

    Returns:
        A ``(image_2d, viewer_3d_html, status_message)`` tuple. The first two
        items are ``None`` when the input is empty or cannot be parsed.
    """
    if not smiles_string:
        return None, None, "Please enter a SMILES string."

    parsed = get_mol(smiles_string)
    if parsed is None:
        return None, None, "Invalid SMILES string."

    picture = MolToImage(parsed, size=(400, 400), fitImage=True)
    viewer_html = render_mol_3d(mol_to_sdf_string(parsed), width=400, height=400)
    return picture, viewer_html, "Molecule displayed."
-# --- Streamlit UI Definition ---
-# Set wide mode and background color
# Dark-theme overrides injected into the page as raw HTML/CSS.
_DARK_THEME_CSS = """
    <style>
    .stApp {
        background-color: rgb(28,28,28);
        color: white; /* Ensure text is visible on dark background */
    }
    .stDataFrame {
        color: black; /* Default DataFrame text color */
    }
    h1, h2, h3, h4, h5, h6, .stMarkdown {
        color: white;
    }
    .css-1d391kg, .css-1dp5dn1 { /* Target Streamlit's main content and sidebar */
        color: white;
    }
    .streamlit-expanderContent {
        background-color: rgb(40,40,40); /* Slightly lighter background for expanders */
        border-radius: 10px;
        padding: 10px;
    }
    /* Style for text inputs and buttons */
    .stTextInput>div>div>input {
        background-color: rgb(50,50,50);
        color: white;
        border-radius: 5px;
        border: 1px solid rgb(70,70,70);
    }
    .stButton>button {
        background-color: rgb(0,128,255); /* Blue button */
        color: white;
        border-radius: 8px;
        padding: 10px 20px;
        border: none;
        transition: background-color 0.3s ease;
    }
    .stButton>button:hover {
        background-color: rgb(0,100,200);
    }
    </style>
    """

# Wide layout plus the dark theme.
st.set_page_config(layout="wide")
st.markdown(_DARK_THEME_CSS, unsafe_allow_html=True)

st.title("ChemBERTa SMILES Utilities Dashboard")
prediction_tab, viewer_tab = st.tabs(["Masked SMILES Prediction", "Molecule Viewer"])

# --- Tab 1: fill the <mask> token and visualize the candidates ---
with prediction_tab:
    st.markdown("Enter a SMILES string with a `<mask>` token (e.g., `C1=CC=CC<mask>C1`) to predict possible completions.")
    mask_column, smarts_column = st.columns([2, 1])
    with mask_column:
        smiles_input_masked = st.text_input("SMILES String with Mask", value="C1=CC=CC<mask>C1")
    with smarts_column:
        substructure_input = st.text_input("Substructure to Highlight (SMARTS)", value="C=C")

    if st.button("Predict and Visualize", key="predict_button"):
        with st.spinner("Predicting and visualizing..."):
            df_predictions, img_2d_list, img_3d_list, status_msg = predict_and_visualize_masked_smiles(
                smiles_input_masked, substructure_input
            )
            st.write(status_msg)
            if not df_predictions.empty:
                st.subheader("Top Predictions & Scores")
                st.dataframe(df_predictions, use_container_width=True)
                st.subheader("Predicted Molecule Visualizations (Top 5 Valid)")
                # Only the first five slots are shown; both lists are the same length.
                for i, (flat_image, viewer_html) in enumerate(zip(img_2d_list[:5], img_3d_list[:5])):
                    if flat_image is None:
                        # Only mention a missing picture if a prediction row exists for this slot.
                        if i < len(df_predictions):
                            st.markdown(f"**Prediction {i+1}**: No visualization available (invalid SMILES or error).")
                        continue
                    st.markdown(f"**Prediction {i+1}**")
                    flat_column, spatial_column = st.columns(2)
                    with flat_column:
                        st.image(flat_image, caption=f"2D Prediction {i+1}", use_column_width=True)
                    with spatial_column:
                        st.components.v1.html(viewer_html, height=300)

# --- Tab 2: show a single molecule in 2D and 3D ---
with viewer_tab:
    st.markdown("Enter a SMILES string to display its 2D and 3D structure.")
    smiles_input_viewer = st.text_input("SMILES String", value="C1=CC=CC=C1", key="viewer_smiles_input")
    if st.button("View Molecule", key="view_button"):
        with st.spinner("Displaying molecule..."):
            img_2d_viewer, img_3d_viewer_html, status_viewer_msg = display_molecule_with_3d(smiles_input_viewer)
            st.write(status_viewer_msg)
            if img_2d_viewer is None:
                st.warning("Could not display molecule. Please check the SMILES string.")
            else:
                flat_column, spatial_column = st.columns(2)
                with flat_column:
                    st.image(img_2d_viewer, caption="2D Molecule Structure", use_column_width=True)
                with spatial_column:
                    st.components.v1.html(img_3d_viewer_html, height=400)

 import streamlit as st
 import torch
+from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline, BitsAndBytesConfig
 from rdkit import Chem
+from rdkit.Chem import Draw, rdFMCS, AllChem
 from rdkit.Chem.Draw import MolToImage
 import pandas as pd
 import io
 import base64
 import logging
 import py3Dmol
+from stmol import showmol
+# Set up logging to monitor quantization effects
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# Page configuration
# Browser-tab title and icon, with the wide page layout.
st.set_page_config(page_title="ChemBERTa SMILES Utilities Dashboard", page_icon="🧪", layout="wide")
+# --- Quantization Configuration ---
@st.cache_resource
def get_quantization_config():
    """Build the 8-bit quantization config used when loading the model.

    Returns:
        A ``BitsAndBytesConfig`` with ``load_in_8bit=True``, or ``None`` when
        the config cannot be created, in which case callers load the model
        without quantization.
    """
    try:
        # Only ``load_in_8bit`` is passed. Compute dtype and double (nested)
        # quantization are 4-bit options (``bnb_4bit_*``); ``bnb_8bit_*``
        # spellings are not recognized parameters and would be ignored.
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        logger.info("8-bit quantization configuration loaded successfully")
        return quantization_config
    except ImportError:
        logger.warning("bitsandbytes not available, falling back to standard loading")
        return None
    except Exception as e:
        # Best effort: any other setup problem also means "no quantization".
        logger.warning(f"Quantization setup failed: {e}, using standard loading")
        return None
def get_torch_dtype():
    """Choose the model dtype from the available hardware.

    Returns:
        ``torch.float16`` when CUDA is available, otherwise ``torch.float32``.
    """
    return torch.float16 if torch.cuda.is_available() else torch.float32
 # --- Optimized Model Loading ---
 @st.cache_resource
 def load_optimized_models():
+    """Load models with quantization and other optimizations."""
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    torch_dtype = get_torch_dtype()
+    quantization_config = get_quantization_config()
     logger.info(f"Loading models on device: {device} with dtype: {torch_dtype}")
     # Model names
     model_name = "seyonec/PubChem10M_SMILES_BPE_450k"
+    # Load tokenizer (doesn't need quantization)
     fill_mask_tokenizer = AutoTokenizer.from_pretrained(model_name)
+    # Load model with quantization if available
+    model_kwargs = {
+        "torch_dtype": torch_dtype,
+    }
+    if quantization_config is not None and torch.cuda.is_available(): # Quantization typically for GPU
+        model_kwargs["quantization_config"] = quantization_config
+        # device_map="auto" is often used with bitsandbytes for automatic distribution
+        model_kwargs["device_map"] = "auto"
+    elif torch.cuda.is_available():
+        model_kwargs["device_map"] = "auto" # For non-quantized GPU loading
+    else:
+        model_kwargs["device_map"] = None # For CPU
     try:
+        # Masked LM Model
         fill_mask_model = AutoModelForMaskedLM.from_pretrained(
             model_name,
+            **model_kwargs
         )
         # Set model to evaluation mode for inference
         fill_mask_model.eval()
+        # Create optimized pipeline
+        # Let pipeline infer device from model if possible, or set based on model's device
+        pipeline_device = fill_mask_model.device.index if hasattr(fill_mask_model.device, 'type') and fill_mask_model.device.type == "cuda" else -1
         fill_mask_pipeline = pipeline(
             'fill-mask',
             model=fill_mask_model,
             tokenizer=fill_mask_tokenizer,
+            device=pipeline_device, # Use model's device
         )
+        logger.info("Models loaded successfully with optimizations")
         return fill_mask_tokenizer, fill_mask_model, fill_mask_pipeline
     except Exception as e:
+        logger.error(f"Error loading optimized models: {e}")
+        # Fallback to standard loading
+        logger.info("Falling back to standard model loading...")
+        return load_standard_models(model_name)
def load_standard_models(model_name):
    """Load the tokenizer, model and pipeline without quantization.

    Used as the fallback when the optimized load fails.

    Args:
        model_name: Hugging Face model identifier to load.

    Returns:
        A ``(tokenizer, model, pipeline)`` tuple.
    """
    fill_mask_tokenizer = AutoTokenizer.from_pretrained(model_name)
    fill_mask_model = AutoModelForMaskedLM.from_pretrained(model_name)
    # Evaluation mode, matching the optimized loader.
    fill_mask_model.eval()
    # 0 = first CUDA device, -1 = CPU. The pipeline places the model on this
    # device itself, so no separate ``.to("cuda")`` is needed afterwards.
    device_idx = 0 if torch.cuda.is_available() else -1
    fill_mask_pipeline = pipeline(
        'fill-mask',
        model=fill_mask_model,
        tokenizer=fill_mask_tokenizer,
        device=device_idx,
    )
    return fill_mask_tokenizer, fill_mask_model, fill_mask_pipeline
+# --- Memory Management Utilities ---
 def clear_gpu_cache():
+    """Clear CUDA cache to free up memory."""
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
+# --- Helper Functions ---
 def get_mol(smiles):
     """Converts SMILES to RDKit Mol object and Kekulizes it."""
     mol = Chem.MolFromSmiles(smiles)
                      highlightAtomColors={i: highlight_color for i in valid_atomset} if valid_atomset else {})
     return img
def generate_3d_structure(mol):
    """Generate a hydrogen-complete copy of *mol* with 3D coordinates.

    Args:
        mol: An RDKit ``Mol``, or ``None``.

    Returns:
        A new molecule with explicit hydrogens and an embedded, UFF-optimized
        conformer, or ``None`` when *mol* is ``None`` or generation fails.
    """
    if mol is None:
        return None
    # AddHs returns a new molecule, so the caller's object is left untouched.
    mol_3d = Chem.AddHs(mol)
    try:
        # Fixed seed keeps the generated geometry reproducible. EmbedMolecule
        # reports failure by returning -1 rather than raising.
        if AllChem.EmbedMolecule(mol_3d, randomSeed=42) == -1:
            logger.warning("3D embedding failed for molecule")
            return None
        AllChem.UFFOptimizeMolecule(mol_3d)
    except Exception as exc:
        # Best effort: the caller shows a message when None is returned.
        logger.warning("3D structure generation failed: %s", exc)
        return None
    return mol_3d
def mol_to_3d_html(mol):
    """Create a 400x400 py3Dmol stick-model viewer for *mol*.

    Despite the name, the return value is the ``py3Dmol.view`` object itself,
    not an HTML string. ``None`` is returned when *mol* is ``None``.
    """
    if mol is None:
        return None
    mol_block = Chem.MolToMolBlock(mol)
    view = py3Dmol.view(width=400, height=400)
    view.addModel(mol_block, 'sdf')
    view.setStyle({'stick': {}})
    view.zoomTo()
    return view
 # --- Streamlit Interface Functions ---
 def predict_and_visualize_masked_smiles(smiles_mask, substructure_smarts_highlight="CC=CC"):
     """
     Predicts masked tokens in a SMILES string, shows scores, and visualizes molecules.
     """
+    # Load models
+    fill_mask_tokenizer, fill_mask_model, fill_mask_pipeline = load_optimized_models()
     if fill_mask_tokenizer.mask_token not in smiles_mask:
         st.error("Error: Input SMILES must contain a mask token (e.g., <mask>).")
+        return
     try:
+        # Use torch.no_grad() for inference to save memory
         with torch.no_grad():
             predictions = fill_mask_pipeline(smiles_mask, top_k=10)
     except Exception as e:
         clear_gpu_cache()
         st.error(f"Error during prediction: {str(e)}")
+        return
     results_data = []
+    valid_predictions = []
     valid_predictions_count = 0
     for pred in predictions:
         mol = get_mol(predicted_smiles)
         if mol:
             results_data.append({"Predicted SMILES": predicted_smiles, "Score": f"{score:.4f}"})
+            valid_predictions.append((mol, predicted_smiles, score))
+            valid_predictions_count += 1
+    if valid_predictions_count == 0:
+        st.warning("No valid molecules found for top predictions.")
+        return
+    # Display results table
+    df_results = pd.DataFrame(results_data)
+    st.subheader("Top Predictions & Scores")
+    st.dataframe(df_results, use_container_width=True)
+    # Display molecule visualizations
+    st.subheader("Predicted Molecule Visualizations")
+    for i, (mol, smiles, score) in enumerate(valid_predictions):
+        st.write(f"**Prediction {i+1}:** {smiles} (Score: {score:.4f})")
+        col1, col2 = st.columns(2)
+        with col1:
+            st.write("**2D Structure:**")
             atom_matches_indices = []
             if substructure_smarts_highlight:
                 matches = find_matches_one(mol, substructure_smarts_highlight)
                 if matches:
+                    atom_matches_indices = list(matches[0])
+            img_2d = get_image_with_highlight(mol, atomset=atom_matches_indices)
+            if img_2d:
+                st.image(img_2d, use_column_width=True)
+        with col2:
+            st.write("**3D Structure:**")
+            mol_3d = generate_3d_structure(mol)
+            if mol_3d:
+                viewer_3d = mol_to_3d_html(mol_3d)
+                if viewer_3d:
+                    showmol(viewer_3d, height=400, width=400)
+            else:
+                st.write("3D structure generation failed for this molecule.")
+        st.divider()
+    # Clear cache after inference
     clear_gpu_cache()
+    st.success("Prediction successful!")
def display_molecule_image(smiles_string):
    """Render the 2D and 3D views of one molecule side by side.

    Writes straight to the Streamlit page and returns ``None``. An error
    message is shown when the input is empty or is not a parseable SMILES
    string.
    """
    if not smiles_string:
        st.error("Please enter a SMILES string.")
        return

    mol = get_mol(smiles_string)
    if mol is None:
        st.error("Invalid SMILES string.")
        return

    st.success("Molecule displayed successfully!")
    flat_panel, spatial_panel = st.columns(2)

    with flat_panel:
        st.subheader("2D Structure")
        st.image(MolToImage(mol, size=(400, 400), fitImage=True), use_column_width=True)

    with spatial_panel:
        st.subheader("3D Structure")
        mol_3d = generate_3d_structure(mol)
        if not mol_3d:
            st.write("3D structure generation failed for this molecule.")
        else:
            viewer_3d = mol_to_3d_html(mol_3d)
            if viewer_3d:
                showmol(viewer_3d, height=400, width=400)
+# --- Main Streamlit App ---
def _render_prediction_tool():
    """Draw the inputs and action button of the masked-SMILES prediction tool."""
    st.header("Masked SMILES Prediction")
    st.markdown("Enter a SMILES string with a `<mask>` token (e.g., `C1=CC=CC<mask>C1`) to predict possible completions.")
    mask_column, smarts_column = st.columns(2)
    with mask_column:
        smiles_input_masked = st.text_input("SMILES String with Mask", value="C1=CC=CC<mask>C1")
    with smarts_column:
        substructure_input = st.text_input("Substructure to Highlight (SMARTS)", value="C=C")
    if st.button("Predict and Visualize", type="primary"):
        with st.spinner("Predicting masked SMILES..."):
            predict_and_visualize_masked_smiles(smiles_input_masked, substructure_input)


def _render_viewer_tool():
    """Draw the input and action button of the molecule viewer tool."""
    st.header("Molecule Viewer")
    st.markdown("Enter a SMILES string to display its 2D and 3D structure.")
    smiles_input_viewer = st.text_input("SMILES String", value="C1=CC=CC=C1")
    if st.button("View Molecule", type="primary"):
        with st.spinner("Generating molecule structures..."):
            display_molecule_image(smiles_input_viewer)


def main():
    """Entry point: page title, sidebar tool selector, and the selected tool."""
    st.title("🧪 ChemBERTa SMILES Utilities Dashboard")

    st.sidebar.title("Navigation")
    tab_selection = st.sidebar.selectbox(
        "Choose a tool:",
        ["Masked SMILES Prediction", "Molecule Viewer"],
    )

    if tab_selection == "Masked SMILES Prediction":
        _render_prediction_tool()
    elif tab_selection == "Molecule Viewer":
        _render_viewer_tool()


if __name__ == "__main__":
    main()