Spaces:

alidenewade
/

mol-lang-lab

Sleeping

App Files Files Community

alidenewade committed on Jun 23

Commit

35ed017

verified ·

1 Parent(s): 54eac43

Update app.py

Browse files

Files changed (1) hide show

app.py +286 -240

app.py CHANGED Viewed

@@ -1,142 +1,137 @@
 # app.py
-import gradio as gr
 import torch
 from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline, BitsAndBytesConfig
 from rdkit import Chem
-from rdkit.Chem import Draw, rdFMCS
-from rdkit.Chem.Draw import MolToImage
-# PIL is imported as Image by rdkit.Chem.Draw.MolToImage, but explicit import is good practice if used directly.
-# from PIL import Image
 import pandas as pd
-import io
-import base64
 import logging
-# Set up logging to monitor quantization effects
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# --- Quantization Configuration ---
-def get_quantization_config():
-    """
-    Configure 8-bit quantization for model optimization.
-    Falls back gracefully if bitsandbytes is not available.
-    """
     try:
-        # 8-bit quantization configuration - good balance of speed and quality
         quantization_config = BitsAndBytesConfig(
             load_in_8bit=True,
-            bnb_8bit_compute_dtype=torch.float16,
-            bnb_8bit_use_double_quant=True,  # Nested quantization for better compression
         )
-        logger.info("8-bit quantization configuration loaded successfully")
-        return quantization_config
     except ImportError:
-        logger.warning("bitsandbytes not available, falling back to standard loading")
-        return None
-    except Exception as e:
-        logger.warning(f"Quantization setup failed: {e}, using standard loading")
-        return None
-def get_torch_dtype():
-    """Get appropriate torch dtype based on available hardware."""
-    if torch.cuda.is_available():
-        return torch.float16  # Use half precision on GPU
-    else:
-        return torch.float32  # Keep full precision on CPU
-# --- Optimized Model Loading ---
-def load_optimized_models():
-    """Load models with quantization and other optimizations."""
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    torch_dtype = get_torch_dtype()
-    quantization_config = get_quantization_config()
-    logger.info(f"Loading models on device: {device} with dtype: {torch_dtype}")
-    # Model names
     model_name = "seyonec/PubChem10M_SMILES_BPE_450k"
-    # Load tokenizer (doesn't need quantization)
-    fill_mask_tokenizer = AutoTokenizer.from_pretrained(model_name)
-    # Load model with quantization if available
-    model_kwargs = {
-        "torch_dtype": torch_dtype,
-    }
-    if quantization_config is not None and torch.cuda.is_available(): # Quantization typically for GPU
         model_kwargs["quantization_config"] = quantization_config
-        # device_map="auto" is often used with bitsandbytes for automatic distribution
         model_kwargs["device_map"] = "auto"
-    elif torch.cuda.is_available():
-        model_kwargs["device_map"] = "auto" # For non-quantized GPU loading
-    else:
-        model_kwargs["device_map"] = None # For CPU
-    try:
-        # Masked LM Model
-        fill_mask_model = AutoModelForMaskedLM.from_pretrained(
-            model_name,
-            **model_kwargs
-        )
-        # Set model to evaluation mode for inference
-        fill_mask_model.eval()
-        # Create optimized pipeline
-        # Let pipeline infer device from model if possible, or set based on model's device
-        pipeline_device = fill_mask_model.device.index if hasattr(fill_mask_model.device, 'type') and fill_mask_model.device.type == "cuda" else -1
-        fill_mask_pipeline = pipeline(
-            'fill-mask',
-            model=fill_mask_model,
-            tokenizer=fill_mask_tokenizer,
-            device=pipeline_device, # Use model's device
-            # torch_dtype=torch_dtype # Pipeline might infer this or it might conflict
-        )
-        logger.info("Models loaded successfully with optimizations")
-        return fill_mask_tokenizer, fill_mask_model, fill_mask_pipeline
-    except Exception as e:
-        logger.error(f"Error loading optimized models: {e}")
-        # Fallback to standard loading
-        logger.info("Falling back to standard model loading...")
-        return load_standard_models(model_name)
-def load_standard_models(model_name):
-    """Fallback standard model loading without quantization."""
-    fill_mask_tokenizer = AutoTokenizer.from_pretrained(model_name)
-    fill_mask_model = AutoModelForMaskedLM.from_pretrained(model_name)
-    # Determine device for standard loading
-    device_idx = 0 if torch.cuda.is_available() else -1
-    fill_mask_pipeline = pipeline('fill-mask', model=fill_mask_model, tokenizer=fill_mask_tokenizer, device=device_idx)
-    if torch.cuda.is_available():
-        fill_mask_model.to("cuda")
-    return fill_mask_tokenizer, fill_mask_model, fill_mask_pipeline
-# Load models with optimizations
-fill_mask_tokenizer, fill_mask_model, fill_mask_pipeline = load_optimized_models()
-# --- Memory Management Utilities ---
-def clear_gpu_cache():
-    """Clear CUDA cache to free up memory."""
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-# --- Helper Functions from Notebook (adapted) ---
 def get_mol(smiles):
-    """Converts SMILES to RDKit Mol object and Kekulizes it."""
     mol = Chem.MolFromSmiles(smiles)
-    if mol is None:
-        return None
-    try:
-        Chem.Kekulize(mol)
-    except: # Kekulization can fail for some structures
-        pass
     return mol
 def find_matches_one(mol, submol_smarts):
@@ -149,149 +144,200 @@ def find_matches_one(mol, submol_smarts):
     matches = mol.GetSubstructMatches(submol)
     return matches
-def get_image_with_highlight(mol, atomset=None, size=(300, 300)):
-    """Draws molecule with optional atom highlighting."""
-    if mol is None:
-        return None
-    highlight_color = (0, 1, 0, 0.5) # Green with some transparency
-    # Ensure atomset contains integers if not None or empty
-    valid_atomset = []
-    if atomset:
         try:
-            valid_atomset = [int(a) for a in atomset]
-        except ValueError:
-            logger.warning(f"Invalid atom in atomset: {atomset}. Proceeding without highlighting problematic atoms.")
-            valid_atomset = [int(a) for a in atomset if str(a).isdigit()] # Filter out non-integers
-    img = MolToImage(mol, size=size, fitImage=True,
-                     highlightAtoms=valid_atomset if valid_atomset else [],
-                     highlightAtomColors={i: highlight_color for i in valid_atomset} if valid_atomset else {})
-    return img
-# --- Optimized Gradio Interface Functions ---
-def predict_and_visualize_masked_smiles(smiles_mask, substructure_smarts_highlight="CC=CC"):
-    """
-    Predicts masked tokens in a SMILES string, shows scores, and visualizes molecules.
-    Optimized with memory management. Returns 7 items for Gradio outputs.
-    """
-    if fill_mask_tokenizer.mask_token not in smiles_mask:
-        # Return 7 items for the 7 output components
-        return pd.DataFrame(), None, None, None, None, None, "Error: Input SMILES must contain a mask token (e.g., <mask>)."
     try:
-        # Use torch.no_grad() for inference to save memory
         with torch.no_grad():
-            predictions = fill_mask_pipeline(smiles_mask, top_k=10) # Get more to filter for valid ones
     except Exception as e:
-        clear_gpu_cache()  # Clear cache on error
-        # Return 7 items
-        return pd.DataFrame(), None, None, None, None, None, f"Error during prediction: {str(e)}"
     results_data = []
-    image_list = []
     valid_predictions_count = 0
-    for pred in predictions:
         if valid_predictions_count >= 5:
             break
         predicted_smiles = pred['sequence']
         score = pred['score']
         mol = get_mol(predicted_smiles)
-        if mol:
-            results_data.append({"Predicted SMILES": predicted_smiles, "Score": f"{score:.4f}"})
-            atom_matches_indices = []
-            if substructure_smarts_highlight:
-                matches = find_matches_one(mol, substructure_smarts_highlight)
-                if matches:
-                    atom_matches_indices = list(matches[0]) # Highlight first match
-            img = get_image_with_highlight(mol, atomset=atom_matches_indices)
-            image_list.append(img)
             valid_predictions_count += 1
-    # Pad image_list if fewer than 5 valid predictions
-    while len(image_list) < 5:
-        image_list.append(None)
     df_results = pd.DataFrame(results_data)
-    # Clear cache after inference
-    clear_gpu_cache()
-    status_message = "Prediction successful." if valid_predictions_count > 0 else "No valid molecules found for top predictions."
-    # Unpack image_list into individual image outputs + df_results + status_message
-    return df_results, image_list[0], image_list[1], image_list[2], image_list[3], image_list[4], status_message
-def display_molecule_image(smiles_string):
-    """
-    Displays a 2D image of a molecule from its SMILES string.
-    """
-    if not smiles_string:
-        return None, "Please enter a SMILES string."
-    mol = get_mol(smiles_string)
-    if mol is None:
-        return None, "Invalid SMILES string."
-    img = MolToImage(mol, size=(400, 400), fitImage=True)
-    return img, "Molecule displayed."
-# --- Gradio Interface Definition ---
-with gr.Blocks(theme=gr.themes.Default()) as demo:
-    gr.Markdown("# ChemBERTa SMILES Utilities Dashboard")
-    with gr.Tab("Masked SMILES Prediction"):
-        gr.Markdown("Enter a SMILES string with a `<mask>` token (e.g., `C1=CC=CC<mask>C1`) to predict possible completions.")
-        with gr.Row():
-            smiles_input_masked = gr.Textbox(label="SMILES String with Mask", value="C1=CC=CC<mask>C1")
-            substructure_input = gr.Textbox(label="Substructure to Highlight (SMARTS)", value="C=C")
-        predict_button_masked = gr.Button("Predict and Visualize")
-        status_masked = gr.Textbox(label="Status", interactive=False)
-        predictions_table = gr.DataFrame(label="Top Predictions & Scores")
-        gr.Markdown("### Predicted Molecule Visualizations (Top 5 Valid)")
-        with gr.Row():
-            img_out_1 = gr.Image(label="Prediction 1", type="pil", interactive=False)
-            img_out_2 = gr.Image(label="Prediction 2", type="pil", interactive=False)
-            img_out_3 = gr.Image(label="Prediction 3", type="pil", interactive=False)
-            img_out_4 = gr.Image(label="Prediction 4", type="pil", interactive=False)
-            img_out_5 = gr.Image(label="Prediction 5", type="pil", interactive=False)
-        # Automatically populate on load for the default example
-        demo.load(
-            lambda: predict_and_visualize_masked_smiles("C1=CC=CC<mask>C1", "C=C"),
-            inputs=None,
-            outputs=[predictions_table, img_out_1, img_out_2, img_out_3, img_out_4, img_out_5, status_masked]
-        )
-        predict_button_masked.click(
-            predict_and_visualize_masked_smiles,
-            inputs=[smiles_input_masked, substructure_input],
-            outputs=[predictions_table, img_out_1, img_out_2, img_out_3, img_out_4, img_out_5, status_masked]
-        )
-    with gr.Tab("Molecule Viewer"):
-        gr.Markdown("Enter a SMILES string to display its 2D structure.")
-        smiles_input_viewer = gr.Textbox(label="SMILES String", value="C1=CC=CC=C1")
-        view_button_molecule = gr.Button("View Molecule")
-        status_viewer = gr.Textbox(label="Status", interactive=False)
-        molecule_image_output = gr.Image(label="Molecule Structure", type="pil", interactive=False)
-        # Automatically populate on load for the default example
-        demo.load(
-            lambda: display_molecule_image("C1=CC=CC=C1"),
-            inputs=None,
-            outputs=[molecule_image_output, status_viewer]
-        )
-        view_button_molecule.click(
-            display_molecule_image,
-            inputs=[smiles_input_viewer],
-            outputs=[molecule_image_output, status_viewer]
-        )
-if __name__ == "__main__":
-    demo.launch()

 # app.py
+import streamlit as st
 import torch
 from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline, BitsAndBytesConfig
 from rdkit import Chem
+from rdkit.Chem import Draw, AllChem
 import pandas as pd
+import py3Dmol
+import re
 import logging
+# Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# --- Page Configuration ---
+# Sets the browser-tab title and icon and switches the app to the wide layout.
+# NOTE(review): Streamlit documents set_page_config as a call that belongs
+# ahead of other st.* commands; it is placed before the styling call for that
+# reason — confirm against the Streamlit version used by this Space.
+st.set_page_config(
+    page_title="ChemBERTa SMILES Utilities",
+    page_icon="🧪",
+    layout="wide",
+)
+# --- Custom Styling (from drug_app) ---
+def apply_custom_styling():
+    """Inject the app-wide dark-theme CSS into the page.
+
+    Emits a single ``<style>`` block through ``st.markdown`` with
+    ``unsafe_allow_html=True``. The CSS sets the Roboto font, the dark page
+    background with white text, and the tab and button colours. Takes no
+    arguments and returns nothing.
+
+    NOTE(review): the selectors target Streamlit-generated class names and
+    ``data-baseweb`` attributes; presumably these can change between
+    Streamlit releases — verify after upgrades.
+    """
+    st.markdown(
+        """
+        <style>
+        @import url('https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap');
+        html, body, [class*="st-"] {
+            font-family: 'Roboto', sans-serif;
+        }
+        .stApp {
+            background-color: rgb(28, 28, 28);
+            color: white;
+        }
+        /* Tab styles */
+        .stTabs [data-baseweb="tab-list"] {
+            gap: 24px;
+        }
+        .stTabs [data-baseweb="tab"] {
+            height: 50px;
+            white-space: pre-wrap;
+            background: none;
+            border-radius: 0px;
+            border-bottom: 2px solid #333;
+            padding: 10px 4px;
+            color: #AAA;
+        }
+        .stTabs [data-baseweb="tab"]:hover {
+            background: #222;
+            color: #FFF;
+        }
+        .stTabs [aria-selected="true"] {
+            border-bottom: 2px solid #00A0FF; /* Highlight color for active tab */
+            color: #FFF;
+        }
+        /* Button styles */
+        .stButton>button {
+            border-color: #00A0FF;
+            color: #00A0FF;
+            background-color: transparent;
+        }
+        .stButton>button:hover {
+            border-color: #FFF;
+            color: #FFF;
+            background-color: #00A0FF;
+        }
+        </style>
+        """,
+        unsafe_allow_html=True
+    )
+# The styling is applied at module level, immediately after the definition.
+apply_custom_styling()
+# --- Model Loading (from mol_app) ---
+@st.cache_resource(show_spinner="Loading ChemBERTa model...")
+def load_optimized_models():
+    """Load models with quantization and other optimizations."""
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
     try:
         quantization_config = BitsAndBytesConfig(
             load_in_8bit=True,
+            bnb_8bit_compute_dtype=torch_dtype,
+            bnb_8bit_use_double_quant=True,
         )
+        logger.info("8-bit quantization will be used.")
     except ImportError:
+        quantization_config = None
+        logger.warning("bitsandbytes not found. Model will be loaded without quantization.")
     model_name = "seyonec/PubChem10M_SMILES_BPE_450k"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model_kwargs = {"torch_dtype": torch_dtype}
+    if quantization_config and torch.cuda.is_available():
         model_kwargs["quantization_config"] = quantization_config
         model_kwargs["device_map"] = "auto"
+    model = AutoModelForMaskedLM.from_pretrained(model_name, **model_kwargs)
+    pipe = pipeline(
+        'fill-mask',
+        model=model,
+        tokenizer=tokenizer,
+        device=0 if device == "cuda" else -1
+    )
+    logger.info("ChemBERTa model loaded successfully.")
+    return pipe, tokenizer
+fill_mask_pipeline, tokenizer = load_optimized_models()
+# --- Core Functions ---
 def get_mol(smiles):
+    """Converts SMILES to RDKit Mol object."""
     mol = Chem.MolFromSmiles(smiles)
+    if mol:
+        try:
+            Chem.Kekulize(mol)
+        except:
+            pass
     return mol
 def find_matches_one(mol, submol_smarts):
     matches = mol.GetSubstructMatches(submol)
     return matches
+# --- Visualization Function (Adapted from drug_app) ---
+def visualize_molecule_2d_3d(smiles: str, name: str, substructure_smarts=""):
+    """Generates a side-by-side 2D SVG and 3D py3Dmol HTML view for a single molecule."""
+    log = ""
+    try:
+        mol = get_mol(smiles)
+        if not mol:
+            return f"<p>Invalid SMILES for {name}</p>", f"❌ Invalid SMILES for {name}"
+        # --- 2D Visualization ---
+        drawer = Draw.rdMolDraw2D.MolDraw2DSVG(450, 350)
+        opts = drawer.drawOptions()
+        opts.clearBackground = False
+        opts.addStereoAnnotation = True
+        opts.baseFontSize = 0.9
+        # Highlighting
+        atom_indices_to_highlight = []
+        if substructure_smarts:
+            matches = find_matches_one(mol, substructure_smarts)
+            if matches:
+                atom_indices_to_highlight = list(matches[0]) # Highlight first match
+        # Dark theme colors for 2D drawing
+        opts.backgroundColour = (0.109, 0.109, 0.109) # rgb(28,28,28)
+        opts.symbolColour = (1, 1, 1)
+        opts.setAtomPalette({
+            -1: (1, 1, 1), # Default
+             6: (0.9, 0.9, 0.9), # Carbon
+             7: (0.5, 0.5, 1),   # Nitrogen
+             8: (1, 0.2, 0.2),   # Oxygen
+            16: (1, 0.8, 0.2),   # Sulfur
+        })
+        drawer.DrawMolecule(mol, highlightAtoms=atom_indices_to_highlight)
+        drawer.FinishDrawing()
+        svg_2d = drawer.GetDrawingText()
+        # Fix colors for dark theme
+        svg_2d = svg_2d.replace('stroke="black"', 'stroke="white"')
+        svg_2d = svg_2d.replace('fill="black"', 'fill="white"')
+        svg_2d = re.sub(r'fill:#(000000|000);', 'fill:white;', svg_2d)
+        # --- 3D Visualization ---
+        mol_3d = Chem.AddHs(mol)
+        AllChem.EmbedMolecule(mol_3d, randomSeed=42)
         try:
+            AllChem.MMFFOptimizeMolecule(mol_3d)
+        except:
+            AllChem.ETKDGv3().Embed(mol_3d)
+        sdf_data = Chem.MolToMolBlock(mol_3d)
+        viewer = py3Dmol.view(width=450, height=350)
+        viewer.setBackgroundColor('#1C1C1C')
+        viewer.addModel(sdf_data, "sdf")
+        viewer.setStyle({'stick': {}, 'sphere': {'scale': 0.25}})
+        viewer.zoomTo()
+        html_3d = viewer._make_html()
+        # --- Combine Views ---
+        combined_html = f"""
+        <div style="display: flex; flex-direction: row; align-items: center; justify-content: space-around; border: 1px solid #444; border-radius: 10px; padding: 10px; margin-bottom: 20px; background-color: #2b2b2b;">
+            <div style="text-align: center;">
+                <h4 style="color: white; font-family: 'Roboto', sans-serif;">{name} (2D Structure)</h4>
+                <div style="background-color: #1C1C1C; padding: 10px; border-radius: 5px;">{svg_2d}</div>
+            </div>
+            <div style="text-align: center;">
+                <h4 style="color: white; font-family: 'Roboto', sans-serif;">{name} (3D Interactive)</h4>
+                {html_3d}
+            </div>
+        </div>
+        """
+        log += f"✅ Generated 2D/3D view for {name}.\n"
+        return combined_html, log
+    except Exception as e:
+        return f"<p>Error visualizing {name}: {e}</p>", f"❌ Error visualizing {name}: {e}"
+# --- Main Application Logic ---
+def predict_and_generate_visualizations(smiles_mask, substructure_smarts):
+    """Predicts masked SMILES and returns a dataframe and HTML for visualizations."""
+    if tokenizer.mask_token not in smiles_mask:
+        st.error(f"Error: Input SMILES must contain a mask token (e.g., `{tokenizer.mask_token}`).")
+        return pd.DataFrame(), "", "Input error."
+    status_log = ""
     try:
         with torch.no_grad():
+            predictions = fill_mask_pipeline(smiles_mask, top_k=15)
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
     except Exception as e:
+        st.error(f"An error occurred during model prediction: {e}")
+        return pd.DataFrame(), "", "Prediction error."
     results_data = []
+    combined_html = ""
     valid_predictions_count = 0
+    for i, pred in enumerate(predictions):
         if valid_predictions_count >= 5:
             break
         predicted_smiles = pred['sequence']
         score = pred['score']
         mol = get_mol(predicted_smiles)
+        if mol:
             valid_predictions_count += 1
+            results_data.append({
+                "Rank": valid_predictions_count,
+                "Predicted SMILES": predicted_smiles,
+                "Score": f"{score:.4f}"
+            })
+            html_view, log = visualize_molecule_2d_3d(
+                predicted_smiles,
+                f"Prediction #{valid_predictions_count}",
+                substructure_smarts
+            )
+            combined_html += html_view
+            status_log += log
     df_results = pd.DataFrame(results_data)
+    status_log += f"\nFound {valid_predictions_count} valid molecules from top predictions."
+    return df_results, combined_html, status_log
+# --- Streamlit Interface ---
+st.title("🧪 ChemBERTa SMILES Utilities")
+# Introductory help text rendered under the title, above the tabs.
+st.markdown("""
+Enter a SMILES string with a `<mask>` token (e.g., `C1=CC=CC<mask>C1`) to predict possible completions.
+The model will generate the most likely atoms or fragments to fill the mask.
+""")
+# Two tabs: masked-token prediction (tab1) and a single-molecule viewer (tab2).
+tab1, tab2 = st.tabs(["Masked SMILES Prediction", "Molecule Viewer"])
+with tab1:
+    st.header("Masked SMILES Prediction")
+    with st.form("prediction_form"):
+        col1, col2 = st.columns(2)
+        with col1:
+            smiles_input_masked = st.text_input(
+                "SMILES String with Mask",
+                value=f"C1=CC=CC{tokenizer.mask_token}C1",
+                help=f"Use `{tokenizer.mask_token}` as the mask token."
+            )
+        with col2:
+            substructure_input = st.text_input(
+                "Substructure to Highlight (SMARTS)",
+                value="C=C",
+                help="Enter a SMARTS pattern to highlight in the 2D view."
+            )
+        submit_button = st.form_submit_button("🚀 Predict and Visualize", use_container_width=True)
+    if 'results_df' not in st.session_state or submit_button:
+        if submit_button or 'results_df' not in st.session_state:
+            with st.spinner("Running predictions... This may take a moment."):
+                df, html, log = predict_and_generate_visualizations(smiles_input_masked, substructure_input)
+                st.session_state.results_df = df
+                st.session_state.results_html = html
+                st.session_state.status_log = log
+    st.subheader("Top Predictions & Scores")
+    if 'results_df' in st.session_state and not st.session_state.results_df.empty:
+        st.dataframe(st.session_state.results_df, use_container_width=True, hide_index=True)
+        st.subheader("Predicted Molecule Visualizations (Top 5 Valid)")
+        st.components.v1.html(st.session_state.results_html, height=1850, scrolling=True)
+    else:
+        st.info("No valid predictions to display. Try a different input.")
+    with st.expander("Show Logs"):
+        if 'status_log' in st.session_state:
+            st.text_area("", st.session_state.status_log, height=200, key="log_area_pred")
+with tab2:
+    st.header("Molecule Viewer")
+    st.markdown("Enter a single SMILES string to display its 2D and 3D structure.")
+    with st.form("viewer_form"):
+        smiles_input_viewer = st.text_input("SMILES String", value="CC(=O)Oc1ccccc1C(=O)O") # Aspirin
+        viewer_submit = st.form_submit_button("👁️ View Molecule", use_container_width=True)
+    if viewer_submit:
+        with st.spinner("Generating visualization..."):
+            html_view, log = visualize_molecule_2d_3d(smiles_input_viewer, "Molecule")
+            st.session_state.viewer_html = html_view
+            st.session_state.viewer_log = log
+    if 'viewer_html' in st.session_state:
+        st.components.v1.html(st.session_state.viewer_html, height=450)
+    with st.expander("Show Logs"):
+        if 'viewer_log' in st.session_state:
+            st.text_area("", st.session_state.viewer_log, height=100, key="log_area_view")