Spaces:

alidenewade
/

mol-lang-lab

Sleeping

App Files Files Community

alidenewade committed on May 28

Commit

1850745

verified ·

1 Parent(s): d81a373

Create app.py

Browse files

Files changed (1) hide show

app.py +234 -0

app.py ADDED Viewed

	@@ -0,0 +1,234 @@

+# app.py
+import gradio as gr
+import torch
+from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline, RobertaModel, RobertaTokenizer
+from rdkit import Chem
+from rdkit.Chem import Draw, rdFMCS
+from rdkit.Chem.Draw import MolToImage
+from PIL importImage
+import pandas as pd
+from bertviz import head_view
+from IPython.core.display import HTML
+import io
+import base64
+# --- Model and Tokenizer Loading ---
+# Masked LM Model
+fill_mask_model_name = "seyonec/PubChem10M_SMILES_BPE_450k"
+fill_mask_tokenizer = AutoTokenizer.from_pretrained(fill_mask_model_name)
+fill_mask_model = AutoModelForMaskedLM.from_pretrained(fill_mask_model_name)
+fill_mask_pipeline = pipeline('fill-mask', model=fill_mask_model, tokenizer=fill_mask_tokenizer)
+# Roberta Model for Attention
+attention_model_name = 'seyonec/PubChem10M_SMILES_BPE_450k' # Can be same or different as needed
+attention_model = RobertaModel.from_pretrained(attention_model_name, output_attentions=True)
+attention_tokenizer = RobertaTokenizer.from_pretrained(attention_model_name)
+# --- Helper Functions from Notebook (adapted) ---
+def get_mol(smiles):
+    """Converts SMILES to RDKit Mol object and Kekulizes it."""
+    mol = Chem.MolFromSmiles(smiles)
+    if mol is None:
+        return None
+    try:
+        Chem.Kekulize(mol)
+    except: # Kekulization can fail for some structures
+        pass
+    return mol
+def find_matches_one(mol, submol_smarts):
+    """Finds all matching atoms for a SMARTS pattern in a molecule."""
+    if not mol or not submol_smarts:
+        return []
+    submol = Chem.MolFromSmarts(submol_smarts)
+    if not submol:
+        return []
+    matches = mol.GetSubstructMatches(submol)
+    return matches
+def get_image_with_highlight(mol, atomset=None, size=(300, 300)):
+    """Draws molecule with optional atom highlighting."""
+    if mol is None:
+        return None
+    highlight_color = (0, 1, 0, 0.5) # Green with some transparency
+    img = MolToImage(mol, size=size, fitImage=True,
+                     highlightAtoms=atomset if atomset else [],
+                     highlightAtomColors={i: highlight_color for i in atomset} if atomset else {})
+    return img
+# --- Gradio Interface Functions ---
+def predict_and_visualize_masked_smiles(smiles_mask, substructure_smarts_highlight="CC=CC"):
+    """
+    Predicts masked tokens in a SMILES string, shows scores, and visualizes molecules.
+    """
+    if fill_mask_tokenizer.mask_token not in smiles_mask:
+        return pd.DataFrame(), [None]*5, "Error: Input SMILES must contain a mask token (e.g., <mask>)."
+    try:
+        predictions = fill_mask_pipeline(smiles_mask, top_k=10) # Get more to filter for valid ones
+    except Exception as e:
+        return pd.DataFrame(), [None]*5, f"Error during prediction: {str(e)}"
+    results_data = []
+    image_list = []
+    valid_predictions_count = 0
+    for pred in predictions:
+        if valid_predictions_count >= 5:
+            break
+        predicted_smiles = pred['sequence']
+        score = pred['score']
+        mol = get_mol(predicted_smiles)
+        if mol:
+            results_data.append({"Predicted SMILES": predicted_smiles, "Score": f"{score:.4f}"})
+            atom_matches = []
+            if substructure_smarts_highlight:
+                matches = find_matches_one(mol, substructure_smarts_highlight)
+                if matches:
+                    atom_matches = list(matches[0]) # Highlight first match
+            img = get_image_with_highlight(mol, atomset=atom_matches)
+            image_list.append(img)
+            valid_predictions_count += 1
+    # Pad image_list if fewer than 5 valid predictions
+    while len(image_list) < 5:
+        image_list.append(None)
+    df_results = pd.DataFrame(results_data)
+    return df_results, image_list, "Prediction successful." if valid_predictions_count > 0 else "No valid molecules found for top predictions."
+def visualize_attention_bertviz(sentence_a, sentence_b):
+    """
+    Generates and displays BertViz attention head view as HTML.
+    """
+    if not sentence_a or not sentence_b:
+        return "Please provide two SMILES strings."
+    try:
+        inputs = attention_tokenizer.encode_plus(sentence_a, sentence_b, return_tensors='pt', add_special_tokens=True)
+        input_ids = inputs['input_ids']
+        # Ensure model is in eval mode and no_grad for inference
+        attention_model.eval()
+        with torch.no_grad():
+            attention_outputs = attention_model(input_ids)
+        attention = attention_outputs[-1] # Last item in the tuple is attentions
+        input_id_list = input_ids[0].tolist()
+        tokens = attention_tokenizer.convert_ids_to_tokens(input_id_list)
+        html_object = head_view(attention, tokens, display_mode="light") # Use light mode for better Gradio compatibility
+        # Extract HTML string from the IPython.core.display.HTML object
+        html_string = html_object.data
+        # Embed JavaScript directly if needed, or ensure Gradio's HTML component handles it.
+        # BertViz often requires D3.js and jQuery. Gradio's HTML component might not execute all JS.
+        # For robustness, it's better if head_view produces self-contained HTML or if Gradio supports JS execution.
+        # A common workaround is to serve the HTML and use an iframe, or save to file and link.
+        # Here, we'll return the raw HTML string and let Gradio's gr.HTML handle it.
+        # Add D3 and jQuery CDN links to the HTML string for better rendering in Gradio
+        # This is a common workaround if Gradio's HTML component doesn't include these by default
+        # Note: This might still have limitations depending on Gradio's sandboxing.
+        html_with_deps = f"""
+        <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.0/jquery.min.js"></script>
+        <script src="https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.8/d3.min.js"></script>
+        {html_string}
+        """
+        return html_with_deps
+    except Exception as e:
+        return f"Error generating attention visualization: {str(e)}"
+def display_molecule_image(smiles_string):
+    """
+    Displays a 2D image of a molecule from its SMILES string.
+    """
+    if not smiles_string:
+        return None, "Please enter a SMILES string."
+    mol = get_mol(smiles_string)
+    if mol is None:
+        return None, "Invalid SMILES string."
+    img = MolToImage(mol, size=(400, 400), fitImage=True)
+    return img, "Molecule displayed."
+# --- Gradio Interface Definition ---
+with gr.Blocks(theme=gr.themes.Default()) as demo:
+    gr.Markdown("# ChemBERTa SMILES Utilities Dashboard")
+    with gr.Tab("Masked SMILES Prediction"):
+        gr.Markdown("Enter a SMILES string with a `<mask>` token (e.g., `C1=CC=CC<mask>C1`) to predict possible completions.")
+        with gr.Row():
+            smiles_input_masked = gr.Textbox(label="SMILES String with Mask", value="C1=CC=CC<mask>C1")
+            substructure_input = gr.Textbox(label="Substructure to Highlight (SMARTS)", value="C=C")
+        predict_button_masked = gr.Button("Predict and Visualize")
+        status_masked = gr.Textbox(label="Status", interactive=False)
+        predictions_table = gr.DataFrame(label="Top Predictions & Scores")
+        gr.Markdown("### Predicted Molecule Visualizations (Top 5 Valid)")
+        with gr.Row():
+            img_out_1 = gr.Image(label="Prediction 1", type="pil", interactive=False)
+            img_out_2 = gr.Image(label="Prediction 2", type="pil", interactive=False)
+            img_out_3 = gr.Image(label="Prediction 3", type="pil", interactive=False)
+            img_out_4 = gr.Image(label="Prediction 4", type="pil", interactive=False)
+            img_out_5 = gr.Image(label="Prediction 5", type="pil", interactive=False)
+        # Automatically populate on load for the default example
+        demo.load(
+            lambda: predict_and_visualize_masked_smiles("C1=CC=CC<mask>C1", "C=C"),
+            inputs=None,
+            outputs=[predictions_table, img_out_1, img_out_2, img_out_3, img_out_4, img_out_5, status_masked]
+        )
+        predict_button_masked.click(
+            predict_and_visualize_masked_smiles,
+            inputs=[smiles_input_masked, substructure_input],
+            outputs=[predictions_table, img_out_1, img_out_2, img_out_3, img_out_4, img_out_5, status_masked]
+        )
+    with gr.Tab("Attention Visualization"):
+        gr.Markdown("Enter two SMILES strings to visualize attention between them using BertViz. This may take a moment to render.")
+        with gr.Row():
+            smiles_a_input_attn = gr.Textbox(label="SMILES String A", value="CCCCC[C@@H](Br)CC")
+            smiles_b_input_attn = gr.Textbox(label="SMILES String B", value="CCCCC[C@H](Br)CC")
+        visualize_button_attn = gr.Button("Visualize Attention")
+        attention_html_output = gr.HTML(label="Attention Head View")
+        # Automatically populate on load for the default example
+        demo.load(
+            lambda: visualize_attention_bertviz("CCCCC[C@@H](Br)CC", "CCCCC[C@H](Br)CC"),
+            inputs=None,
+            outputs=[attention_html_output]
+        )
+        visualize_button_attn.click(
+            visualize_attention_bertviz,
+            inputs=[smiles_a_input_attn, smiles_b_input_attn],
+            outputs=[attention_html_output]
+        )
+    with gr.Tab("Molecule Viewer"):
+        gr.Markdown("Enter a SMILES string to display its 2D structure.")
+        smiles_input_viewer = gr.Textbox(label="SMILES String", value="C1=CC=CC=C1")
+        view_button_molecule = gr.Button("View Molecule")
+        status_viewer = gr.Textbox(label="Status", interactive=False)
+        molecule_image_output = gr.Image(label="Molecule Structure", type="pil", interactive=False)
+        # Automatically populate on load for the default example
+        demo.load(
+            lambda: display_molecule_image("C1=CC=CC=C1"),
+            inputs=None,
+            outputs=[molecule_image_output, status_viewer]
+        )
+        view_button_molecule.click(
+            display_molecule_image,
+            inputs=[smiles_input_viewer],
+            outputs=[molecule_image_output, status_viewer]
+        )
+if __name__ == "__main__":
+    demo.launch()