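"""Gradio demo for molecule optimization with MolGen-large-opt.

Given a molecule as a SELFIES string, the app samples candidates from the
model and returns those that improve penalized logP while keeping a
Tanimoto similarity above 0.4 to the input.
"""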
import os
import sys

import gradio as gr
import pandas as pd
import selfies as sf
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors, RDConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# sascorer ships in RDKit's contrib directory, not in the package itself,
# so its folder has to be added to the import path first.
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
import sascorer


def get_largest_ring_size(mol):
    """Return the atom count of the largest ring in `mol`, or 0 if acyclic."""
    cycle_list = mol.GetRingInfo().AtomRings()
    if cycle_list:
        return max(len(ring) for ring in cycle_list)
    return 0
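
# Illustrative example: cyclooctane, Chem.MolFromSmiles('C1CCCCCCC1'), has a
# single 8-membered ring, so get_largest_ring_size returns 8; acyclic
# molecules return 0.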


def plogp(smile):
    """Penalized logP: logP minus SA score minus a penalty for large rings.

    Returns -100 as a sentinel for empty or unparseable SMILES so failed
    decodes can never pass the improvement filter.
    """
    if not smile:
        return -100
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        return -100
    log_p = Descriptors.MolLogP(mol)
    sas_score = sascorer.calculateScore(mol)
    # Only rings with more than 6 atoms are penalized.
    cycle_score = max(get_largest_ring_size(mol) - 6, 0)
    return log_p - sas_score - cycle_score
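
# The quantity computed above is the standard "penalized logP":
#     plogp(m) = MolLogP(m) - SA(m) - max(largest_ring(m) - 6, 0)
# so acyclic molecules pay no ring penalty and score logP minus SA.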


def sf_decode(selfies):
    """Decode a SELFIES string to SMILES, returning '' if decoding fails."""
    try:
        return sf.decoder(selfies)
    except sf.DecoderError:
        return ''
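
# For instance, sf.decoder('[C][C][O]') yields 'CCO'; a malformed SELFIES
# string raises selfies.DecoderError, which this wrapper maps to ''.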


def sim(input_smile, output_smile):
    """Tanimoto similarity between Morgan fingerprints (radius 2) of two SMILES."""
    if not (input_smile and output_smile):
        return None
    input_mol = Chem.MolFromSmiles(input_smile)
    output_mol = Chem.MolFromSmiles(output_smile)
    if not (input_mol and output_mol):
        return None
    input_fp = AllChem.GetMorganFingerprint(input_mol, 2)
    output_fp = AllChem.GetMorganFingerprint(output_mol, 2)
    return DataStructs.TanimotoSimilarity(input_fp, output_fp)
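
# Tanimoto similarity ranges from 0 (no shared fingerprint features) to 1
# (identical fingerprints); the 0.4 threshold applied later keeps candidates
# structurally close to the input.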


# Load the tokenizer and model once at startup instead of on every request.
tokenizer = AutoTokenizer.from_pretrained("zjunlp/MolGen-large-opt")
model = AutoModelForSeq2SeqLM.from_pretrained("zjunlp/MolGen-large-opt")


def generate(selfies_input):
    """Sample candidate molecules for a SELFIES input and filter them.

    Keeps candidates whose penalized logP improves on the input while the
    Tanimoto similarity to the input stays above 0.4.
    """
    sf_input = tokenizer(selfies_input, return_tensors="pt")

    # Sample 10 candidate SELFIES strings with top-k sampling.
    molecules = model.generate(
        input_ids=sf_input["input_ids"],
        attention_mask=sf_input["attention_mask"],
        do_sample=True,
        max_length=100,
        min_length=5,
        top_k=30,
        top_p=1,
        num_return_sequences=10,
    )
    sf_output = [
        tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True).replace(" ", "")
        for g in molecules
    ]
    sf_output = list(set(sf_output))  # deduplicate the samples

    # Convert the input and the candidates from SELFIES to SMILES for scoring.
    input_sm = sf_decode(selfies_input)
    sm_output = [sf_decode(s) for s in sf_output]

    # Score each candidate: penalized-logP improvement over the input and
    # Tanimoto similarity to the input.
    input_plogp = plogp(input_sm)
    plogp_improve = [plogp(s) - input_plogp for s in sm_output]
    simm = [sim(input_sm, s) for s in sm_output]

    data = pd.DataFrame({"candidates": sf_output, "improvement": plogp_improve, "sim": simm})

    # Keep only candidates that improve plogp and stay similar to the input.
    return data[(data["improvement"] > 0) & (data["sim"] > 0.4)]


examples = [
    ['[C][C][=Branch1][C][=O][N][C][C][O][C][C][O][C][C][O][C][C][Ring1][N]'],
    ['[C][C][S][C][C][S][C][C][C][S][C][C][S][C][Ring1][=C]'],
]

iface = gr.Interface(
    fn=generate,
    inputs="text",
    outputs="dataframe",
    title="Molecular Language Model as Multi-task Generator",
    examples=examples,
)
iface.launch()
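
# By default launch() serves the demo locally at http://127.0.0.1:7860;
# pass share=True to iface.launch() for a temporary public link.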