Spaces:
Running
Running
v2 init
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- Dockerfile +13 -0
- Dockerfile-conda +13 -0
- README.md +4 -4
- app.py +489 -333
- data/lce/test.csv +31 -0
- data/lce/test_data.csv +14 -0
- data/lce/train.csv +121 -0
- data/lce/train_data.csv +148 -0
- models/.DS_Store +0 -0
- models/.gitattributes +3 -0
- models/__pycache__/fm4m.cpython-310.pyc +0 -0
- models/fm4m.py +366 -74
- models/mhg_model/README.md +1 -1
- models/mhg_model/images/mhg_example.png +0 -0
- models/mhg_model/images/mhg_example1.png +0 -0
- models/mhg_model/images/mhg_example2.png +0 -0
- models/mhg_model/load.py +20 -1
- models/mhg_model/paper/MHG-GNN_Combination of Molecular Hypergraph Grammar with Graph Neural Network.pdf +0 -0
- models/selfies_model/selfies-ted.png +0 -0
- models/selfies_ted/README.md +87 -0
- models/selfies_ted/load.py +92 -0
- models/selfies_ted/requirements.txt +12 -0
- models/selfies_ted/selfies-ted-example.ipynb +136 -0
- models/selfies_ted/selfies-ted.png +3 -0
- models/smi_ted/.gitignore +18 -0
- models/smi_ted/README.md +138 -0
- models/smi_ted/finetune/args.py +337 -0
- models/smi_ted/finetune/finetune_classification.py +68 -0
- models/smi_ted/finetune/finetune_classification_multitask.py +101 -0
- models/smi_ted/finetune/finetune_regression.py +70 -0
- models/smi_ted/finetune/moleculenet/bace/test.csv +3 -0
- models/smi_ted/finetune/moleculenet/bace/train.csv +3 -0
- models/smi_ted/finetune/moleculenet/bace/valid.csv +3 -0
- models/smi_ted/finetune/moleculenet/bbbp/test.csv +3 -0
- models/smi_ted/finetune/moleculenet/bbbp/train.csv +3 -0
- models/smi_ted/finetune/moleculenet/bbbp/valid.csv +3 -0
- models/smi_ted/finetune/moleculenet/biodegradability/biodeg_example.csv +3 -0
- models/smi_ted/finetune/moleculenet/biodegradability/biodegradability.csv +3 -0
- models/smi_ted/finetune/moleculenet/biodegradability/test.csv +3 -0
- models/smi_ted/finetune/moleculenet/biodegradability/train.csv +3 -0
- models/smi_ted/finetune/moleculenet/biodegradability/valid.csv +3 -0
- models/smi_ted/finetune/moleculenet/clintox/test.csv +3 -0
- models/smi_ted/finetune/moleculenet/clintox/train.csv +3 -0
- models/smi_ted/finetune/moleculenet/clintox/valid.csv +3 -0
- models/smi_ted/finetune/moleculenet/esol/test.csv +3 -0
- models/smi_ted/finetune/moleculenet/esol/train.csv +3 -0
- models/smi_ted/finetune/moleculenet/esol/valid.csv +3 -0
- models/smi_ted/finetune/moleculenet/freesolv/test.csv +3 -0
- models/smi_ted/finetune/moleculenet/freesolv/train.csv +3 -0
- models/smi_ted/finetune/moleculenet/freesolv/valid.csv +3 -0
Dockerfile
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.9.7
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
COPY requirements.txt .
|
| 5 |
+
RUN pip install -r requirements.txt
|
| 6 |
+
# preload models
|
| 7 |
+
RUN python -c '\
|
| 8 |
+
from transformers import BartForConditionalGeneration, AutoTokenizer;\
|
| 9 |
+
AutoTokenizer.from_pretrained("ibm/materials.selfies-ted");\
|
| 10 |
+
BartForConditionalGeneration.from_pretrained("ibm/materials.selfies-ted")'
|
| 11 |
+
COPY . .
|
| 12 |
+
|
| 13 |
+
CMD ["python", "app.py"]
|
Dockerfile-conda
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM condaforge/miniforge3
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
SHELL ["/bin/bash", "-i", "-c"]
|
| 5 |
+
RUN apt-get update && \
|
| 6 |
+
apt-get install -y build-essential libxrender1 libxext-dev
|
| 7 |
+
RUN conda create --name fm4m python=3.9.7
|
| 8 |
+
RUN conda activate fm4m
|
| 9 |
+
COPY requirements.txt .
|
| 10 |
+
RUN pip install -r requirements.txt
|
| 11 |
+
COPY . .
|
| 12 |
+
|
| 13 |
+
CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
---
|
| 2 |
-
title: Fm4m
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.4.0
|
| 8 |
app_file: app.py
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Fix Fm4m Kit
|
| 3 |
+
emoji: 🐢
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: blue
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.4.0
|
| 8 |
app_file: app.py
|
app.py
CHANGED
|
@@ -1,142 +1,103 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
from huggingface_hub import InferenceClient
|
| 3 |
import matplotlib.pyplot as plt
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
from rdkit.Chem.Crippen import MolLogP
|
| 7 |
import pandas as pd
|
| 8 |
-
|
| 9 |
-
from rdkit.Chem import DataStructs, AllChem
|
| 10 |
-
from transformers import BartForConditionalGeneration, AutoTokenizer, AutoModel
|
| 11 |
-
from transformers.modeling_outputs import BaseModelOutput
|
| 12 |
import selfies as sf
|
| 13 |
-
from rdkit import Chem
|
| 14 |
import torch
|
| 15 |
-
import numpy as np
|
| 16 |
-
import umap
|
| 17 |
-
import pickle
|
| 18 |
import xgboost as xgb
|
| 19 |
-
from
|
| 20 |
-
from
|
|
|
|
|
|
|
|
|
|
| 21 |
from sklearn.kernel_ridge import KernelRidge
|
| 22 |
-
import
|
| 23 |
-
|
| 24 |
-
import
|
|
|
|
| 25 |
|
| 26 |
os.environ["OMP_MAX_ACTIVE_LEVELS"] = "1"
|
| 27 |
|
| 28 |
-
# my_theme = gr.Theme.from_hub("ysharma/steampunk")
|
| 29 |
-
# my_theme = gr.themes.Glass()
|
| 30 |
-
|
| 31 |
-
"""
|
| 32 |
-
# カスタムテーマ設定
|
| 33 |
-
theme = gr.themes.Default().set(
|
| 34 |
-
body_background_fill="#000000", # 背景色を黒に設定
|
| 35 |
-
text_color="#FFFFFF", # テキスト色を白に設定
|
| 36 |
-
)
|
| 37 |
-
"""
|
| 38 |
-
"""
|
| 39 |
-
import sys
|
| 40 |
-
sys.path.append("models")
|
| 41 |
-
sys.path.append("../models")
|
| 42 |
-
sys.path.append("../")"""
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
# Get the current file's directory
|
| 46 |
-
base_dir = os.path.dirname(__file__)
|
| 47 |
-
print("Base Dir : ", base_dir)
|
| 48 |
-
|
| 49 |
import models.fm4m as fm4m
|
| 50 |
|
|
|
|
|
|
|
| 51 |
|
| 52 |
# Function to display molecule image from SMILES
|
| 53 |
def smiles_to_image(smiles):
|
| 54 |
mol = Chem.MolFromSmiles(smiles)
|
| 55 |
-
if mol
|
| 56 |
-
img = Draw.MolToImage(mol)
|
| 57 |
-
return img
|
| 58 |
-
return None
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
# Function to get canonical SMILES
|
| 62 |
-
def get_canonical_smiles(smiles):
|
| 63 |
-
mol = Chem.MolFromSmiles(smiles)
|
| 64 |
-
if mol:
|
| 65 |
-
return Chem.MolToSmiles(mol, canonical=True)
|
| 66 |
-
return None
|
| 67 |
|
| 68 |
|
| 69 |
# Dictionary for SMILES strings and corresponding images (you can replace with your actual image paths)
|
| 70 |
smiles_image_mapping = {
|
| 71 |
-
"Mol 1": {
|
|
|
|
|
|
|
|
|
|
| 72 |
# Example SMILES for ethanol
|
| 73 |
-
"Mol 2": {
|
|
|
|
|
|
|
|
|
|
| 74 |
# Example SMILES for butane
|
| 75 |
-
"Mol 3": {
|
| 76 |
-
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
# Example SMILES for diethyl ether
|
| 79 |
-
"Mol 5": {
|
|
|
|
|
|
|
|
|
|
| 80 |
}
|
| 81 |
|
| 82 |
datasets = [" ", "BACE", "ESOL", "Load Custom Dataset"]
|
| 83 |
|
| 84 |
-
models_enabled = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
fusion_available = ["Concat"]
|
| 87 |
|
| 88 |
-
global log_df
|
| 89 |
-
log_df = pd.DataFrame(columns=["Selected Models", "Dataset", "Task", "Result"])
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
def log_selection(models, dataset, task_type, result, log_df):
|
| 93 |
-
# Append the new entry to the DataFrame
|
| 94 |
-
new_entry = {"Selected Models": str(models), "Dataset": dataset, "Task": task_type, "Result": result}
|
| 95 |
-
updated_log_df = log_df.append(new_entry, ignore_index=True)
|
| 96 |
-
return updated_log_df
|
| 97 |
-
|
| 98 |
|
| 99 |
# Function to handle evaluation and logging
|
| 100 |
-
def
|
| 101 |
-
return
|
| 102 |
-
def evaluate_and_log(models, dataset, task_type, eval_output):
|
| 103 |
task_dic = {'Classification': 'CLS', 'Regression': 'RGR'}
|
| 104 |
-
result = f"{eval_output}"
|
| 105 |
result = result.replace(" Score", "")
|
| 106 |
|
| 107 |
-
new_entry = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
new_entry_df = pd.DataFrame([new_entry])
|
| 109 |
|
| 110 |
-
log_df = pd.
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
log_df.to_csv('log.csv')
|
| 114 |
-
|
| 115 |
-
return log_df
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
try:
|
| 119 |
-
log_df = pd.read_csv('log.csv', index_col=0)
|
| 120 |
-
except:
|
| 121 |
-
log_df = pd.DataFrame({"":[],
|
| 122 |
-
'Selected Models': [],
|
| 123 |
-
'Dataset': [],
|
| 124 |
-
'Task': [],
|
| 125 |
-
'Result': []
|
| 126 |
-
})
|
| 127 |
-
csv_file_path = 'log.csv'
|
| 128 |
-
log_df.to_csv(csv_file_path, index=False)
|
| 129 |
|
| 130 |
|
| 131 |
# Load images for selection
|
| 132 |
def load_image(path):
|
| 133 |
try:
|
| 134 |
-
return Image.open(smiles_image_mapping[path]["image"])
|
| 135 |
except:
|
| 136 |
pass
|
| 137 |
|
| 138 |
|
| 139 |
-
|
| 140 |
# Function to handle image selection
|
| 141 |
def handle_image_selection(image_key):
|
| 142 |
smiles = smiles_image_mapping[image_key]["smiles"]
|
|
@@ -160,59 +121,55 @@ def calculate_tanimoto(smiles1, smiles2):
|
|
| 160 |
mol1 = Chem.MolFromSmiles(smiles1)
|
| 161 |
mol2 = Chem.MolFromSmiles(smiles2)
|
| 162 |
if mol1 and mol2:
|
| 163 |
-
# fp1 = FingerprintMols.FingerprintMol(mol1)
|
| 164 |
-
# fp2 = FingerprintMols.FingerprintMol(mol2)
|
| 165 |
fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, 2)
|
| 166 |
fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, 2)
|
| 167 |
return round(DataStructs.FingerprintSimilarity(fp1, fp2), 2)
|
| 168 |
return None
|
| 169 |
|
| 170 |
|
| 171 |
-
#with open("models/selfies_model/bart-2908.pickle", "rb") as input_file:
|
| 172 |
-
# gen_model, gen_tokenizer = pickle.load(input_file)
|
| 173 |
-
|
| 174 |
gen_tokenizer = AutoTokenizer.from_pretrained("ibm/materials.selfies-ted")
|
| 175 |
gen_model = BartForConditionalGeneration.from_pretrained("ibm/materials.selfies-ted")
|
| 176 |
|
| 177 |
|
| 178 |
def generate(latent_vector, mask):
|
| 179 |
encoder_outputs = BaseModelOutput(latent_vector)
|
| 180 |
-
decoder_output = gen_model.generate(
|
| 181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
selfies = gen_tokenizer.batch_decode(decoder_output, skip_special_tokens=True)
|
| 183 |
-
|
| 184 |
-
for i in selfies:
|
| 185 |
-
try:
|
| 186 |
-
print("Generated SELFIES : ", i)
|
| 187 |
-
decoded = sf.decoder(i.replace("] [", "]["))
|
| 188 |
-
print("Generated SMILES : ", decoded)
|
| 189 |
-
outs.append(decoded)
|
| 190 |
-
#except selfies.exceptions.DecoderError:
|
| 191 |
-
# print(f"Error decoding SELFIES string: {i}")
|
| 192 |
-
except:
|
| 193 |
-
pass
|
| 194 |
-
|
| 195 |
-
#outs.append(sf.decoder(i.replace("] [", "][")))
|
| 196 |
-
return outs
|
| 197 |
|
| 198 |
|
| 199 |
def perturb_latent(latent_vecs, noise_scale=0.5):
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
|
| 205 |
def encode(selfies):
|
| 206 |
-
encoding = gen_tokenizer(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
input_ids = encoding['input_ids']
|
| 208 |
attention_mask = encoding['attention_mask']
|
| 209 |
-
outputs = gen_model.model.encoder(
|
|
|
|
|
|
|
| 210 |
model_output = outputs.last_hidden_state
|
| 211 |
-
|
| 212 |
-
"""input_mask_expanded = attention_mask.unsqueeze(-1).expand(model_output.size()).float()
|
| 213 |
-
sum_embeddings = torch.sum(model_output * input_mask_expanded, 1)
|
| 214 |
-
sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
|
| 215 |
-
model_output = sum_embeddings / sum_mask"""
|
| 216 |
return model_output, attention_mask
|
| 217 |
|
| 218 |
|
|
@@ -227,8 +184,13 @@ def generate_canonical(smiles):
|
|
| 227 |
noise = i / 10
|
| 228 |
perturbed_latent = perturb_latent(latent_vec, noise_scale=noise)
|
| 229 |
gen = generate(perturbed_latent, mask)
|
| 230 |
-
|
| 231 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
|
| 233 |
if gen_mol:
|
| 234 |
# Calculate properties for ref and gen molecules
|
|
@@ -240,9 +202,20 @@ def generate_canonical(smiles):
|
|
| 240 |
# Prepare the table with ref mol and gen mol
|
| 241 |
data = {
|
| 242 |
"Property": ["QED", "SA", "LogP", "Mol Wt", "Tanimoto Similarity"],
|
| 243 |
-
"Reference Mol": [
|
| 244 |
-
|
| 245 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
}
|
| 247 |
df = pd.DataFrame(data)
|
| 248 |
|
|
@@ -255,7 +228,7 @@ def generate_canonical(smiles):
|
|
| 255 |
|
| 256 |
|
| 257 |
# Function to display evaluation score
|
| 258 |
-
def display_eval(selected_models, dataset, task_type, downstream, fusion_type):
|
| 259 |
result = None
|
| 260 |
|
| 261 |
try:
|
|
@@ -270,72 +243,87 @@ def display_eval(selected_models, dataset, task_type, downstream, fusion_type):
|
|
| 270 |
downstream_model = downstream_model.rstrip()
|
| 271 |
params = None
|
| 272 |
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
try:
|
| 277 |
if not selected_models:
|
| 278 |
return "Please select at least one enabled model."
|
| 279 |
|
| 280 |
-
if task_type == "Classification":
|
| 281 |
-
global roc_auc, fpr, tpr, x_batch, y_batch
|
| 282 |
-
elif task_type == "Regression":
|
| 283 |
-
global RMSE, y_batch_test, y_prob
|
| 284 |
-
|
| 285 |
if len(selected_models) > 1:
|
| 286 |
if task_type == "Classification":
|
| 287 |
-
#result, roc_auc, fpr, tpr, x_batch, y_batch = fm4m.multi_modal(model_list=selected_models,
|
| 288 |
-
# downstream_model="XGBClassifier",
|
| 289 |
-
# dataset=dataset.lower())
|
| 290 |
if downstream_model == "Default Settings":
|
| 291 |
downstream_model = "DefaultClassifier"
|
| 292 |
params = None
|
| 293 |
-
result, roc_auc, fpr, tpr, x_batch, y_batch = fm4m.multi_modal(model_list=selected_models,
|
| 294 |
-
downstream_model=downstream_model,
|
| 295 |
-
params = params,
|
| 296 |
-
dataset=dataset)
|
| 297 |
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
|
|
|
|
| 303 |
if downstream_model == "Default Settings":
|
| 304 |
downstream_model = "DefaultRegressor"
|
| 305 |
params = None
|
| 306 |
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
|
| 312 |
else:
|
| 313 |
if task_type == "Classification":
|
| 314 |
-
#result, roc_auc, fpr, tpr, x_batch, y_batch = fm4m.single_modal(model=selected_models[0],
|
| 315 |
-
# downstream_model="XGBClassifier",
|
| 316 |
-
# dataset=dataset.lower())
|
| 317 |
if downstream_model == "Default Settings":
|
| 318 |
downstream_model = "DefaultClassifier"
|
| 319 |
params = None
|
| 320 |
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 325 |
|
| 326 |
elif task_type == "Regression":
|
| 327 |
-
#result, RMSE, y_batch_test, y_prob = fm4m.single_modal(model=selected_models[0],
|
| 328 |
-
# downstream_model="XGBRegressor",
|
| 329 |
-
# dataset=dataset.lower())
|
| 330 |
-
|
| 331 |
if downstream_model == "Default Settings":
|
| 332 |
downstream_model = "DefaultRegressor"
|
| 333 |
params = None
|
| 334 |
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
|
| 340 |
if result == None:
|
| 341 |
result = "Data & Model Setting is incorrect"
|
|
@@ -345,23 +333,15 @@ def display_eval(selected_models, dataset, task_type, downstream, fusion_type):
|
|
| 345 |
|
| 346 |
|
| 347 |
# Function to handle plot display
|
| 348 |
-
def display_plot(plot_type):
|
| 349 |
fig, ax = plt.subplots()
|
| 350 |
|
| 351 |
if plot_type == "Latent Space":
|
| 352 |
-
|
| 353 |
ax.set_title("T-SNE Plot")
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
# index_0 = [index for index in range(len(x)) if x[index] == 0]
|
| 358 |
-
# index_1 = [index for index in range(len(x)) if x[index] == 1]
|
| 359 |
-
class_0 = x_batch # features_umap[index_0]
|
| 360 |
-
class_1 = y_batch # features_umap[index_1]
|
| 361 |
-
|
| 362 |
-
"""with open("latent_multi_bace.pkl", "rb") as f:
|
| 363 |
-
class_0, class_1 = pickle.load(f)
|
| 364 |
-
"""
|
| 365 |
plt.scatter(class_1[:, 0], class_1[:, 1], c='red', label='Class 1')
|
| 366 |
plt.scatter(class_0[:, 0], class_0[:, 1], c='blue', label='Class 0')
|
| 367 |
|
|
@@ -370,10 +350,16 @@ def display_plot(plot_type):
|
|
| 370 |
ax.set_title('Dataset Distribution')
|
| 371 |
|
| 372 |
elif plot_type == "ROC-AUC":
|
| 373 |
-
|
| 374 |
ax.set_title("ROC-AUC Curve")
|
| 375 |
try:
|
| 376 |
-
ax.plot(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
|
| 378 |
ax.set_xlim([0.0, 1.0])
|
| 379 |
ax.set_ylim([0.0, 1.05])
|
|
@@ -385,7 +371,11 @@ def display_plot(plot_type):
|
|
| 385 |
ax.legend(loc='lower right')
|
| 386 |
|
| 387 |
elif plot_type == "Parity Plot":
|
| 388 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 389 |
ax.set_title("Parity plot")
|
| 390 |
|
| 391 |
# change format
|
|
@@ -394,7 +384,12 @@ def display_plot(plot_type):
|
|
| 394 |
print(y_prob)
|
| 395 |
y_batch_test = np.array(y_batch_test, dtype=float)
|
| 396 |
y_prob = np.array(y_prob, dtype=float)
|
| 397 |
-
ax.scatter(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 398 |
min_val = min(min(y_batch_test), min(y_prob))
|
| 399 |
max_val = max(max(y_batch_test), max(y_prob))
|
| 400 |
ax.plot([min_val, max_val], [min_val, max_val], 'r-')
|
|
@@ -407,10 +402,6 @@ def display_plot(plot_type):
|
|
| 407 |
print(y_batch_test)
|
| 408 |
print(y_prob)
|
| 409 |
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
ax.set_xlabel('Actual Values')
|
| 415 |
ax.set_ylabel('Predicted Values')
|
| 416 |
|
|
@@ -429,13 +420,25 @@ predefined_datasets = {
|
|
| 429 |
# Function to load a predefined dataset from the local path
|
| 430 |
def load_predefined_dataset(dataset_name):
|
| 431 |
val = predefined_datasets.get(dataset_name)
|
| 432 |
-
try:
|
| 433 |
-
|
|
|
|
|
|
|
| 434 |
|
| 435 |
if file_path:
|
| 436 |
df = pd.read_csv(file_path)
|
| 437 |
-
return
|
| 438 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
|
| 440 |
|
| 441 |
# Function to display the head of the uploaded CSV file
|
|
@@ -443,7 +446,11 @@ def display_csv_head(file):
|
|
| 443 |
if file is not None:
|
| 444 |
# Load the CSV file into a DataFrame
|
| 445 |
df = pd.read_csv(file.name)
|
| 446 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
| 447 |
return pd.DataFrame(), gr.update(choices=[]), gr.update(choices=[])
|
| 448 |
|
| 449 |
|
|
@@ -451,28 +458,54 @@ def display_csv_head(file):
|
|
| 451 |
def handle_dataset_selection(selected_dataset):
|
| 452 |
if selected_dataset == "Custom Dataset":
|
| 453 |
# Show file upload fields for train and test datasets if "Custom Dataset" is selected
|
| 454 |
-
return
|
| 455 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 456 |
else:
|
| 457 |
-
return
|
| 458 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
|
| 460 |
|
| 461 |
# Function to select input and output columns and display a message
|
| 462 |
-
def select_columns(input_column, output_column, train_data, test_data,dataset_name):
|
| 463 |
if input_column and output_column:
|
| 464 |
return f"{train_data.name},{test_data.name},{input_column},{output_column},{dataset_name}"
|
| 465 |
return "Please select both input and output columns."
|
| 466 |
|
| 467 |
-
|
|
|
|
| 468 |
if dataset_selector == "Custom Dataset":
|
| 469 |
return f"{dataset_name}"
|
| 470 |
return f"{dataset_selector}"
|
| 471 |
|
|
|
|
| 472 |
# Function to create model based on user input
|
| 473 |
-
def create_model(
|
|
|
|
|
|
|
| 474 |
if model_name == "XGBClassifier":
|
| 475 |
-
model = xgb.XGBClassifier(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 476 |
elif model_name == "SVR":
|
| 477 |
model = SVR(degree=degree, kernel=kernel)
|
| 478 |
elif model_name == "Kernel Ridge":
|
|
@@ -486,224 +519,339 @@ def create_model(model_name, max_depth=None, n_estimators=None, alpha=None, degr
|
|
| 486 |
return "Model not supported."
|
| 487 |
|
| 488 |
return f"{model_name} * {model.get_params()}"
|
| 489 |
-
def model_selector(model_name):
|
| 490 |
-
# Dynamically return the appropriate hyperparameter components based on the selected model
|
| 491 |
-
if model_name == "XGBClassifier":
|
| 492 |
-
return (
|
| 493 |
-
gr.Slider(1, 10, label="max_depth"),
|
| 494 |
-
gr.Slider(50, 500, label="n_estimators"),
|
| 495 |
-
gr.Slider(0.1, 10.0, step=0.1, label="alpha")
|
| 496 |
-
)
|
| 497 |
-
elif model_name == "SVR":
|
| 498 |
-
return (
|
| 499 |
-
gr.Slider(1, 5, label="degree"),
|
| 500 |
-
gr.Dropdown(["rbf", "poly", "linear"], label="kernel")
|
| 501 |
-
)
|
| 502 |
-
elif model_name == "Kernel Ridge":
|
| 503 |
-
return (
|
| 504 |
-
gr.Slider(0.1, 10.0, step=0.1, label="alpha"),
|
| 505 |
-
gr.Slider(1, 5, label="degree"),
|
| 506 |
-
gr.Dropdown(["rbf", "poly", "linear"], label="kernel")
|
| 507 |
-
)
|
| 508 |
-
elif model_name == "Linear Regression":
|
| 509 |
-
return () # No hyperparameters for Linear Regression
|
| 510 |
-
else:
|
| 511 |
-
return ()
|
| 512 |
-
|
| 513 |
|
| 514 |
|
| 515 |
# Define the Gradio layout
|
| 516 |
-
# with gr.Blocks(theme=my_theme) as demo:
|
| 517 |
with gr.Blocks() as demo:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 518 |
with gr.Row():
|
| 519 |
# Left Column
|
| 520 |
with gr.Column():
|
| 521 |
-
gr.HTML(
|
|
|
|
| 522 |
<div style="background-color: #6A8EAE; color: #FFFFFF; padding: 10px;">
|
| 523 |
<h3 style="color: #FFFFFF; margin: 0;font-size: 20px;"> Data & Model Setting</h3>
|
| 524 |
</div>
|
| 525 |
-
'''
|
| 526 |
-
|
| 527 |
-
#dataset_dropdown = gr.Dropdown(choices=datasets, label="Select Dat")
|
| 528 |
-
|
| 529 |
# Dropdown menu for predefined datasets including "Custom Dataset" option
|
| 530 |
-
dataset_selector = gr.Dropdown(
|
| 531 |
-
|
|
|
|
|
|
|
| 532 |
# Display the message for selected columns
|
| 533 |
-
selected_columns_message = gr.Textbox(
|
|
|
|
|
|
|
| 534 |
|
| 535 |
with gr.Accordion("Dataset Settings", open=True):
|
| 536 |
# File upload options for custom dataset (train and test)
|
| 537 |
dataset_name = gr.Textbox(label="Dataset Name", visible=False)
|
| 538 |
-
train_file = gr.File(
|
| 539 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 540 |
|
| 541 |
-
test_file = gr.File(
|
| 542 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 543 |
|
| 544 |
# Predefined dataset displays
|
| 545 |
-
predefined_display = gr.Dataframe(
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
|
|
|
| 549 |
|
| 550 |
# Dropdowns for selecting input and output columns for the custom dataset
|
| 551 |
-
input_column_selector = gr.Dropdown(
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
|
|
|
|
|
|
| 555 |
|
| 556 |
# When a dataset is selected, show either file upload fields (for custom) or load predefined datasets
|
| 557 |
-
dataset_selector.change(
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 561 |
|
| 562 |
# When a predefined dataset is selected, load its head and update column selectors
|
| 563 |
-
dataset_selector.change(
|
| 564 |
-
|
| 565 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 566 |
|
| 567 |
# When a custom train file is uploaded, display its head and update column selectors
|
| 568 |
-
train_file.change(
|
| 569 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 570 |
|
| 571 |
# When a custom test file is uploaded, display its head
|
| 572 |
-
test_file.change(
|
| 573 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 574 |
|
| 575 |
-
dataset_selector.change(
|
| 576 |
-
|
| 577 |
-
|
|
|
|
|
|
|
| 578 |
|
| 579 |
# Update the selected columns information when dropdown values are changed
|
| 580 |
-
input_column_selector.change(
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 587 |
|
| 588 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 589 |
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
|
| 594 |
-
task_radiobutton = gr.Radio(
|
|
|
|
|
|
|
| 595 |
|
| 596 |
####### adding hyper parameter tuning ###########
|
| 597 |
-
model_name = gr.Dropdown(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 598 |
with gr.Accordion("Downstream Hyperparameter Settings", open=True):
|
| 599 |
# Create placeholders for hyperparameter components
|
| 600 |
-
max_depth = gr.Slider(1, 20, step=1,visible=False, label="max_depth")
|
| 601 |
-
n_estimators = gr.Slider(
|
|
|
|
|
|
|
| 602 |
alpha = gr.Slider(0.1, 10.0, step=0.1, visible=False, label="alpha")
|
| 603 |
-
degree = gr.Slider(1, 20, step=1,visible=False, label="degree")
|
| 604 |
-
kernel = gr.Dropdown(
|
|
|
|
|
|
|
| 605 |
|
| 606 |
# Output textbox
|
| 607 |
output = gr.Textbox(label="Loaded Parameters")
|
| 608 |
|
| 609 |
-
|
| 610 |
# Dynamically show relevant hyperparameters based on selected model
|
| 611 |
def update_hyperparameters(model_name):
|
| 612 |
if model_name == "XGBClassifier":
|
| 613 |
-
return
|
| 614 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 615 |
elif model_name == "SVR":
|
| 616 |
-
return
|
| 617 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 618 |
elif model_name == "Kernel Ridge":
|
| 619 |
-
return
|
| 620 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 621 |
elif model_name == "Linear Regression":
|
| 622 |
-
return
|
| 623 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 624 |
elif model_name == "Default - Auto":
|
| 625 |
-
return
|
| 626 |
-
|
| 627 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 628 |
|
| 629 |
# When model is selected, update which hyperparameters are visible
|
| 630 |
-
model_name.change(
|
| 631 |
-
|
|
|
|
|
|
|
|
|
|
| 632 |
|
| 633 |
# Submit button to create the model with selected hyperparameters
|
| 634 |
submit_button = gr.Button("Create Downstream Model")
|
| 635 |
|
| 636 |
-
|
| 637 |
# Function to handle model creation based on input parameters
|
| 638 |
def on_submit(model_name, max_depth, n_estimators, alpha, degree, kernel):
|
| 639 |
if model_name == "XGBClassifier":
|
| 640 |
-
return create_model(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 641 |
elif model_name == "SVR":
|
| 642 |
return create_model(model_name, degree=degree, kernel=kernel)
|
| 643 |
elif model_name == "Kernel Ridge":
|
| 644 |
-
return create_model(
|
|
|
|
|
|
|
| 645 |
elif model_name == "Linear Regression":
|
| 646 |
return create_model(model_name)
|
| 647 |
elif model_name == "Default - Auto":
|
| 648 |
return create_model(model_name)
|
| 649 |
|
| 650 |
# When the submit button is clicked, run the on_submit function
|
| 651 |
-
submit_button.click(
|
| 652 |
-
|
|
|
|
|
|
|
|
|
|
| 653 |
###### End of hyper param tuning #########
|
| 654 |
|
| 655 |
fusion_radiobutton = gr.Radio(choices=fusion_available, label="Fusion Type")
|
| 656 |
|
| 657 |
-
|
| 658 |
-
|
| 659 |
eval_button = gr.Button("Train downstream model")
|
| 660 |
-
#eval_button.style(css_class="custom-button-left")
|
| 661 |
|
| 662 |
# Middle Column
|
| 663 |
with gr.Column():
|
| 664 |
-
gr.HTML(
|
|
|
|
| 665 |
<div style="background-color: #8F9779; color: #FFFFFF; padding: 10px;">
|
| 666 |
<h3 style="color: #FFFFFF; margin: 0;font-size: 20px;"> Downstream Task 1: Property Prediction</h3>
|
| 667 |
</div>
|
| 668 |
-
'''
|
| 669 |
-
|
| 670 |
eval_output = gr.Textbox(label="Train downstream model")
|
| 671 |
|
| 672 |
-
plot_radio = gr.Radio(
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
|
|
|
| 676 |
|
| 677 |
create_log = gr.Button("Store log")
|
| 678 |
|
| 679 |
-
log_table = gr.Dataframe(
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 687 |
|
| 688 |
# Function to gather selected models
|
| 689 |
def gather_selected_models(*models):
|
| 690 |
selected = [model for model in models if model]
|
| 691 |
return selected
|
| 692 |
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 699 |
# Right Column
|
| 700 |
with gr.Column():
|
| 701 |
-
gr.HTML(
|
|
|
|
| 702 |
<div style="background-color: #D2B48C; color: #FFFFFF; padding: 10px;">
|
| 703 |
<h3 style="color: #FFFFFF; margin: 0;font-size: 20px;"> Downstream Task 2: Molecule Generation</h3>
|
| 704 |
</div>
|
| 705 |
-
'''
|
| 706 |
-
|
| 707 |
smiles_input = gr.Textbox(label="Input SMILES String")
|
| 708 |
image_display = gr.Image(label="Molecule Image", height=250, width=250)
|
| 709 |
# Show images for selection
|
|
@@ -712,24 +860,32 @@ with gr.Blocks() as demo:
|
|
| 712 |
choices=list(smiles_image_mapping.keys()),
|
| 713 |
label="Select from sample molecules",
|
| 714 |
value=None,
|
| 715 |
-
#item_images=[load_image(smiles_image_mapping[key]["image"]) for key in smiles_image_mapping.keys()]
|
| 716 |
)
|
| 717 |
image_selector.change(load_image, image_selector, image_display)
|
| 718 |
generate_button = gr.Button("Generate")
|
| 719 |
-
gen_image_display = gr.Image(
|
|
|
|
|
|
|
| 720 |
generated_output = gr.Textbox(label="Generated Output")
|
| 721 |
property_table = gr.Dataframe(label="Molecular Properties Comparison")
|
| 722 |
|
| 723 |
-
|
| 724 |
-
|
| 725 |
# Handle image selection
|
| 726 |
-
image_selector.change(
|
| 727 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 728 |
|
| 729 |
# Generate button to display canonical SMILES and molecule image
|
| 730 |
-
generate_button.click(
|
| 731 |
-
|
|
|
|
|
|
|
|
|
|
| 732 |
|
| 733 |
|
| 734 |
if __name__ == "__main__":
|
| 735 |
-
demo.launch(
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
| 2 |
import matplotlib.pyplot as plt
|
| 3 |
+
import numpy as np
|
| 4 |
+
import os
|
|
|
|
| 5 |
import pandas as pd
|
| 6 |
+
import re
|
|
|
|
|
|
|
|
|
|
| 7 |
import selfies as sf
|
|
|
|
| 8 |
import torch
|
|
|
|
|
|
|
|
|
|
| 9 |
import xgboost as xgb
|
| 10 |
+
from PIL import Image
|
| 11 |
+
from rdkit import Chem, RDLogger
|
| 12 |
+
from rdkit.Chem import DataStructs, AllChem, Descriptors, QED, Draw
|
| 13 |
+
from rdkit.Chem.Crippen import MolLogP
|
| 14 |
+
from rdkit.Contrib.SA_Score import sascorer
|
| 15 |
from sklearn.kernel_ridge import KernelRidge
|
| 16 |
+
from sklearn.linear_model import LinearRegression
|
| 17 |
+
from sklearn.svm import SVR
|
| 18 |
+
from transformers import BartForConditionalGeneration, AutoTokenizer
|
| 19 |
+
from transformers.modeling_outputs import BaseModelOutput
|
| 20 |
|
| 21 |
os.environ["OMP_MAX_ACTIVE_LEVELS"] = "1"
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
import models.fm4m as fm4m
|
| 24 |
|
| 25 |
+
RDLogger.logger().setLevel(RDLogger.ERROR)
|
| 26 |
+
|
| 27 |
|
| 28 |
# Function to display molecule image from SMILES
|
| 29 |
def smiles_to_image(smiles):
|
| 30 |
mol = Chem.MolFromSmiles(smiles)
|
| 31 |
+
return Draw.MolToImage(mol) if mol else None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
|
| 34 |
# Dictionary for SMILES strings and corresponding images (you can replace with your actual image paths)
|
| 35 |
smiles_image_mapping = {
|
| 36 |
+
"Mol 1": {
|
| 37 |
+
"smiles": "C=C(C)CC(=O)NC[C@H](CO)NC(=O)C=Cc1ccc(C)c(Cl)c1",
|
| 38 |
+
"image": "img/img1.png",
|
| 39 |
+
},
|
| 40 |
# Example SMILES for ethanol
|
| 41 |
+
"Mol 2": {
|
| 42 |
+
"smiles": "C=CC1(CC(=O)NC[C@@H](CCCC)NC(=O)c2cc(Cl)cc(Br)c2)CC1",
|
| 43 |
+
"image": "img/img2.png",
|
| 44 |
+
},
|
| 45 |
# Example SMILES for butane
|
| 46 |
+
"Mol 3": {
|
| 47 |
+
"smiles": "C=C(C)C[C@H](NC(C)=O)C(=O)N1CC[C@H](NC(=O)[C@H]2C[C@@]2(C)Br)C(C)(C)C1",
|
| 48 |
+
"image": "img/img3.png",
|
| 49 |
+
}, # Example SMILES for ethylamine
|
| 50 |
+
"Mol 4": {
|
| 51 |
+
"smiles": "C=C1CC(CC(=O)N[C@H]2CCN(C(=O)c3ncccc3SC)C23CC3)C1",
|
| 52 |
+
"image": "img/img4.png",
|
| 53 |
+
},
|
| 54 |
# Example SMILES for diethyl ether
|
| 55 |
+
"Mol 5": {
|
| 56 |
+
"smiles": "C=CCS[C@@H](C)CC(=O)OCC",
|
| 57 |
+
"image": "img/img5.png",
|
| 58 |
+
}, # Example SMILES for chloroethane
|
| 59 |
}
|
| 60 |
|
| 61 |
datasets = [" ", "BACE", "ESOL", "Load Custom Dataset"]
|
| 62 |
|
| 63 |
+
models_enabled = [
|
| 64 |
+
"SELFIES-TED",
|
| 65 |
+
"MHG-GED",
|
| 66 |
+
"MolFormer",
|
| 67 |
+
"SMI-TED",
|
| 68 |
+
"Mordred",
|
| 69 |
+
"MorganFingerprint",
|
| 70 |
+
]
|
| 71 |
|
| 72 |
fusion_available = ["Concat"]
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
# Function to handle evaluation and logging
|
| 76 |
+
def evaluate_and_log(models, dataset, task_type, eval_output, state):
    """Prepend the latest evaluation result to the session log table.

    Args:
        models: Selected model names; stored as ``str(models)``.
        dataset: Dataset label to record.
        task_type: "Classification" or "Regression"; any other value
            (including None when nothing is selected) is logged with an
            empty task code.
        eval_output: Text of the evaluation result; every " Score"
            substring is removed before logging.
        state: Per-session dict holding the log under ``"log_df"``; it is
            updated in place.

    Returns:
        The updated log DataFrame, newest entry first.
    """
    task_dic = {'Classification': 'CLS', 'Regression': 'RGR'}
    result = f"{eval_output}".replace(" Score", "")

    new_entry = {
        "Selected Models": str(models),
        "Dataset": dataset,
        # An unset/unknown task must not abort logging with a KeyError.
        "Task": task_dic.get(task_type, ""),
        "Result": result,
    }
    new_entry_df = pd.DataFrame([new_entry])

    # ignore_index renumbers the rows; without it every entry keeps index
    # label 0 and the log ends up with duplicate index labels.
    state["log_df"] = pd.concat(
        [new_entry_df, state["log_df"]], ignore_index=True
    )
    return state["log_df"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
|
| 93 |
# Load images for selection
|
| 94 |
def load_image(path):
    """Open the sample image registered for a ``smiles_image_mapping`` key.

    Args:
        path: Despite the name, this is a key of ``smiles_image_mapping``
            (it is used to index the mapping), not a filesystem path.

    Returns:
        The object returned by ``Image.open`` for the entry's "image" file,
        or None when the key is missing/unset or the file cannot be opened.
    """
    try:
        return Image.open(smiles_image_mapping[path]["image"])
    except (KeyError, TypeError, OSError):
        # Best effort: an unknown or unhashable key, or a missing/unreadable
        # image file, simply yields no image. Other errors are no longer
        # silently swallowed.
        return None
|
| 99 |
|
| 100 |
|
|
|
|
| 101 |
# Function to handle image selection
|
| 102 |
def handle_image_selection(image_key):
|
| 103 |
smiles = smiles_image_mapping[image_key]["smiles"]
|
|
|
|
| 121 |
mol1 = Chem.MolFromSmiles(smiles1)
|
| 122 |
mol2 = Chem.MolFromSmiles(smiles2)
|
| 123 |
if mol1 and mol2:
|
|
|
|
|
|
|
| 124 |
fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, 2)
|
| 125 |
fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, 2)
|
| 126 |
return round(DataStructs.FingerprintSimilarity(fp1, fp2), 2)
|
| 127 |
return None
|
| 128 |
|
| 129 |
|
|
|
|
|
|
|
|
|
|
| 130 |
gen_tokenizer = AutoTokenizer.from_pretrained("ibm/materials.selfies-ted")
|
| 131 |
gen_model = BartForConditionalGeneration.from_pretrained("ibm/materials.selfies-ted")
|
| 132 |
|
| 133 |
|
| 134 |
def generate(latent_vector, mask):
    """Sample sequences from ``gen_model`` for a latent state and decode them.

    Args:
        latent_vector: Encoder hidden state; wrapped in ``BaseModelOutput``
            and passed to ``gen_model.generate`` as ``encoder_outputs``.
        mask: Attention mask forwarded to ``gen_model.generate``.

    Returns:
        A list with one ``sf.decoder`` result per generated sequence.
    """
    sampling_options = {
        "max_new_tokens": 64,
        "do_sample": True,
        "top_k": 5,
        "top_p": 0.95,
        "num_return_sequences": 1,
    }
    token_ids = gen_model.generate(
        encoder_outputs=BaseModelOutput(latent_vector),
        attention_mask=mask,
        **sampling_options,
    )
    decoded_texts = gen_tokenizer.batch_decode(token_ids, skip_special_tokens=True)

    results = []
    for text in decoded_texts:
        # Strip the whitespace surrounding whatever sits between a "]" and
        # the next "[" before handing the string to the decoder.
        compact = re.sub(r'\]\s*(.*?)\s*\[', r']\1[', text)
        results.append(sf.decoder(compact))
    return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
|
| 149 |
def perturb_latent(latent_vecs, noise_scale=0.5):
    """Add uniform random noise to a latent tensor.

    Args:
        latent_vecs: Tensor to perturb; the noise has the same shape.
        noise_scale: Multiplier applied to samples drawn uniformly from
            [0, 1) with NumPy's global random generator.

    Returns:
        ``latent_vecs`` plus the float32 noise tensor.
    """
    scaled_noise = np.random.uniform(0, 1, latent_vecs.shape) * noise_scale
    noise_tensor = torch.tensor(scaled_noise, dtype=torch.float32)
    return noise_tensor + latent_vecs
|
| 157 |
|
| 158 |
|
| 159 |
def encode(selfies):
    """Tokenize the input and run it through the encoder of ``gen_model``.

    Args:
        selfies: Text (or batch of texts) passed to ``gen_tokenizer``; it is
            truncated/padded to 128 tokens.

    Returns:
        A ``(last_hidden_state, attention_mask)`` pair: the encoder output
        and the attention mask produced by the tokenizer.
    """
    tokenized = gen_tokenizer(
        selfies,
        return_tensors='pt',
        max_length=128,
        truncation=True,
        padding='max_length',
    )
    attention_mask = tokenized['attention_mask']
    encoder_out = gen_model.model.encoder(
        input_ids=tokenized['input_ids'], attention_mask=attention_mask
    )
    return encoder_out.last_hidden_state, attention_mask
|
| 174 |
|
| 175 |
|
|
|
|
| 184 |
noise = i / 10
|
| 185 |
perturbed_latent = perturb_latent(latent_vec, noise_scale=noise)
|
| 186 |
gen = generate(perturbed_latent, mask)
|
| 187 |
+
mol = Chem.MolFromSmiles(gen[0])
|
| 188 |
+
if mol:
|
| 189 |
+
gen_mol = Chem.MolToSmiles(mol)
|
| 190 |
+
if gen_mol != Chem.MolToSmiles(Chem.MolFromSmiles(smiles)):
|
| 191 |
+
break
|
| 192 |
+
else:
|
| 193 |
+
print('Abnormal molecule:', gen[0])
|
| 194 |
|
| 195 |
if gen_mol:
|
| 196 |
# Calculate properties for ref and gen molecules
|
|
|
|
| 202 |
# Prepare the table with ref mol and gen mol
|
| 203 |
data = {
|
| 204 |
"Property": ["QED", "SA", "LogP", "Mol Wt", "Tanimoto Similarity"],
|
| 205 |
+
"Reference Mol": [
|
| 206 |
+
ref_properties[0],
|
| 207 |
+
ref_properties[1],
|
| 208 |
+
ref_properties[2],
|
| 209 |
+
ref_properties[3],
|
| 210 |
+
tanimoto_similarity,
|
| 211 |
+
],
|
| 212 |
+
"Generated Mol": [
|
| 213 |
+
gen_properties[0],
|
| 214 |
+
gen_properties[1],
|
| 215 |
+
gen_properties[2],
|
| 216 |
+
gen_properties[3],
|
| 217 |
+
"",
|
| 218 |
+
],
|
| 219 |
}
|
| 220 |
df = pd.DataFrame(data)
|
| 221 |
|
|
|
|
| 228 |
|
| 229 |
|
| 230 |
# Function to display evaluation score
|
| 231 |
+
def display_eval(selected_models, dataset, task_type, downstream, fusion_type, state):
|
| 232 |
result = None
|
| 233 |
|
| 234 |
try:
|
|
|
|
| 243 |
downstream_model = downstream_model.rstrip()
|
| 244 |
params = None
|
| 245 |
|
|
|
|
|
|
|
|
|
|
| 246 |
try:
|
| 247 |
if not selected_models:
|
| 248 |
return "Please select at least one enabled model."
|
| 249 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
if len(selected_models) > 1:
|
| 251 |
if task_type == "Classification":
|
|
|
|
|
|
|
|
|
|
| 252 |
if downstream_model == "Default Settings":
|
| 253 |
downstream_model = "DefaultClassifier"
|
| 254 |
params = None
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
|
| 256 |
+
(
|
| 257 |
+
result,
|
| 258 |
+
state["roc_auc"],
|
| 259 |
+
state["fpr"],
|
| 260 |
+
state["tpr"],
|
| 261 |
+
state["x_batch"],
|
| 262 |
+
state["y_batch"],
|
| 263 |
+
) = fm4m.multi_modal(
|
| 264 |
+
model_list=selected_models,
|
| 265 |
+
downstream_model=downstream_model,
|
| 266 |
+
params=params,
|
| 267 |
+
dataset=dataset,
|
| 268 |
+
)
|
| 269 |
|
| 270 |
+
elif task_type == "Regression":
|
| 271 |
if downstream_model == "Default Settings":
|
| 272 |
downstream_model = "DefaultRegressor"
|
| 273 |
params = None
|
| 274 |
|
| 275 |
+
(
|
| 276 |
+
result,
|
| 277 |
+
state["RMSE"],
|
| 278 |
+
state["y_batch_test"],
|
| 279 |
+
state["y_prob"],
|
| 280 |
+
state["x_batch"],
|
| 281 |
+
state["y_batch"],
|
| 282 |
+
) = fm4m.multi_modal(
|
| 283 |
+
model_list=selected_models,
|
| 284 |
+
downstream_model=downstream_model,
|
| 285 |
+
params=params,
|
| 286 |
+
dataset=dataset,
|
| 287 |
+
)
|
| 288 |
|
| 289 |
else:
|
| 290 |
if task_type == "Classification":
|
|
|
|
|
|
|
|
|
|
| 291 |
if downstream_model == "Default Settings":
|
| 292 |
downstream_model = "DefaultClassifier"
|
| 293 |
params = None
|
| 294 |
|
| 295 |
+
(
|
| 296 |
+
result,
|
| 297 |
+
state["roc_auc"],
|
| 298 |
+
state["fpr"],
|
| 299 |
+
state["tpr"],
|
| 300 |
+
state["x_batch"],
|
| 301 |
+
state["y_batch"],
|
| 302 |
+
) = fm4m.single_modal(
|
| 303 |
+
model=selected_models[0],
|
| 304 |
+
downstream_model=downstream_model,
|
| 305 |
+
params=params,
|
| 306 |
+
dataset=dataset,
|
| 307 |
+
)
|
| 308 |
|
| 309 |
elif task_type == "Regression":
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
if downstream_model == "Default Settings":
|
| 311 |
downstream_model = "DefaultRegressor"
|
| 312 |
params = None
|
| 313 |
|
| 314 |
+
(
|
| 315 |
+
result,
|
| 316 |
+
state["RMSE"],
|
| 317 |
+
state["y_batch_test"],
|
| 318 |
+
state["y_prob"],
|
| 319 |
+
state["x_batch"],
|
| 320 |
+
state["y_batch"],
|
| 321 |
+
) = fm4m.single_modal(
|
| 322 |
+
model=selected_models[0],
|
| 323 |
+
downstream_model=downstream_model,
|
| 324 |
+
params=params,
|
| 325 |
+
dataset=dataset,
|
| 326 |
+
)
|
| 327 |
|
| 328 |
if result == None:
|
| 329 |
result = "Data & Model Setting is incorrect"
|
|
|
|
| 333 |
|
| 334 |
|
| 335 |
# Function to handle plot display
|
| 336 |
+
def display_plot(plot_type, state):
|
| 337 |
fig, ax = plt.subplots()
|
| 338 |
|
| 339 |
if plot_type == "Latent Space":
|
| 340 |
+
x_batch, y_batch = state.get("x_batch"), state.get("y_batch")
|
| 341 |
ax.set_title("T-SNE Plot")
|
| 342 |
+
class_0 = x_batch
|
| 343 |
+
class_1 = y_batch
|
| 344 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
plt.scatter(class_1[:, 0], class_1[:, 1], c='red', label='Class 1')
|
| 346 |
plt.scatter(class_0[:, 0], class_0[:, 1], c='blue', label='Class 0')
|
| 347 |
|
|
|
|
| 350 |
ax.set_title('Dataset Distribution')
|
| 351 |
|
| 352 |
elif plot_type == "ROC-AUC":
|
| 353 |
+
roc_auc, fpr, tpr = state.get("roc_auc"), state.get("fpr"), state.get("tpr")
|
| 354 |
ax.set_title("ROC-AUC Curve")
|
| 355 |
try:
|
| 356 |
+
ax.plot(
|
| 357 |
+
fpr,
|
| 358 |
+
tpr,
|
| 359 |
+
color='darkorange',
|
| 360 |
+
lw=2,
|
| 361 |
+
label=f'ROC curve (area = {roc_auc:.4f})',
|
| 362 |
+
)
|
| 363 |
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
|
| 364 |
ax.set_xlim([0.0, 1.0])
|
| 365 |
ax.set_ylim([0.0, 1.05])
|
|
|
|
| 371 |
ax.legend(loc='lower right')
|
| 372 |
|
| 373 |
elif plot_type == "Parity Plot":
|
| 374 |
+
RMSE, y_batch_test, y_prob = (
|
| 375 |
+
state.get("RMSE"),
|
| 376 |
+
state.get("y_batch_test"),
|
| 377 |
+
state.get("y_prob"),
|
| 378 |
+
)
|
| 379 |
ax.set_title("Parity plot")
|
| 380 |
|
| 381 |
# change format
|
|
|
|
| 384 |
print(y_prob)
|
| 385 |
y_batch_test = np.array(y_batch_test, dtype=float)
|
| 386 |
y_prob = np.array(y_prob, dtype=float)
|
| 387 |
+
ax.scatter(
|
| 388 |
+
y_batch_test,
|
| 389 |
+
y_prob,
|
| 390 |
+
color="blue",
|
| 391 |
+
label=f"Predicted vs Actual (RMSE: {RMSE:.4f})",
|
| 392 |
+
)
|
| 393 |
min_val = min(min(y_batch_test), min(y_prob))
|
| 394 |
max_val = max(max(y_batch_test), max(y_prob))
|
| 395 |
ax.plot([min_val, max_val], [min_val, max_val], 'r-')
|
|
|
|
| 402 |
print(y_batch_test)
|
| 403 |
print(y_prob)
|
| 404 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 405 |
ax.set_xlabel('Actual Values')
|
| 406 |
ax.set_ylabel('Predicted Values')
|
| 407 |
|
|
|
|
| 420 |
# Function to load a predefined dataset from the local path
|
| 421 |
def load_predefined_dataset(dataset_name):
    """Load a predefined dataset preview and the choices for the column selectors.

    Args:
        dataset_name: Key looked up in ``predefined_datasets``.

    Returns:
        A 4-tuple ``(preview, input_update, output_update, message)``.
        When the dataset is known: the first rows of its CSV, two
        ``gr.update`` objects listing the CSV columns, and the lower-cased
        dataset name. Otherwise: an empty DataFrame, two updates with no
        choices, and "Dataset not found".
    """
    entry = predefined_datasets.get(dataset_name)
    # The text before the first comma of an entry is used as the CSV path.
    # Anything that is not a string (e.g. None for an unknown name) means
    # "no dataset", replacing the former bare ``except`` around ``split``.
    file_path = entry.split(",")[0] if isinstance(entry, str) else None

    if not file_path:
        return (
            pd.DataFrame(),
            gr.update(choices=[]),
            gr.update(choices=[]),
            "Dataset not found",
        )

    df = pd.read_csv(file_path)
    return (
        df.head(),
        gr.update(choices=list(df.columns)),
        gr.update(choices=list(df.columns)),
        f"{dataset_name.lower()}",
    )
|
| 442 |
|
| 443 |
|
| 444 |
# Function to display the head of the uploaded CSV file
|
|
|
|
| 446 |
if file is not None:
|
| 447 |
# Load the CSV file into a DataFrame
|
| 448 |
df = pd.read_csv(file.name)
|
| 449 |
+
return (
|
| 450 |
+
df.head(),
|
| 451 |
+
gr.update(choices=list(df.columns)),
|
| 452 |
+
gr.update(choices=list(df.columns)),
|
| 453 |
+
)
|
| 454 |
return pd.DataFrame(), gr.update(choices=[]), gr.update(choices=[])
|
| 455 |
|
| 456 |
|
|
|
|
| 458 |
def handle_dataset_selection(selected_dataset):
    """Return visibility updates for the eight dataset-related widgets.

    Args:
        selected_dataset: Current value of the dataset dropdown.

    Returns:
        A tuple of eight ``gr.update(visible=...)`` objects. For
        "Custom Dataset" everything is shown except the sixth widget; for
        any other value only the first widget stays visible.
    """
    if selected_dataset == "Custom Dataset":
        # Upload widgets and column selectors on, sixth widget off.
        visibility = (True, True, True, True, True, False, True, True)
    else:
        visibility = (True,) + (False,) * 7
    return tuple(gr.update(visible=flag) for flag in visibility)
|
| 482 |
|
| 483 |
|
| 484 |
# Function to select input and output columns and display a message
|
| 485 |
+
def select_columns(input_column, output_column, train_data, test_data, dataset_name):
|
| 486 |
if input_column and output_column:
|
| 487 |
return f"{train_data.name},{test_data.name},{input_column},{output_column},{dataset_name}"
|
| 488 |
return "Please select both input and output columns."
|
| 489 |
|
| 490 |
+
|
| 491 |
+
def set_dataname(dataset_name, dataset_selector):
    """Pick the dataset label to display.

    Args:
        dataset_name: Label typed by the user for a custom dataset.
        dataset_selector: Current value of the dataset dropdown.

    Returns:
        ``dataset_name`` as a string when "Custom Dataset" is selected,
        otherwise the dropdown value as a string.
    """
    chosen = dataset_name if dataset_selector == "Custom Dataset" else dataset_selector
    return f"{chosen}"
|
| 495 |
|
| 496 |
+
|
| 497 |
# Function to create model based on user input
|
| 498 |
+
def create_model(
|
| 499 |
+
model_name, max_depth=None, n_estimators=None, alpha=None, degree=None, kernel=None
|
| 500 |
+
):
|
| 501 |
if model_name == "XGBClassifier":
|
| 502 |
+
model = xgb.XGBClassifier(
|
| 503 |
+
objective='binary:logistic',
|
| 504 |
+
eval_metric='auc',
|
| 505 |
+
max_depth=max_depth,
|
| 506 |
+
n_estimators=n_estimators,
|
| 507 |
+
alpha=alpha,
|
| 508 |
+
)
|
| 509 |
elif model_name == "SVR":
|
| 510 |
model = SVR(degree=degree, kernel=kernel)
|
| 511 |
elif model_name == "Kernel Ridge":
|
|
|
|
| 519 |
return "Model not supported."
|
| 520 |
|
| 521 |
return f"{model_name} * {model.get_params()}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 522 |
|
| 523 |
|
| 524 |
# Define the Gradio layout
|
|
|
|
| 525 |
with gr.Blocks() as demo:
|
| 526 |
+
log_df = pd.DataFrame(
|
| 527 |
+
{"": [], 'Selected Models': [], 'Dataset': [], 'Task': [], 'Result': []}
|
| 528 |
+
)
|
| 529 |
+
state = gr.State({"log_df": log_df})
|
| 530 |
with gr.Row():
|
| 531 |
# Left Column
|
| 532 |
with gr.Column():
|
| 533 |
+
gr.HTML(
|
| 534 |
+
'''
|
| 535 |
<div style="background-color: #6A8EAE; color: #FFFFFF; padding: 10px;">
|
| 536 |
<h3 style="color: #FFFFFF; margin: 0;font-size: 20px;"> Data & Model Setting</h3>
|
| 537 |
</div>
|
| 538 |
+
'''
|
| 539 |
+
)
|
|
|
|
|
|
|
| 540 |
# Dropdown menu for predefined datasets including "Custom Dataset" option
|
| 541 |
+
dataset_selector = gr.Dropdown(
|
| 542 |
+
label="Select Dataset",
|
| 543 |
+
choices=list(predefined_datasets.keys()) + ["Custom Dataset"],
|
| 544 |
+
)
|
| 545 |
# Display the message for selected columns
|
| 546 |
+
selected_columns_message = gr.Textbox(
|
| 547 |
+
label="Selected Columns Info", visible=False
|
| 548 |
+
)
|
| 549 |
|
| 550 |
with gr.Accordion("Dataset Settings", open=True):
|
| 551 |
# File upload options for custom dataset (train and test)
|
| 552 |
dataset_name = gr.Textbox(label="Dataset Name", visible=False)
|
| 553 |
+
train_file = gr.File(
|
| 554 |
+
label="Upload Custom Train Dataset",
|
| 555 |
+
file_types=[".csv"],
|
| 556 |
+
visible=False,
|
| 557 |
+
)
|
| 558 |
+
train_display = gr.Dataframe(
|
| 559 |
+
label="Train Dataset Preview (First 5 Rows)",
|
| 560 |
+
visible=False,
|
| 561 |
+
interactive=False,
|
| 562 |
+
)
|
| 563 |
|
| 564 |
+
test_file = gr.File(
|
| 565 |
+
label="Upload Custom Test Dataset",
|
| 566 |
+
file_types=[".csv"],
|
| 567 |
+
visible=False,
|
| 568 |
+
)
|
| 569 |
+
test_display = gr.Dataframe(
|
| 570 |
+
label="Test Dataset Preview (First 5 Rows)",
|
| 571 |
+
visible=False,
|
| 572 |
+
interactive=False,
|
| 573 |
+
)
|
| 574 |
|
| 575 |
# Predefined dataset displays
|
| 576 |
+
predefined_display = gr.Dataframe(
|
| 577 |
+
label="Predefined Dataset Preview (First 5 Rows)",
|
| 578 |
+
visible=False,
|
| 579 |
+
interactive=False,
|
| 580 |
+
)
|
| 581 |
|
| 582 |
# Dropdowns for selecting input and output columns for the custom dataset
|
| 583 |
+
input_column_selector = gr.Dropdown(
|
| 584 |
+
label="Select Input Column", choices=[], visible=False
|
| 585 |
+
)
|
| 586 |
+
output_column_selector = gr.Dropdown(
|
| 587 |
+
label="Select Output Column", choices=[], visible=False
|
| 588 |
+
)
|
| 589 |
|
| 590 |
# When a dataset is selected, show either file upload fields (for custom) or load predefined datasets
|
| 591 |
+
dataset_selector.change(
|
| 592 |
+
handle_dataset_selection,
|
| 593 |
+
inputs=dataset_selector,
|
| 594 |
+
outputs=[
|
| 595 |
+
dataset_name,
|
| 596 |
+
train_file,
|
| 597 |
+
train_display,
|
| 598 |
+
test_file,
|
| 599 |
+
test_display,
|
| 600 |
+
predefined_display,
|
| 601 |
+
input_column_selector,
|
| 602 |
+
output_column_selector,
|
| 603 |
+
],
|
| 604 |
+
)
|
| 605 |
|
| 606 |
# When a predefined dataset is selected, load its head and update column selectors
|
| 607 |
+
dataset_selector.change(
|
| 608 |
+
load_predefined_dataset,
|
| 609 |
+
inputs=dataset_selector,
|
| 610 |
+
outputs=[
|
| 611 |
+
predefined_display,
|
| 612 |
+
input_column_selector,
|
| 613 |
+
output_column_selector,
|
| 614 |
+
selected_columns_message,
|
| 615 |
+
],
|
| 616 |
+
)
|
| 617 |
|
| 618 |
# When a custom train file is uploaded, display its head and update column selectors
|
| 619 |
+
train_file.change(
|
| 620 |
+
display_csv_head,
|
| 621 |
+
inputs=train_file,
|
| 622 |
+
outputs=[
|
| 623 |
+
train_display,
|
| 624 |
+
input_column_selector,
|
| 625 |
+
output_column_selector,
|
| 626 |
+
],
|
| 627 |
+
)
|
| 628 |
|
| 629 |
# When a custom test file is uploaded, display its head
|
| 630 |
+
test_file.change(
|
| 631 |
+
display_csv_head,
|
| 632 |
+
inputs=test_file,
|
| 633 |
+
outputs=[
|
| 634 |
+
test_display,
|
| 635 |
+
input_column_selector,
|
| 636 |
+
output_column_selector,
|
| 637 |
+
],
|
| 638 |
+
)
|
| 639 |
|
| 640 |
+
dataset_selector.change(
|
| 641 |
+
set_dataname,
|
| 642 |
+
inputs=[dataset_name, dataset_selector],
|
| 643 |
+
outputs=dataset_name,
|
| 644 |
+
)
|
| 645 |
|
| 646 |
# Update the selected columns information when dropdown values are changed
|
| 647 |
+
input_column_selector.change(
|
| 648 |
+
select_columns,
|
| 649 |
+
inputs=[
|
| 650 |
+
input_column_selector,
|
| 651 |
+
output_column_selector,
|
| 652 |
+
train_file,
|
| 653 |
+
test_file,
|
| 654 |
+
dataset_name,
|
| 655 |
+
],
|
| 656 |
+
outputs=selected_columns_message,
|
| 657 |
+
)
|
| 658 |
|
| 659 |
+
output_column_selector.change(
|
| 660 |
+
select_columns,
|
| 661 |
+
inputs=[
|
| 662 |
+
input_column_selector,
|
| 663 |
+
output_column_selector,
|
| 664 |
+
train_file,
|
| 665 |
+
test_file,
|
| 666 |
+
dataset_name,
|
| 667 |
+
],
|
| 668 |
+
outputs=selected_columns_message,
|
| 669 |
+
)
|
| 670 |
|
| 671 |
+
model_checkbox = gr.CheckboxGroup(
|
| 672 |
+
choices=models_enabled, label="Select Model"
|
| 673 |
+
)
|
| 674 |
|
| 675 |
+
task_radiobutton = gr.Radio(
|
| 676 |
+
choices=["Classification", "Regression"], label="Task Type"
|
| 677 |
+
)
|
| 678 |
|
| 679 |
####### adding hyper parameter tuning ###########
|
| 680 |
+
model_name = gr.Dropdown(
|
| 681 |
+
[
|
| 682 |
+
"Default - Auto",
|
| 683 |
+
"XGBClassifier",
|
| 684 |
+
"SVR",
|
| 685 |
+
"Kernel Ridge",
|
| 686 |
+
"Linear Regression",
|
| 687 |
+
],
|
| 688 |
+
label="Select Downstream Model",
|
| 689 |
+
)
|
| 690 |
with gr.Accordion("Downstream Hyperparameter Settings", open=True):
|
| 691 |
# Create placeholders for hyperparameter components
|
| 692 |
+
max_depth = gr.Slider(1, 20, step=1, visible=False, label="max_depth")
|
| 693 |
+
n_estimators = gr.Slider(
|
| 694 |
+
100, 5000, step=100, visible=False, label="n_estimators"
|
| 695 |
+
)
|
| 696 |
alpha = gr.Slider(0.1, 10.0, step=0.1, visible=False, label="alpha")
|
| 697 |
+
degree = gr.Slider(1, 20, step=1, visible=False, label="degree")
|
| 698 |
+
kernel = gr.Dropdown(
|
| 699 |
+
choices=["rbf", "poly", "linear"], visible=False, label="kernel"
|
| 700 |
+
)
|
| 701 |
|
| 702 |
# Output textbox
|
| 703 |
output = gr.Textbox(label="Loaded Parameters")
|
| 704 |
|
|
|
|
| 705 |
# Dynamically show relevant hyperparameters based on selected model
|
| 706 |
def update_hyperparameters(model_name):
    """Show only the hyperparameter widgets relevant to the chosen model.

    Args:
        model_name: Selected downstream model name.

    Returns:
        A tuple of five ``gr.update(visible=...)`` objects, in the order
        (max_depth, n_estimators, alpha, degree, kernel). Models without
        tunable widgets ("Linear Regression", "Default - Auto") and any
        unset or unrecognized value hide all five, so the handler always
        returns one update per wired output instead of None.
    """
    hidden = (False, False, False, False, False)
    visible_by_model = {
        "XGBClassifier": (True, True, True, False, False),
        "SVR": (False, False, False, True, True),
        "Kernel Ridge": (False, False, True, True, True),
        "Linear Regression": hidden,
        "Default - Auto": hidden,
    }
    flags = visible_by_model.get(model_name, hidden)
    return tuple(gr.update(visible=flag) for flag in flags)
|
| 747 |
|
| 748 |
# When model is selected, update which hyperparameters are visible
|
| 749 |
+
model_name.change(
|
| 750 |
+
update_hyperparameters,
|
| 751 |
+
inputs=[model_name],
|
| 752 |
+
outputs=[max_depth, n_estimators, alpha, degree, kernel],
|
| 753 |
+
)
|
| 754 |
|
| 755 |
# Submit button to create the model with selected hyperparameters
|
| 756 |
submit_button = gr.Button("Create Downstream Model")
|
| 757 |
|
|
|
|
| 758 |
# Function to handle model creation based on input parameters
|
| 759 |
def on_submit(model_name, max_depth, n_estimators, alpha, degree, kernel):
    """Forward the hyperparameters relevant to ``model_name`` to ``create_model``.

    Args:
        model_name: Selected downstream model name.
        max_depth, n_estimators, alpha, degree, kernel: Current widget
            values; only those used by the selected model are passed on.

    Returns:
        Whatever ``create_model`` returns for a recognized model name,
        otherwise None.
    """
    kwargs_by_model = {
        "XGBClassifier": {
            "max_depth": max_depth,
            "n_estimators": n_estimators,
            "alpha": alpha,
        },
        "SVR": {"degree": degree, "kernel": kernel},
        "Kernel Ridge": {"alpha": alpha, "degree": degree, "kernel": kernel},
        "Linear Regression": {},
        "Default - Auto": {},
    }
    if model_name not in kwargs_by_model:
        return None
    return create_model(model_name, **kwargs_by_model[model_name])
|
| 777 |
|
| 778 |
# When the submit button is clicked, run the on_submit function
|
| 779 |
+
submit_button.click(
|
| 780 |
+
on_submit,
|
| 781 |
+
inputs=[model_name, max_depth, n_estimators, alpha, degree, kernel],
|
| 782 |
+
outputs=output,
|
| 783 |
+
)
|
| 784 |
###### End of hyper param tuning #########
|
| 785 |
|
| 786 |
fusion_radiobutton = gr.Radio(choices=fusion_available, label="Fusion Type")
|
| 787 |
|
|
|
|
|
|
|
| 788 |
eval_button = gr.Button("Train downstream model")
|
|
|
|
| 789 |
|
| 790 |
# Middle Column
|
| 791 |
with gr.Column():
|
| 792 |
+
gr.HTML(
|
| 793 |
+
'''
|
| 794 |
<div style="background-color: #8F9779; color: #FFFFFF; padding: 10px;">
|
| 795 |
<h3 style="color: #FFFFFF; margin: 0;font-size: 20px;"> Downstream Task 1: Property Prediction</h3>
|
| 796 |
</div>
|
| 797 |
+
'''
|
| 798 |
+
)
|
| 799 |
eval_output = gr.Textbox(label="Train downstream model")
|
| 800 |
|
| 801 |
+
plot_radio = gr.Radio(
|
| 802 |
+
choices=["ROC-AUC", "Parity Plot", "Latent Space"],
|
| 803 |
+
label="Select Plot Type",
|
| 804 |
+
)
|
| 805 |
+
plot_output = gr.Plot(label="Visualization")
|
| 806 |
|
| 807 |
create_log = gr.Button("Store log")
|
| 808 |
|
| 809 |
+
log_table = gr.Dataframe(
|
| 810 |
+
value=log_df, label="Log of Selections and Results", interactive=False
|
| 811 |
+
)
|
| 812 |
+
|
| 813 |
+
eval_button.click(
|
| 814 |
+
display_eval,
|
| 815 |
+
inputs=[
|
| 816 |
+
model_checkbox,
|
| 817 |
+
selected_columns_message,
|
| 818 |
+
task_radiobutton,
|
| 819 |
+
output,
|
| 820 |
+
fusion_radiobutton,
|
| 821 |
+
state,
|
| 822 |
+
],
|
| 823 |
+
outputs=eval_output,
|
| 824 |
+
)
|
| 825 |
+
|
| 826 |
+
plot_radio.change(
|
| 827 |
+
display_plot, inputs=[plot_radio, state], outputs=plot_output
|
| 828 |
+
)
|
| 829 |
|
| 830 |
# Function to gather selected models
|
| 831 |
def gather_selected_models(*models):
    """Return the truthy positional arguments as a list, in call order."""
    return list(filter(None, models))
|
| 834 |
|
| 835 |
+
create_log.click(
|
| 836 |
+
evaluate_and_log,
|
| 837 |
+
inputs=[
|
| 838 |
+
model_checkbox,
|
| 839 |
+
dataset_name,
|
| 840 |
+
task_radiobutton,
|
| 841 |
+
eval_output,
|
| 842 |
+
state,
|
| 843 |
+
],
|
| 844 |
+
outputs=log_table,
|
| 845 |
+
)
|
| 846 |
# Right Column
|
| 847 |
with gr.Column():
|
| 848 |
+
gr.HTML(
|
| 849 |
+
'''
|
| 850 |
<div style="background-color: #D2B48C; color: #FFFFFF; padding: 10px;">
|
| 851 |
<h3 style="color: #FFFFFF; margin: 0;font-size: 20px;"> Downstream Task 2: Molecule Generation</h3>
|
| 852 |
</div>
|
| 853 |
+
'''
|
| 854 |
+
)
|
| 855 |
smiles_input = gr.Textbox(label="Input SMILES String")
|
| 856 |
image_display = gr.Image(label="Molecule Image", height=250, width=250)
|
| 857 |
# Show images for selection
|
|
|
|
| 860 |
choices=list(smiles_image_mapping.keys()),
|
| 861 |
label="Select from sample molecules",
|
| 862 |
value=None,
|
|
|
|
| 863 |
)
|
| 864 |
image_selector.change(load_image, image_selector, image_display)
|
| 865 |
generate_button = gr.Button("Generate")
|
| 866 |
+
gen_image_display = gr.Image(
|
| 867 |
+
label="Generated Molecule Image", height=250, width=250
|
| 868 |
+
)
|
| 869 |
generated_output = gr.Textbox(label="Generated Output")
|
| 870 |
property_table = gr.Dataframe(label="Molecular Properties Comparison")
|
| 871 |
|
|
|
|
|
|
|
| 872 |
# Handle image selection
|
| 873 |
+
image_selector.change(
|
| 874 |
+
handle_image_selection,
|
| 875 |
+
inputs=image_selector,
|
| 876 |
+
outputs=[smiles_input, image_display],
|
| 877 |
+
)
|
| 878 |
+
smiles_input.change(
|
| 879 |
+
smiles_to_image, inputs=smiles_input, outputs=image_display
|
| 880 |
+
)
|
| 881 |
|
| 882 |
# Generate button to display canonical SMILES and molecule image
|
| 883 |
+
generate_button.click(
|
| 884 |
+
generate_canonical,
|
| 885 |
+
inputs=smiles_input,
|
| 886 |
+
outputs=[property_table, generated_output, gen_image_display],
|
| 887 |
+
)
|
| 888 |
|
| 889 |
|
| 890 |
if __name__ == "__main__":
|
| 891 |
+
demo.launch(server_name="0.0.0.0")
|
data/lce/test.csv
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
smi1,conc1,smi2,conc2,smi3,conc3,smi4,conc4,smi5,conc5,smi6,conc6,LCE
|
| 2 |
+
C1C(OC(=O)O1)F,0.733,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.267,O,0.0,O,0.0,O,0.0,O,0.0,1.629
|
| 3 |
+
C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.431,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,O,0.0,1.085
|
| 4 |
+
COC(=O)OC,0.299,C(C(F)(F)F)OCC(F)(F)F,0.598,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.103,O,0.0,O,0.0,O,0.0,2.056
|
| 5 |
+
COCCOC,0.358,O1CCOC1,0.532,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.074,[Li+].[N+](=O)([O-])[O-],,O,0.0,O,0.0,1.658
|
| 6 |
+
C1COC(=O)O1,0.197,COC(=O)OC,0.156,COCCOCCOCCOCCOC,0.59,[Li+].F[P-](F)(F)(F)(F)F,0.026,[Li+].[N+](=O)([O-])[O-],0.031,O,0.0,1.638
|
| 7 |
+
C1COC(=O)O1,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.002,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.276
|
| 8 |
+
O1CCOC1,0.368,COCCOC,0.547,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.076,CSi(C)(C)([N+]).C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,0.008,O,0.0,O,0.0,1.569
|
| 9 |
+
COCCOC,0.507,COC(C(F)(F)F)C(F)(F)F,0.399,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.095,O,0.0,O,0.0,O,0.0,2.268
|
| 10 |
+
C1COC(=O)O1,0.425,O=C(OCC)OCC(F)(F)F,0.481,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.094,O,0.0,O,0.0,O,0.0,1.602
|
| 11 |
+
C1C(OC(=O)O1)F,0.318,CCOC(=O)OC,0.504,COC(=O)OC,0.094,B(O[Si](C)(C)C)(O[Si](C)(C)C)O[Si](C)(C),0.083,[Li+].F[P-](F)(F)(F)(F)F,0.001,O,0.0,1.678
|
| 12 |
+
O=S1(=O)CCCC1,0.359,C(C(F)(F)F)OC(C(F)F)(F)F,0.504,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.133,[Li+].[N+](=O)([O-])[O-],0.004,O,0.0,O,0.0,2.0
|
| 13 |
+
C1COC(=O)O1,0.594,O=C(OCC)OCC,0.327,[Li+].F[P-](F)(F)(F)(F)F,0.079,O,0.0,O,0.0,O,0.0,0.921
|
| 14 |
+
C1COC(=O)O1,0.331,O=C(OCC)OCC,0.577,[Li+].[B-]1(OC(=O)C(=O)O1)(F)F,0.092,O,0.0,O,0.0,O,0.0,1.301
|
| 15 |
+
C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].C(C(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(C(F)(F)F)(F)F)(F)(F)F,0.069,O,0.0,O,0.0,0.854
|
| 16 |
+
C1C(OC(=O)O1)F,0.107,C1COC(=O)O1,0.526,O=C(OCC)OCC,0.289,[Li+].F[P-](F)(F)(F)(F)F,0.078,O,0.0,O,0.0,1.108
|
| 17 |
+
O1CCOC1,0.322,COCCOC,0.478,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,0.2,O,0.0,O,0.0,O,0.0,1.523
|
| 18 |
+
CC1COC(=O)O1,0.595,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.405,O,0.0,O,0.0,O,0.0,O,0.0,1.921
|
| 19 |
+
CC1COC(=O)O1,0.702,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.298,O,0.0,O,0.0,O,0.0,O,0.0,1.602
|
| 20 |
+
O1CCOC1,0.375,COCCOC,0.557,[Li+][S-]SSS[S-][Li+],,[Li+].[N+](=O)([O-])[O-],0.008,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.061,O,0.0,1.523
|
| 21 |
+
COC(=O)OC,0.161,FC(F)C(F)(F)COC(F)(F)C(F)F,0.355,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.484,O,0.0,O,0.0,O,0.0,2.155
|
| 22 |
+
C1COC(=O)O1,0.338,COC(=O)OC,0.625,[Li+].[O-]P(=O)(F)F,0.008,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.03,O,0.0,O,0.0,1.26
|
| 23 |
+
CN(C)C(=O)C(F)(F)F,0.362,C1C(OC(=O)O1)F,0.556,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.081,O,0.0,O,0.0,O,0.0,2.155
|
| 24 |
+
C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.0,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.225
|
| 25 |
+
COCCOC,0.231,FC1CCCCC1,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0.0,O,0.0,O,0.0,2.155
|
| 26 |
+
COCCOC,0.277,FC(F)C(F)(F)COC(F)(F)C(F)F,0.555,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.168,O,0.0,O,0.0,O,0.0,2.155
|
| 27 |
+
O1C(C)CCC1,0.331,FC(F)C(F)(F)COC(F)(F)C(F)F,0.498,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.171,O,0.0,O,0.0,O,0.0,2.301
|
| 28 |
+
COCC(F)(F)C(F)(F)COC,0.864,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.136,O,0.0,O,0.0,O,0.0,O,0.0,1.991
|
| 29 |
+
COC(=O)OC,0.29,C(C(F)(F)F)OCC(F)(F)F,0.589,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.121,O,0.0,O,0.0,O,0.0,2.301
|
| 30 |
+
C1COC(=O)O1,0.425,O=C(OCC)OCC,0.234,[Li+].F[P-](F)(F)(F)(F)F,0.34,O,0.0,O,0.0,O,0.0,1.398
|
| 31 |
+
COCCOC,0.707,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.147,[Li+].[B-]1(OC(=O)C(=O)O1)(F)F,0.147,O,0.0,O,0.0,O,0.0,1.268
|
data/lce/test_data.csv
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
smiles1,conc1,mol1,smiles2,conc2,mol2,smiles3,conc3,mol3,smiles4,conc4,mol4,smiles5,conc5,mol5,smiles6,conc6,LCE_Predicted,LCE
|
| 2 |
+
C1COC(=O)O1,0.519,51.92400559,COC(=O)OC,0.411,41.14791596,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.069,6.928078454,O,0,0,O,0,0,O,0,1.187,1.094
|
| 3 |
+
COCCOC,0.596,59.5609428,COCCOCCOCCOCCOC,0.281,28.07124115,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.124,12.36781605,O,0,0,O,0,0,O,0,1.691,1.384
|
| 4 |
+
C1COC(=O)O1,0.285,28.50894036,C1C(OC(=O)O1)F,0.261,26.07552384,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.228,22.82322096,COC(=O)OC,0.226,22.59231484,O,0,0,O,0,1.508,1.468
|
| 5 |
+
COCCOC,0.434,43.4423376,COCCOCCOCCOCCOC,0.205,20.47449683,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.361,36.08316557,O,0,0,O,0,0,O,0,1.882,1.71
|
| 6 |
+
C1C(OC(=O)O1)F,0.187,18.72872664,COC(=O)OC,0.162,16.22691423,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.109,10.92850826,FC(F)C(F)(F)COC(F)(F)C(F)F,0.541,54.11585087,O,0,0,O,0,2.103,1.832
|
| 7 |
+
C1COC(=O)O1,0.134,13.35070843,C1C(OC(=O)O1)F,0.122,12.2111419,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.107,10.72028474,COC(=O)OC,0.106,10.57995858,FC(F)C(F)(F)COC(F)(F)C(F)F,0.531,53.13790635,O,0,2.077,2.104
|
| 8 |
+
COCCOC,0.096,9.614613177,COCCOCCOCCOCCOC,0.045,4.53139444,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.12,12.01491409,C1COCO1,0.143,14.28400162,FC(F)C(F)(F)COC(F)(F)C(F)F,0.596,59.55507668,O,0,2.211,2.274
|
| 9 |
+
C1COC(=O)O1,0.519,51.92400559,COC(=O)OC,0.411,41.14791596,[Li+].F[P-](F)(F)(F)(F)F,0.069,6.928078454,O,0,0,O,0,0,O,0,1.17,1.071
|
| 10 |
+
C1COC(=O)O1,0.519,51.92400559,COC(=O)OC,0.411,41.14791596,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.069,6.928078454,O,0,0,O,0,0,O,0,1.077,1.166
|
| 11 |
+
C1COC(=O)O1,0.519,51.85215842,COC(=O)OC,0.411,41.09097965,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.069,6.918492083,[Li+].[N+](=O)([O-])[O-],0.001,0.138369842,O,0,0,O,0,1.19,1.335
|
| 12 |
+
C1COC(=O)O1,0.513,51.33049845,COC(=O)OC,0.407,40.6775828,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.069,6.9173773,C1=COC(=O)O1,0.011,1.07454145,O,0,0,O,0,1.114,1.129
|
| 13 |
+
COCCOC,0.53,53.00533987,COCCOCCOCCOCCOC,0.25,24.98156691,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.22,22.01309322,O,0,0,O,0,0,O,0,1.758,1.501
|
| 14 |
+
COCCOC,0.477,47.74974224,COCCOCCOCCOCCOC,0.225,22.50458884,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.297,29.74566892,O,0,0,O,0,0,O,0,1.821,1.663
|
data/lce/train.csv
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
smi1,conc1,smi2,conc2,smi3,conc3,smi4,conc4,smi5,conc5,smi6,conc6,LCE
|
| 2 |
+
C1COC(=O)O1,0.327,O=C(OCC)OCC,0.594,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.079,O,0.0,O,0.0,O,0.0,1.155
|
| 3 |
+
C1COC(=O)O1,0.356,COC(=O)OC,0.566,FC(F)(F)COB(OCC(F)(F)F)OCC(F)(F)F,0.007,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.046
|
| 4 |
+
O=S1(=O)CCCC1,0.25,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.75,O,0.0,O,0.0,O,0.0,O,0.0,1.569
|
| 5 |
+
C1COC(=O)O1,0.331,O=C(OCC)OCC,0.577,[Li+].F[P-](F)(F)(F)(F)F,0.092,O,0.0,O,0.0,O,0.0,0.886
|
| 6 |
+
COCCOC,0.763,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.237,O,0.0,O,0.0,O,0.0,O,0.0,1.367
|
| 7 |
+
COCCOC,0.2,FC(F)C(F)(F)COC(F)(F)C(F)F,0.6,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.2,O,0.0,O,0.0,O,0.0,2.301
|
| 8 |
+
C1C(OC(=O)O1)F,0.873,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.127,O,0.0,O,0.0,O,0.0,O,0.0,1.489
|
| 9 |
+
COCCOC,0.706,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.008,[Li+].[O-]P(=O)(F)F,0.286,O,0.0,O,0.0,O,0.0,1.244
|
| 10 |
+
C1COC(=O)O1,0.3,CCOC(=O)OC,0.593,C1=COC(=O)O1,0.026,[Li+].F[P-](F)(F)(F)(F)F,0.081,O,0.0,O,0.0,0.745
|
| 11 |
+
COCCOC,0.763,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.174,[Li+].[O-]P(=O)(F)F,0.063,O,0.0,O,0.0,O,0.0,1.292
|
| 12 |
+
CCOCC,0.313,C(C(F)(F)F)OCC(F)(F)F,0.51,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.177,O,0.0,O,0.0,O,0.0,2.301
|
| 13 |
+
O=S1(=O)CCCC1,0.75,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.25,O,0.0,O,0.0,O,0.0,O,0.0,1.745
|
| 14 |
+
COC(=O)OC,0.29,C(C(F)(F)F)OCC(F)(F)F,0.589,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.121,O,0.0,O,0.0,O,0.0,1.745
|
| 15 |
+
C1COC(=O)O1,0.682,CCOC(=O)OC,0.247,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.043,[Li+].O=C1O[B-]2(OC1=O)OC(=O)C(=O)O2,0.028,O,0.0,O,0.0,1.076
|
| 16 |
+
C1COC(=O)O1,0.359,COC(=O)OC,0.569,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,O,0.0,0.854
|
| 17 |
+
C1COC(=O)O1,0.305,COC(=O)OC,0.242,COCCOCCOCCOCCOC,0.392,[Li+].F[P-](F)(F)(F)(F)F,0.041,[Li+].[N+](=O)([O-])[O-],0.02,O,0.0,1.678
|
| 18 |
+
FC(F)(F)COCCOCC,0.838,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.162,O,0.0,O,0.0,O,0.0,O,0.0,2.155
|
| 19 |
+
CC#N,0.882,FC,0.065,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,,O,0.0,O,0.0,O,0.0,2.222
|
| 20 |
+
COC(C)C(C)OC,0.879,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.121,O,0.0,O,0.0,O,0.0,O,0.0,1.638
|
| 21 |
+
CCOP(=O)(OCC)OCC,0.728,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.272,O,0.0,O,0.0,O,0.0,O,0.0,2.0
|
| 22 |
+
COC(=O)OC,0.375,FC(F)C(F)(F)COC(F)(F)C(F)F,0.375,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.25,O,0.0,O,0.0,O,0.0,1.854
|
| 23 |
+
O1CCOC1,0.371,COCCOC,0.552,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.077,O,0.0,O,0.0,O,0.0,1.959
|
| 24 |
+
C1C(OC(=O)O1)F,0.774,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.226,O,0.0,O,0.0,O,0.0,O,0.0,1.587
|
| 25 |
+
CC1COC(=O)O1,0.875,C1C(OC(=O)O1)F,0.051,[Li+].[O-]Cl(=O)(=O)=O,0.074,O,0.0,O,0.0,O,0.0,0.699
|
| 26 |
+
C1C(OC(=O)O1)F,0.264,COC(=O)OCCF,0.479,C(C(F)(F)F)OC(C(F)F)(F)F,0.155,[Li+].F[P-](F)(F)(F)(F)F,0.103,O,0.0,O,0.0,2.097
|
| 27 |
+
C1C(OC(=O)O1)F,0.413,O=C(OCC)OCC,0.497,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.09,O,0.0,O,0.0,O,0.0,1.59
|
| 28 |
+
C1C(OC(=O)O1)F,0.106,C1COC(=O)O1,0.522,O=C(OCC)OCC,0.287,[Li+].F[P-](F)(F)(F)(F)F,0.077,[Rb+].[O-][N+]([O-])=O,0.004,O1CCOCCOCCOCCOCCOCC1,0.004,1.252
|
| 29 |
+
COCCOC,0.259,B(OCC(F)(F)F)(OCC(F)(F)F)OCC(F)(F)F,0.556,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.185,O,0.0,O,0.0,O,0.0,1.337
|
| 30 |
+
C1CCOC1,0.925,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.075,O,0.0,O,0.0,O,0.0,O,0.0,1.377
|
| 31 |
+
C1C(OC(=O)O1)F,0.82,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.18,O,0.0,O,0.0,O,0.0,O,0.0,1.544
|
| 32 |
+
CCOP(=O)(OCC)OCC,0.5,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.5,O,0.0,O,0.0,O,0.0,O,0.0,2.097
|
| 33 |
+
COCCOC,0.731,[Li+].[O-]P(=O)(F)F,0.064,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.205,O,0.0,O,0.0,O,0.0,1.215
|
| 34 |
+
COCCOCCOCCOCCOC,0.819,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.181,O,0.0,O,0.0,O,0.0,O,0.0,1.222
|
| 35 |
+
C1COC(=O)O1,0.338,COC(=O)OC,0.625,[Li+].[O-]P(=O)(F)F,0.008,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.03,O,0.0,O,0.0,1.194
|
| 36 |
+
O1CCOC1,0.463,COCCOC,0.312,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.194,[Li+].[N+](=O)([O-])[O-],0.03,O,0.0,O,0.0,1.824
|
| 37 |
+
C1C(OC(=O)O1)F,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.002,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.333
|
| 38 |
+
O1CCOC1,0.539,COCCOC,0.363,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.075,[Li+].[N+](=O)([O-])[O-],0.023,O,0.0,O,0.0,1.824
|
| 39 |
+
COCCOC,0.257,C(C(F)(F)F)OCC(F)(F)F,0.508,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.235,O,0.0,O,0.0,O,0.0,2.051
|
| 40 |
+
COCCOC,0.906,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.047,[Li+].FP(F)(=O)([O-]),0.047,O,0.0,O,0.0,O,0.0,1.444
|
| 41 |
+
O1CCOC1,0.478,COCCOC,0.322,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.134,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.067,O,0.0,O,0.0,1.854
|
| 42 |
+
CCOCC,0.707,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.293,O,0.0,O,0.0,O,0.0,O,0.0,2.046
|
| 43 |
+
C1COC(=O)O1,0.563,O=C(OCC)OCC,0.31,C1C(OC(=O)O1)F,0.052,[Li+].F[P-](F)(F)(F)(F)F,0.075,O,0.0,O,0.0,1.301
|
| 44 |
+
C1CCOC1,0.942,FC,0.029,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,,O,0.0,O,0.0,O,0.0,2.222
|
| 45 |
+
O1CCOC1,0.478,COCCOC,0.322,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.2,O,0.0,O,0.0,O,0.0,1.903
|
| 46 |
+
COCCOC,0.906,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.094,O,0.0,O,0.0,O,0.0,O,0.0,1.561
|
| 47 |
+
C1C(OC(=O)O1)F,0.149,COC(=O)OCCF,0.178,C(C(F)(F)F)OC(C(F)F)(F)F,0.564,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.108,O,0.0,O,0.0,1.735
|
| 48 |
+
FC(F)COCCOCC(F)(F),0.845,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.155,O,0.0,O,0.0,O,0.0,O,0.0,2.301
|
| 49 |
+
C1C(OC(=O)O1)F,0.495,COC(=O)OC,0.429,O1CCOCCOCCOCC1,0.003,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.498
|
| 50 |
+
C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.069,O,0.0,O,0.0,0.745
|
| 51 |
+
O=S1(=O)CCCC1,0.758,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.235,[Li+].[N+](=O)([O-])[O-],0.007,O,0.0,O,0.0,O,0.0,1.824
|
| 52 |
+
CCOCC,0.856,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.144,O,0.0,O,0.0,O,0.0,O,0.0,2.0
|
| 53 |
+
O=C(OCC)C,0.105,ClCCl,0.64,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.255,O,0.0,O,0.0,O,0.0,1.456
|
| 54 |
+
COCCOCCOCC(F)(F)OC(F)(F)OC(F)(F)COCCOCCOC,0.708,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.292,O,0.0,O,0.0,O,0.0,O,0.0,1.301
|
| 55 |
+
COCCOC,0.583,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.278,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.139,O,0.0,O,0.0,O,0.0,1.678
|
| 56 |
+
C1C(OC(=O)O1)F,0.662,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.338,O,0.0,O,0.0,O,0.0,O,0.0,1.646
|
| 57 |
+
O1CCOC1,0.397,COCCOC,0.589,[Li+][S-]SSS[S-][Li+],,[Li+].[N+](=O)([O-])[O-],0.012,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.002,O,0.0,1.301
|
| 58 |
+
C1COC(=O)O1,0.308,O=C(OCC)OCC(F)(F)F,0.349,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.343,O,0.0,O,0.0,O,0.0,2.046
|
| 59 |
+
C1COC(=O)O1,0.362,O=C(OCC)OCC,0.548,[Li+].F[P-](F)(F)(F)(F)F,0.09,O,0.0,O,0.0,O,0.0,0.788
|
| 60 |
+
C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.001,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.373
|
| 61 |
+
O1CCOCC1,0.912,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.088,O,0.0,O,0.0,O,0.0,O,0.0,1.602
|
| 62 |
+
CC#N,0.621,C1=COC(=O)O1,0.056,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.323,O,0.0,O,0.0,O,0.0,1.854
|
| 63 |
+
COC(=O)OC,0.684,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.316,O,0.0,O,0.0,O,0.0,O,0.0,2.097
|
| 64 |
+
O=S1(=O)CCCC1,0.714,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.286,O,0.0,O,0.0,O,0.0,O,0.0,1.699
|
| 65 |
+
FC(F)(F)COCCOCC(F)(F)(F),0.838,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.162,O,0.0,O,0.0,O,0.0,O,0.0,2.155
|
| 66 |
+
CCOCC,0.64,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.36,O,0.0,O,0.0,O,0.0,O,0.0,2.208
|
| 67 |
+
COC(=O)OC,0.6,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.4,O,0.0,O,0.0,O,0.0,O,0.0,1.77
|
| 68 |
+
CC1COC(=O)O1,0.887,[Li+].F[As-](F)(F)(F)(F)F,0.113,O,0.0,O,0.0,O,0.0,O,0.0,0.824
|
| 69 |
+
C1COC(=O)O1,0.5,CCOC(=O)OC,0.423,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.046,[Li+].O=C1O[B-]2(OC1=O)OC(=O)C(=O)O2,0.031,O,0.0,O,0.0,0.924
|
| 70 |
+
CCOP(=O)(OCC)OCC,0.214,C(C(F)(F)F)OCC(F)(F)F,0.642,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.144,O,0.0,O,0.0,O,0.0,2.097
|
| 71 |
+
COCCOC,0.682,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.318,O,0.0,O,0.0,O,0.0,O,0.0,2.108
|
| 72 |
+
CC1COC(=O)O1,0.922,[LI+].F[B-](F)(F)OC(C(F)(F)(F))(C(F)(F)(F))C(F)(F)(F),0.078,O,0.0,O,0.0,O,0.0,O,0.0,0.712
|
| 73 |
+
C1COC(=O)O1,0.854,CCOC(=O)OC,0.08,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.039,[Li+].O=C1O[B-]2(OC1=O)OC(=O)C(=O)O2,0.026,O,0.0,O,0.0,1.081
|
| 74 |
+
C1COC(=O)O1,0.519,O=C(OCC)OCC,0.387,[Li+].F[P-](F)(F)(F)(F)F,0.082,[Li+].[O-]P(=O)(F)F,0.012,O,0.0,O,0.0,1.319
|
| 75 |
+
COC(=O)CC(F)(F)F,0.768,C1C(OC(=O)O1)F,0.134,[Li+].F[P-](F)(F)(F)(F)F,0.098,O,0.0,O,0.0,O,0.0,1.62
|
| 76 |
+
C1C(OC(=O)O1)F,0.144,COC(=O)OCCF,0.173,C(C(F)(F)F)OC(C(F)F)(F)F,0.548,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.135,O,0.0,O,0.0,2.222
|
| 77 |
+
C1COC(=O)O1,0.326,COC(=O)OC,0.602,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,O,0.0,0.777
|
| 78 |
+
CCOCC,0.877,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.123,O,0.0,O,0.0,O,0.0,O,0.0,2.018
|
| 79 |
+
COC(=O)OC,0.664,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.336,O,0.0,O,0.0,O,0.0,O,0.0,1.886
|
| 80 |
+
C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].F[B-](F)(F)F,0.069,O,0.0,O,0.0,0.699
|
| 81 |
+
CCOP(=O)(OCC)OCC,0.648,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.352,O,0.0,O,0.0,O,0.0,O,0.0,1.569
|
| 82 |
+
C1C(OC(=O)O1)F,0.481,O=C(OCC)OCC,0.432,[Li+].F[P-](F)(F)(F)(F)F,0.087,O,0.0,O,0.0,O,0.0,1.523
|
| 83 |
+
COCCOC,0.231,FC(F)C(F)(F)COC(F)(F)C(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0.0,O,0.0,O,0.0,2.155
|
| 84 |
+
C1C(OC(=O)O1)F,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.001,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.488
|
| 85 |
+
O1CCOC1,0.453,COCCOC,0.305,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.127,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.063,[Li+].[N+](=O)([O-])[O-],0.051,O,0.0,2.046
|
| 86 |
+
C1C(OC(=O)O1)F,0.932,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.068,O,0.0,O,0.0,O,0.0,O,0.0,1.41
|
| 87 |
+
COCCOC,0.139,COCC(F)(F)C(F)(F)C(F)(F)C(F)(F)COC,0.692,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.169,O,0.0,O,0.0,O,0.0,2.222
|
| 88 |
+
C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.431,O1CCOCCOCCOCC1,0.0,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.559
|
| 89 |
+
COCCOC,0.231,FC(COC(OCC(F)(F)F)OCC(F)(F)F)(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0.0,O,0.0,O,0.0,2.301
|
| 90 |
+
CN(C)S(=O)(=O)F,0.921,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.079,O,0.0,O,0.0,O,0.0,O,0.0,1.672
|
| 91 |
+
C1C(OC(=O)O1)F,0.105,C1COC(=O)O1,0.518,O=C(OCC)OCC,0.285,[Li+].F[P-](F)(F)(F)(F)F,0.077,[Rb+].[O-][N+]([O-])=O,0.008,O1CCOCCOCCOCCOCCOCC1,0.008,1.538
|
| 92 |
+
CC1CCC(C)O1,0.893,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.107,O,0.0,O,0.0,O,0.0,O,0.0,1.796
|
| 93 |
+
C1C(OC(=O)O1)F,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.002,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.355
|
| 94 |
+
C1COC(=O)O1,0.444,C1COS(=O)O1,0.497,[Li+].[O-]Cl(=O)(=O)=O,0.059,O,0.0,O,0.0,O,0.0,1.523
|
| 95 |
+
COCCOC,0.371,O1CCOC1,0.552,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.031,[Li+].[N+](=O)([O-])[O-],0.046,O,0.0,O,0.0,1.78
|
| 96 |
+
O=S1(=O)CCCC1,0.764,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.236,O,0.0,O,0.0,O,0.0,O,0.0,1.456
|
| 97 |
+
O1C(C)CCC1,0.908,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.092,O,0.0,O,0.0,O,0.0,O,0.0,1.745
|
| 98 |
+
O1CCOC1,0.362,C(C(F)(F)F)OCC(F)(F)F,0.59,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.048,O,0.0,O,0.0,O,0.0,1.967
|
| 99 |
+
COC(=O)OC,0.543,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.457,O,0.0,O,0.0,O,0.0,O,0.0,2.097
|
| 100 |
+
COCCOC,0.73,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.27,O,0.0,O,0.0,O,0.0,O,0.0,1.143
|
| 101 |
+
O1CCOC1,0.552,COCCOC,0.371,[Li+].[N+](=O)([O-])[O-],0.039,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.039,O,0.0,O,0.0,1.523
|
| 102 |
+
COCCOC,0.242,FC(COC(OCC(F)(F)F)OCC(F)(F)F)(F)F,0.604,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.154,O,0.0,O,0.0,O,0.0,2.301
|
| 103 |
+
CCOP(=O)(OCC)OCC,0.6,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.4,O,0.0,O,0.0,O,0.0,O,0.0,2.155
|
| 104 |
+
C1C(OC(=O)O1)F,0.318,CCOC(=O)OC,0.504,COC(=O)OC,0.094,[Li+].F[P-](F)(F)(F)(F)F,0.083,O,0.0,O,0.0,1.301
|
| 105 |
+
COCCOC,0.231,C(C(F)(F)F)OCC(F)(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0.0,O,0.0,O,0.0,2.222
|
| 106 |
+
C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].F[P-](F)(F)(F)(F)F,0.069,O,0.0,O,0.0,0.699
|
| 107 |
+
COCCOC,0.231,C(C(F)(F)F)OC(=O)OCC(F)(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0.0,O,0.0,O,0.0,1.495
|
| 108 |
+
C1COC(=O)O1,0.32,COC(=O)OC,0.253,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.427,O,0.0,O,0.0,O,0.0,2.155
|
| 109 |
+
C1C(OC(=O)O1)F,0.312,O=C1OCCC1,0.599,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.068,[Li+].[N+](=O)([O-])[O-],0.021,O,0.0,O,0.0,1.921
|
| 110 |
+
COC(=O)OC,0.478,FC(F)C(F)(F)COC(F)(F)C(F)F,0.322,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.067,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.134,O,0.0,O,0.0,1.886
|
| 111 |
+
CCOP(=O)(OCC)OCC,0.259,FC(F)C(F)(F)COC(F)(F)C(F)F,0.556,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.185,O,0.0,O,0.0,O,0.0,2.046
|
| 112 |
+
COCCOC,0.677,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.323,O,0.0,O,0.0,O,0.0,O,0.0,1.745
|
| 113 |
+
C1C(OC(=O)O1)F,0.696,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.304,O,0.0,O,0.0,O,0.0,O,0.0,1.633
|
| 114 |
+
C1CCOC1,0.47,O1C(C)CCC1,0.378,[Li+].F[P-](F)(F)(F)(F)F,0.152,O,0.0,O,0.0,O,0.0,2.097
|
| 115 |
+
FC(F)COCCOCC(F)(F)(F),0.838,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.162,O,0.0,O,0.0,O,0.0,O,0.0,2.301
|
| 116 |
+
C1COC(=O)O1,0.496,COC(=O)OC,0.393,C1C(OC(=O)O1)F,0.045,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.066,O,0.0,O,0.0,1.108
|
| 117 |
+
C1C(OC(=O)O1)F,0.62,C(C(F)(F)F)OC(=O)OCC(F)(F)F,0.291,[Li+].F[P-](F)(F)(F)(F)F,0.089,O,0.0,O,0.0,O,0.0,1.62
|
| 118 |
+
CCOCC,0.906,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.094,O,0.0,O,0.0,O,0.0,O,0.0,1.959
|
| 119 |
+
C1COC(=O)O1,0.526,O=C(OCC)OCC,0.392,[Li+].F[P-](F)(F)(F)(F)F,0.083,O,0.0,O,0.0,O,0.0,1.013
|
| 120 |
+
C1COC(=O)O1,0.05,CCOC(=O)OC,0.237,C(C(F)(F)F)OCC(F)(F)F,0.575,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.123,[Li+].[B-]1(OC(=O)C(=O)O1)(F)F,0.015,O,0.0,1.824
|
| 121 |
+
O=S1(=O)CCCC1,0.429,FC(F)C(F)(F)COC(F)(F)C(F)F,0.429,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.143,O,0.0,O,0.0,O,0.0,1.921
|
data/lce/train_data.csv
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
smiles1,conc1,smiles2,conc2,smiles3,conc3,smiles4,conc4,smiles5,conc5,smiles6,conc6,LCE
|
| 2 |
+
CC1COC(=O)O1,0.875,C1C(OC(=O)O1)F,0.051,[Li+].[O-]Cl(=O)(=O)=O,0.074,O,0,O,0,O,0,0.699
|
| 3 |
+
C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].F[P-](F)(F)(F)(F)F,0.069,O,0,O,0,0.699
|
| 4 |
+
FC(F)COCCOCC(F)(F),0.845,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.155,O,0,O,0,O,0,O,0,2.301
|
| 5 |
+
FC(F)COCCOCC(F)(F)(F),0.838,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.162,O,0,O,0,O,0,O,0,2.301
|
| 6 |
+
CN(C)C(=O)C(F)(F)F,0.362,C1C(OC(=O)O1)F,0.556,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.081,O,0,O,0,O,0,2.155
|
| 7 |
+
COCCOC,0.231,FC1CCCCC1,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0,O,0,O,0,2.155
|
| 8 |
+
CCOP(=O)(OCC)OCC,0.6,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.4,O,0,O,0,O,0,O,0,2.155
|
| 9 |
+
O1CCOC1,0.362,C(C(F)(F)F)OCC(F)(F)F,0.59,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.048,O,0,O,0,O,0,1.967
|
| 10 |
+
COCC(F)(F)C(F)(F)COC,0.864,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.136,O,0,O,0,O,0,O,0,1.991
|
| 11 |
+
C1C(OC(=O)O1)F,0.662,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.338,O,0,O,0,O,0,O,0,1.646
|
| 12 |
+
COCCOC,0.358,O1CCOC1,0.532,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.074,[Li+].[N+](=O)([O-])[O-],0.035,O,0,O,0,1.658
|
| 13 |
+
CN(C)S(=O)(=O)F,0.921,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.079,O,0,O,0,O,0,O,0,1.672
|
| 14 |
+
C1C(OC(=O)O1)F,0.106,C1COC(=O)O1,0.522,O=C(OCC)OCC,0.287,[Li+].F[P-](F)(F)(F)(F)F,0.077,[Rb+].[O-][N+]([O-])=O,0.004,O1CCOCCOCCOCCOCCOCC1,0.004,1.252
|
| 15 |
+
C1COC(=O)O1,0.32,COC(=O)OC,0.253,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.427,O,0,O,0,O,0,2.155
|
| 16 |
+
COCCOC,0.277,FC(F)C(F)(F)COC(F)(F)C(F)F,0.555,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.168,O,0,O,0,O,0,2.155
|
| 17 |
+
COC(=O)OC,0.161,FC(F)C(F)(F)COC(F)(F)C(F)F,0.355,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.484,O,0,O,0,O,0,2.155
|
| 18 |
+
FC(F)(F)COCCOCC,0.838,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.162,O,0,O,0,O,0,O,0,2.155
|
| 19 |
+
FC(F)(F)COCCOCC(F)(F)(F),0.838,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.162,O,0,O,0,O,0,O,0,2.155
|
| 20 |
+
CCOCC,0.64,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.36,O,0,O,0,O,0,O,0,2.208
|
| 21 |
+
C1C(OC(=O)O1)F,0.144,COC(=O)OCCF,0.173,C(C(F)(F)F)OC(C(F)F)(F)F,0.548,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.135,O,0,O,0,2.222
|
| 22 |
+
CC#N,0.882,FC,0.065,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.054,O,0,O,0,O,0,2.222
|
| 23 |
+
C1CCOC1,0.942,FC,0.029,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.029,O,0,O,0,O,0,2.222
|
| 24 |
+
COCCOC,0.139,COCC(F)(F)C(F)(F)C(F)(F)C(F)(F)COC,0.692,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.169,O,0,O,0,O,0,2.222
|
| 25 |
+
COCCOC,0.231,C(C(F)(F)F)OCC(F)(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0,O,0,O,0,2.222
|
| 26 |
+
COCCOC,0.507,COC(C(F)(F)F)C(F)(F)F,0.399,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.095,O,0,O,0,O,0,2.268
|
| 27 |
+
CCOCC,0.313,C(C(F)(F)F)OCC(F)(F)F,0.51,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.177,O,0,O,0,O,0,2.301
|
| 28 |
+
COC(=O)OC,0.29,C(C(F)(F)F)OCC(F)(F)F,0.589,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.121,O,0,O,0,O,0,2.301
|
| 29 |
+
COCCOC,0.242,FC(COC(OCC(F)(F)F)OCC(F)(F)F)(F)F,0.604,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.154,O,0,O,0,O,0,2.301
|
| 30 |
+
O1C(C)CCC1,0.331,FC(F)C(F)(F)COC(F)(F)C(F)F,0.498,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.171,O,0,O,0,O,0,2.301
|
| 31 |
+
COCCOC,0.2,FC(F)C(F)(F)COC(F)(F)C(F)F,0.6,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.2,O,0,O,0,O,0,2.301
|
| 32 |
+
COCCOC,0.231,FC(COC(OCC(F)(F)F)OCC(F)(F)F)(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0,O,0,O,0,2.301
|
| 33 |
+
O=S1(=O)CCCC1,0.359,C(C(F)(F)F)OC(C(F)F)(F)F,0.504,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.133,[Li+].[N+](=O)([O-])[O-],0.004,O,0,O,0,2
|
| 34 |
+
CCOCC,0.856,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.144,O,0,O,0,O,0,O,0,2
|
| 35 |
+
CCOCC,0.877,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.123,O,0,O,0,O,0,O,0,2.018
|
| 36 |
+
CCOCC,0.707,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.293,O,0,O,0,O,0,O,0,2.046
|
| 37 |
+
C1COC(=O)O1,0.308,O=C(OCC)OCC(F)(F)F,0.349,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.343,O,0,O,0,O,0,2.046
|
| 38 |
+
O1CCOC1,0.453,COCCOC,0.305,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.127,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.063,[Li+].[N+](=O)([O-])[O-],0.051,O,0,2.046
|
| 39 |
+
CCOP(=O)(OCC)OCC,0.259,FC(F)C(F)(F)COC(F)(F)C(F)F,0.556,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.185,O,0,O,0,O,0,2.046
|
| 40 |
+
COCCOC,0.257,C(C(F)(F)F)OCC(F)(F)F,0.508,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.235,O,0,O,0,O,0,2.051
|
| 41 |
+
COC(=O)OC,0.299,C(C(F)(F)F)OCC(F)(F)F,0.598,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.103,O,0,O,0,O,0,2.056
|
| 42 |
+
CCOP(=O)(OCC)OCC,0.214,C(C(F)(F)F)OCC(F)(F)F,0.642,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.144,O,0,O,0,O,0,2.097
|
| 43 |
+
COC(=O)OC,0.684,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.316,O,0,O,0,O,0,O,0,2.097
|
| 44 |
+
C1CCOC1,0.47,O1C(C)CCC1,0.378,[Li+].F[P-](F)(F)(F)(F)F,0.152,O,0,O,0,O,0,2.097
|
| 45 |
+
C1C(OC(=O)O1)F,0.264,COC(=O)OCCF,0.479,C(C(F)(F)F)OC(C(F)F)(F)F,0.155,[Li+].F[P-](F)(F)(F)(F)F,0.103,O,0,O,0,2.097
|
| 46 |
+
CCOP(=O)(OCC)OCC,0.5,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.5,O,0,O,0,O,0,O,0,2.097
|
| 47 |
+
COC(=O)OC,0.543,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.457,O,0,O,0,O,0,O,0,2.097
|
| 48 |
+
COCCOC,0.682,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.318,O,0,O,0,O,0,O,0,2.108
|
| 49 |
+
COCCOC,0.231,FC(F)C(F)(F)COC(F)(F)C(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0,O,0,O,0,2.155
|
| 50 |
+
CCOP(=O)(OCC)OCC,0.728,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.272,O,0,O,0,O,0,O,0,2
|
| 51 |
+
COCCOC,0.583,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.278,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.139,O,0,O,0,O,0,1.678
|
| 52 |
+
C1COC(=O)O1,0.305,COC(=O)OC,0.242,COCCOCCOCCOCCOC,0.392,[Li+].F[P-](F)(F)(F)(F)F,0.041,[Li+].[N+](=O)([O-])[O-],0.02,O,0,1.678
|
| 53 |
+
C1C(OC(=O)O1)F,0.318,CCOC(=O)OC,0.504,COC(=O)OC,0.094,B(O[Si](C)(C)C)(O[Si](C)(C)C)O[Si](C)(C),0.083,[Li+].F[P-](F)(F)(F)(F)F,0.001,O,0,1.678
|
| 54 |
+
O=S1(=O)CCCC1,0.714,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.286,O,0,O,0,O,0,O,0,1.699
|
| 55 |
+
C1C(OC(=O)O1)F,0.149,COC(=O)OCCF,0.178,C(C(F)(F)F)OC(C(F)F)(F)F,0.564,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.108,O,0,O,0,1.735
|
| 56 |
+
O=S1(=O)CCCC1,0.75,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.25,O,0,O,0,O,0,O,0,1.745
|
| 57 |
+
COC(=O)OC,0.29,C(C(F)(F)F)OCC(F)(F)F,0.589,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.121,O,0,O,0,O,0,1.745
|
| 58 |
+
COCCOC,0.677,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.323,O,0,O,0,O,0,O,0,1.745
|
| 59 |
+
O1C(C)CCC1,0.908,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.092,O,0,O,0,O,0,O,0,1.745
|
| 60 |
+
COC(=O)OC,0.6,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.4,O,0,O,0,O,0,O,0,1.77
|
| 61 |
+
COCCOC,0.371,O1CCOC1,0.552,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.031,[Li+].[N+](=O)([O-])[O-],0.046,O,0,O,0,1.78
|
| 62 |
+
CC1CCC(C)O1,0.893,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.107,O,0,O,0,O,0,O,0,1.796
|
| 63 |
+
C1COC(=O)O1,0.05,CCOC(=O)OC,0.237,C(C(F)(F)F)OCC(F)(F)F,0.575,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.123,[Li+].[B-]1(OC(=O)C(=O)O1)(F)F,0.015,O,0,1.824
|
| 64 |
+
O=S1(=O)CCCC1,0.758,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.235,[Li+].[N+](=O)([O-])[O-],0.007,O,0,O,0,O,0,1.824
|
| 65 |
+
O1CCOC1,0.463,COCCOC,0.312,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.194,[Li+].[N+](=O)([O-])[O-],0.03,O,0,O,0,1.824
|
| 66 |
+
O1CCOC1,0.539,COCCOC,0.363,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.075,[Li+].[N+](=O)([O-])[O-],0.023,O,0,O,0,1.824
|
| 67 |
+
COC(=O)OC,0.375,FC(F)C(F)(F)COC(F)(F)C(F)F,0.375,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.25,O,0,O,0,O,0,1.854
|
| 68 |
+
O1CCOC1,0.478,COCCOC,0.322,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.134,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.067,O,0,O,0,1.854
|
| 69 |
+
CC#N,0.621,C1=COC(=O)O1,0.056,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.323,O,0,O,0,O,0,1.854
|
| 70 |
+
COC(=O)OC,0.478,FC(F)C(F)(F)COC(F)(F)C(F)F,0.322,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.067,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.134,O,0,O,0,1.886
|
| 71 |
+
COC(=O)OC,0.664,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.336,O,0,O,0,O,0,O,0,1.886
|
| 72 |
+
O1CCOC1,0.478,COCCOC,0.322,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.2,O,0,O,0,O,0,1.903
|
| 73 |
+
O=S1(=O)CCCC1,0.429,FC(F)C(F)(F)COC(F)(F)C(F)F,0.429,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.143,O,0,O,0,O,0,1.921
|
| 74 |
+
C1C(OC(=O)O1)F,0.312,O=C1OCCC1,0.599,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.068,[Li+].[N+](=O)([O-])[O-],0.021,O,0,O,0,1.921
|
| 75 |
+
CC1COC(=O)O1,0.595,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.405,O,0,O,0,O,0,O,0,1.921
|
| 76 |
+
O1CCOC1,0.371,COCCOC,0.552,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.077,O,0,O,0,O,0,1.959
|
| 77 |
+
CCOCC,0.906,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.094,O,0,O,0,O,0,O,0,1.959
|
| 78 |
+
C1CCOC1,0.925,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.075,O,0,O,0,O,0,O,0,1.377
|
| 79 |
+
C1COC(=O)O1,0.425,O=C(OCC)OCC,0.234,[Li+].F[P-](F)(F)(F)(F)F,0.34,O,0,O,0,O,0,1.398
|
| 80 |
+
C1C(OC(=O)O1)F,0.932,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.068,O,0,O,0,O,0,O,0,1.41
|
| 81 |
+
COCCOC,0.906,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.047,[Li+].FP(F)(=O)([O-]),0.047,O,0,O,0,O,0,1.444
|
| 82 |
+
O=S1(=O)CCCC1,0.764,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.236,O,0,O,0,O,0,O,0,1.456
|
| 83 |
+
O=C(OCC)C,0.105,ClCCl,0.64,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.255,O,0,O,0,O,0,1.456
|
| 84 |
+
C1C(OC(=O)O1)F,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.001,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.488
|
| 85 |
+
C1C(OC(=O)O1)F,0.873,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.127,O,0,O,0,O,0,O,0,1.489
|
| 86 |
+
COCCOC,0.231,C(C(F)(F)F)OC(=O)OCC(F)(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0,O,0,O,0,1.495
|
| 87 |
+
C1C(OC(=O)O1)F,0.495,COC(=O)OC,0.429,O1CCOCCOCCOCC1,0.003,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.498
|
| 88 |
+
C1C(OC(=O)O1)F,0.481,O=C(OCC)OCC,0.432,[Li+].F[P-](F)(F)(F)(F)F,0.087,O,0,O,0,O,0,1.523
|
| 89 |
+
O1CCOC1,0.322,COCCOC,0.478,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,0.2,O,0,O,0,O,0,1.523
|
| 90 |
+
O1CCOC1,0.552,COCCOC,0.371,[Li+].[N+](=O)([O-])[O-],0.039,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.039,O,0,O,0,1.523
|
| 91 |
+
C1COC(=O)O1,0.444,C1COS(=O)O1,0.497,[Li+].[O-]Cl(=O)(=O)=O,0.059,O,0,O,0,O,0,1.523
|
| 92 |
+
C1C(OC(=O)O1)F,0.105,C1COC(=O)O1,0.518,O=C(OCC)OCC,0.285,[Li+].F[P-](F)(F)(F)(F)F,0.077,[Rb+].[O-][N+]([O-])=O,0.008,O1CCOCCOCCOCCOCCOCC1,0.008,1.538
|
| 93 |
+
C1C(OC(=O)O1)F,0.82,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.18,O,0,O,0,O,0,O,0,1.544
|
| 94 |
+
C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.431,O1CCOCCOCCOCC1,0,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.559
|
| 95 |
+
COCCOC,0.906,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.094,O,0,O,0,O,0,O,0,1.561
|
| 96 |
+
CCOP(=O)(OCC)OCC,0.648,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.352,O,0,O,0,O,0,O,0,1.569
|
| 97 |
+
O=S1(=O)CCCC1,0.25,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.75,O,0,O,0,O,0,O,0,1.569
|
| 98 |
+
C1C(OC(=O)O1)F,0.774,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.226,O,0,O,0,O,0,O,0,1.587
|
| 99 |
+
C1C(OC(=O)O1)F,0.413,O=C(OCC)OCC,0.497,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.09,O,0,O,0,O,0,1.59
|
| 100 |
+
C1COC(=O)O1,0.425,O=C(OCC)OCC(F)(F)F,0.481,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.094,O,0,O,0,O,0,1.602
|
| 101 |
+
CC1COC(=O)O1,0.702,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.298,O,0,O,0,O,0,O,0,1.602
|
| 102 |
+
O1CCOCC1,0.912,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.088,O,0,O,0,O,0,O,0,1.602
|
| 103 |
+
C1C(OC(=O)O1)F,0.62,C(C(F)(F)F)OC(=O)OCC(F)(F)F,0.291,[Li+].F[P-](F)(F)(F)(F)F,0.089,O,0,O,0,O,0,1.62
|
| 104 |
+
COC(=O)CC(F)(F)F,0.768,C1C(OC(=O)O1)F,0.134,[Li+].F[P-](F)(F)(F)(F)F,0.098,O,0,O,0,O,0,1.62
|
| 105 |
+
C1C(OC(=O)O1)F,0.733,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.267,O,0,O,0,O,0,O,0,1.629
|
| 106 |
+
C1C(OC(=O)O1)F,0.696,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.304,O,0,O,0,O,0,O,0,1.633
|
| 107 |
+
COC(C)C(C)OC,0.879,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.121,O,0,O,0,O,0,O,0,1.638
|
| 108 |
+
C1COC(=O)O1,0.197,COC(=O)OC,0.156,COCCOCCOCCOCCOC,0.59,[Li+].F[P-](F)(F)(F)(F)F,0.026,[Li+].[N+](=O)([O-])[O-],0.031,O,0,1.638
|
| 109 |
+
C1COC(=O)O1,0.338,COC(=O)OC,0.625,[Li+].[O-]P(=O)(F)F,0.008,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.03,O,0,O,0,1.26
|
| 110 |
+
COCCOC,0.707,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.147,[Li+].[B-]1(OC(=O)C(=O)O1)(F)F,0.147,O,0,O,0,O,0,1.268
|
| 111 |
+
C1COC(=O)O1,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.002,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.276
|
| 112 |
+
COCCOC,0.763,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.174,[Li+].[O-]P(=O)(F)F,0.063,O,0,O,0,O,0,1.292
|
| 113 |
+
C1COC(=O)O1,0.563,O=C(OCC)OCC,0.31,C1C(OC(=O)O1)F,0.052,[Li+].F[P-](F)(F)(F)(F)F,0.075,O,0,O,0,1.301
|
| 114 |
+
COCCOCCOCC(F)(F)OC(F)(F)OC(F)(F)COCCOCCOC,0.708,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.292,O,0,O,0,O,0,O,0,1.301
|
| 115 |
+
C1COC(=O)O1,0.331,O=C(OCC)OCC,0.577,[Li+].[B-]1(OC(=O)C(=O)O1)(F)F,0.092,O,0,O,0,O,0,1.301
|
| 116 |
+
C1C(OC(=O)O1)F,0.318,CCOC(=O)OC,0.504,COC(=O)OC,0.094,[Li+].F[P-](F)(F)(F)(F)F,0.083,O,0,O,0,1.301
|
| 117 |
+
C1COC(=O)O1,0.519,O=C(OCC)OCC,0.387,[Li+].F[P-](F)(F)(F)(F)F,0.082,[Li+].[O-]P(=O)(F)F,0.012,O,0,O,0,1.319
|
| 118 |
+
C1C(OC(=O)O1)F,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.002,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.333
|
| 119 |
+
COCCOC,0.259,B(OCC(F)(F)F)(OCC(F)(F)F)OCC(F)(F)F,0.556,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.185,O,0,O,0,O,0,1.337
|
| 120 |
+
C1C(OC(=O)O1)F,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.002,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.355
|
| 121 |
+
COCCOC,0.763,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.237,O,0,O,0,O,0,O,0,1.367
|
| 122 |
+
C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.001,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.373
|
| 123 |
+
C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].F[B-](F)(F)F,0.069,O,0,O,0,0.699
|
| 124 |
+
CC1COC(=O)O1,0.922,[Li+].F[B-](F)(F)OC(C(F)(F)(F))(C(F)(F)(F))C(F)(F)(F),0.078,O,0,O,0,O,0,O,0,0.712
|
| 125 |
+
C1COC(=O)O1,0.3,CCOC(=O)OC,0.593,C1=COC(=O)O1,0.026,[Li+].F[P-](F)(F)(F)(F)F,0.081,O,0,O,0,0.745
|
| 126 |
+
C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.069,O,0,O,0,0.745
|
| 127 |
+
C1COC(=O)O1,0.326,COC(=O)OC,0.602,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,O,0,0.777
|
| 128 |
+
C1COC(=O)O1,0.362,O=C(OCC)OCC,0.548,[Li+].F[P-](F)(F)(F)(F)F,0.09,O,0,O,0,O,0,0.788
|
| 129 |
+
CC1COC(=O)O1,0.887,[Li+].F[As-](F)(F)(F)(F)F,0.113,O,0,O,0,O,0,O,0,0.824
|
| 130 |
+
C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].C(C(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(C(F)(F)F)(F)F)(F)(F)F,0.069,O,0,O,0,0.854
|
| 131 |
+
C1COC(=O)O1,0.359,COC(=O)OC,0.569,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,O,0,0.854
|
| 132 |
+
C1COC(=O)O1,0.331,O=C(OCC)OCC,0.577,[Li+].F[P-](F)(F)(F)(F)F,0.092,O,0,O,0,O,0,0.886
|
| 133 |
+
C1COC(=O)O1,0.594,O=C(OCC)OCC,0.327,[Li+].F[P-](F)(F)(F)(F)F,0.079,O,0,O,0,O,0,0.921
|
| 134 |
+
C1COC(=O)O1,0.5,CCOC(=O)OC,0.423,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.046,[Li+].O=C1O[B-]2(OC1=O)OC(=O)C(=O)O2,0.031,O,0,O,0,0.924
|
| 135 |
+
C1COC(=O)O1,0.526,O=C(OCC)OCC,0.392,[Li+].F[P-](F)(F)(F)(F)F,0.083,O,0,O,0,O,0,1.013
|
| 136 |
+
C1COC(=O)O1,0.356,COC(=O)OC,0.566,FC(F)(F)COB(OCC(F)(F)F)OCC(F)(F)F,0.007,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.046
|
| 137 |
+
C1COC(=O)O1,0.682,CCOC(=O)OC,0.247,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.043,[Li+].O=C1O[B-]2(OC1=O)OC(=O)C(=O)O2,0.028,O,0,O,0,1.076
|
| 138 |
+
C1COC(=O)O1,0.854,CCOC(=O)OC,0.08,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.039,[Li+].O=C1O[B-]2(OC1=O)OC(=O)C(=O)O2,0.026,O,0,O,0,1.081
|
| 139 |
+
C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.431,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,O,0,1.085
|
| 140 |
+
C1C(OC(=O)O1)F,0.107,C1COC(=O)O1,0.526,O=C(OCC)OCC,0.289,[Li+].F[P-](F)(F)(F)(F)F,0.078,O,0,O,0,1.108
|
| 141 |
+
C1COC(=O)O1,0.496,COC(=O)OC,0.393,C1C(OC(=O)O1)F,0.045,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.066,O,0,O,0,1.108
|
| 142 |
+
COCCOC,0.73,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.27,O,0,O,0,O,0,O,0,1.143
|
| 143 |
+
C1COC(=O)O1,0.327,O=C(OCC)OCC,0.594,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.079,O,0,O,0,O,0,1.155
|
| 144 |
+
C1COC(=O)O1,0.338,COC(=O)OC,0.625,[Li+].[O-]P(=O)(F)F,0.008,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.03,O,0,O,0,1.194
|
| 145 |
+
COCCOC,0.731,[Li+].[O-]P(=O)(F)F,0.064,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.205,O,0,O,0,O,0,1.215
|
| 146 |
+
COCCOCCOCCOCCOC,0.819,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.181,O,0,O,0,O,0,O,0,1.222
|
| 147 |
+
C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.225
|
| 148 |
+
COCCOC,0.706,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.008,[Li+].[O-]P(=O)(F)F,0.286,O,0,O,0,O,0,1.244
|
models/.DS_Store
CHANGED
|
Binary files a/models/.DS_Store and b/models/.DS_Store differ
|
|
|
models/.gitattributes
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.csv filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
models/__pycache__/fm4m.cpython-310.pyc
CHANGED
|
Binary files a/models/__pycache__/fm4m.cpython-310.pyc and b/models/__pycache__/fm4m.cpython-310.pyc differ
|
|
|
models/fm4m.py
CHANGED
|
@@ -25,9 +25,17 @@ from sklearn.preprocessing import MinMaxScaler
|
|
| 25 |
import torch
|
| 26 |
from transformers import AutoTokenizer, AutoModel
|
| 27 |
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
datasets = {}
|
| 33 |
models = {}
|
|
@@ -48,7 +56,7 @@ def avail_models_data():
|
|
| 48 |
|
| 49 |
|
| 50 |
models = [{"Name": "bart","Model Name": "SELFIES-TED","Description": "BART model for string based SELFIES modality", "Timestamp": "2024-06-21 12:32:20"},
|
| 51 |
-
{"Name": "mol-xl","Model Name": "
|
| 52 |
{"Name": "mhg", "Model Name": "MHG-GED","Description": "Molecular hypergraph model", "Timestamp": "2024-07-10 00:09:42"},
|
| 53 |
{"Name": "smi-ted", "Model Name": "SMI-TED","Description": "SMILES based encoder decoder model", "Timestamp": "2024-07-10 00:09:42"}]
|
| 54 |
|
|
@@ -58,8 +66,10 @@ def avail_models(raw=False):
|
|
| 58 |
|
| 59 |
models = [{"Name": "smi-ted", "Model Name": "SMI-TED","Description": "SMILES based encoder decoder model"},
|
| 60 |
{"Name": "bart","Model Name": "SELFIES-TED","Description": "BART model for string based SELFIES modality"},
|
| 61 |
-
{"Name": "mol-xl","Model Name": "
|
| 62 |
{"Name": "mhg", "Model Name": "MHG-GED","Description": "Molecular hypergraph model"},
|
|
|
|
|
|
|
| 63 |
]
|
| 64 |
|
| 65 |
|
|
@@ -70,12 +80,22 @@ def avail_models(raw=False):
|
|
| 70 |
|
| 71 |
return models
|
| 72 |
|
| 73 |
-
def avail_downstream_models():
|
| 74 |
global downstream_models
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
def avail_datasets():
|
| 81 |
global datasets
|
|
@@ -178,13 +198,15 @@ def update_downstream_model_list(list_model):
|
|
| 178 |
|
| 179 |
avail_models_data()
|
| 180 |
|
|
|
|
|
|
|
| 181 |
def get_representation(train_data,test_data,model_type, return_tensor=True):
|
| 182 |
alias = {"MHG-GED": "mhg", "SELFIES-TED": "bart", "MolFormer": "mol-xl", "Molformer": "mol-xl", "SMI-TED": "smi-ted"}
|
| 183 |
if model_type in alias.keys():
|
| 184 |
model_type = alias[model_type]
|
| 185 |
|
| 186 |
if model_type == "mhg":
|
| 187 |
-
model = mhg.load("models/mhg_model/pickles/mhggnn_pretrained_model_0724_2023.pickle")
|
| 188 |
with torch.no_grad():
|
| 189 |
train_emb = model.encode(train_data)
|
| 190 |
x_batch = torch.stack(train_emb)
|
|
@@ -196,7 +218,6 @@ def get_representation(train_data,test_data,model_type, return_tensor=True):
|
|
| 196 |
x_batch_test = pd.DataFrame(x_batch_test)
|
| 197 |
|
| 198 |
|
| 199 |
-
|
| 200 |
elif model_type == "bart":
|
| 201 |
model = bart()
|
| 202 |
model.load()
|
|
@@ -204,7 +225,7 @@ def get_representation(train_data,test_data,model_type, return_tensor=True):
|
|
| 204 |
x_batch_test = model.encode(test_data, return_tensor=return_tensor)
|
| 205 |
|
| 206 |
elif model_type == "smi-ted":
|
| 207 |
-
model = load_smi_ted(folder='
|
| 208 |
with torch.no_grad():
|
| 209 |
x_batch = model.encode(train_data, return_torch=return_tensor)
|
| 210 |
x_batch_test = model.encode(test_data, return_torch=return_tensor)
|
|
@@ -237,35 +258,78 @@ def get_representation(train_data,test_data,model_type, return_tensor=True):
|
|
| 237 |
if not return_tensor:
|
| 238 |
x_batch = pd.DataFrame(x_batch)
|
| 239 |
x_batch_test = pd.DataFrame(x_batch_test)
|
| 240 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
|
| 242 |
return x_batch, x_batch_test
|
| 243 |
|
| 244 |
-
def single_modal(model,dataset, downstream_model,params):
|
| 245 |
print(model)
|
| 246 |
-
alias = {"MHG-GED":"mhg", "SELFIES-TED": "bart", "MolFormer":"mol-xl", "SMI-TED": "smi-ted"}
|
| 247 |
data = avail_models(raw=True)
|
| 248 |
df = pd.DataFrame(data)
|
| 249 |
-
print(list(df["Name"].values))
|
| 250 |
-
|
| 251 |
-
|
|
|
|
|
|
|
| 252 |
model_type = alias[model]
|
| 253 |
-
else:
|
| 254 |
-
model_type = model
|
| 255 |
else:
|
| 256 |
print("Model not available")
|
| 257 |
return
|
|
|
|
| 258 |
|
| 259 |
data = avail_datasets()
|
| 260 |
df = pd.DataFrame(data)
|
| 261 |
-
print(list(df["Dataset"].values))
|
| 262 |
|
| 263 |
if dataset in list(df["Dataset"].values):
|
| 264 |
task = dataset
|
| 265 |
-
with open(f"
|
| 266 |
x_batch, y_batch, x_batch_test, y_batch_test = pickle.load(f1)
|
| 267 |
print(f" Representation loaded successfully")
|
| 268 |
-
|
|
|
|
| 269 |
|
| 270 |
print("Custom Dataset")
|
| 271 |
#return
|
|
@@ -283,14 +347,40 @@ def single_modal(model,dataset, downstream_model,params):
|
|
| 283 |
|
| 284 |
print(f" Representation loaded successfully")
|
| 285 |
|
|
|
|
| 286 |
|
| 287 |
-
|
| 288 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
|
| 290 |
print(f" Calculating ROC AUC Score ...")
|
| 291 |
|
| 292 |
if downstream_model == "XGBClassifier":
|
| 293 |
-
|
|
|
|
|
|
|
|
|
|
| 294 |
xgb_predict_concat.fit(x_batch, y_batch)
|
| 295 |
|
| 296 |
y_prob = xgb_predict_concat.predict_proba(x_batch_test)[:, 1]
|
|
@@ -300,21 +390,26 @@ def single_modal(model,dataset, downstream_model,params):
|
|
| 300 |
print(f"ROC-AUC Score: {roc_auc:.4f}")
|
| 301 |
|
| 302 |
try:
|
| 303 |
-
with open(f"
|
| 304 |
class_0,class_1 = pickle.load(f1)
|
| 305 |
except:
|
| 306 |
print("Generating latent plots")
|
| 307 |
reducer = umap.UMAP(metric='euclidean', n_neighbors=10, n_components=2, low_memory=True, min_dist=0.1,
|
| 308 |
verbose=False)
|
| 309 |
n_samples = np.minimum(1000, len(x_batch))
|
| 310 |
-
|
| 311 |
try:x = y_batch.values[:n_samples]
|
| 312 |
-
except:x = y_batch[:n_samples]
|
| 313 |
index_0 = [index for index in range(len(x)) if x[index] == 0]
|
| 314 |
index_1 = [index for index in range(len(x)) if x[index] == 1]
|
| 315 |
|
| 316 |
-
|
| 317 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
print("Generating latent plots : Done")
|
| 319 |
|
| 320 |
#vizualize(roc_auc,fpr, tpr, x_batch, y_batch )
|
|
@@ -334,20 +429,29 @@ def single_modal(model,dataset, downstream_model,params):
|
|
| 334 |
print(f"ROC-AUC Score: {roc_auc:.4f}")
|
| 335 |
|
| 336 |
try:
|
| 337 |
-
with open(f"
|
| 338 |
class_0,class_1 = pickle.load(f1)
|
| 339 |
except:
|
| 340 |
print("Generating latent plots")
|
| 341 |
reducer = umap.UMAP(metric='euclidean', n_neighbors= 10, n_components=2, low_memory=True, min_dist=0.1, verbose=False)
|
| 342 |
n_samples = np.minimum(1000,len(x_batch))
|
| 343 |
-
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
| 344 |
-
try:x = y_batch.values[:n_samples]
|
| 345 |
-
except:x = y_batch[:n_samples]
|
| 346 |
-
index_0 = [index for index in range(len(x)) if x[index] == 0]
|
| 347 |
-
index_1 = [index for index in range(len(x)) if x[index] == 1]
|
| 348 |
|
| 349 |
-
|
| 350 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
print("Generating latent plots : Done")
|
| 352 |
|
| 353 |
#vizualize(roc_auc,fpr, tpr, x_batch, y_batch )
|
|
@@ -355,16 +459,19 @@ def single_modal(model,dataset, downstream_model,params):
|
|
| 355 |
result = f"ROC-AUC Score: {roc_auc:.4f}"
|
| 356 |
|
| 357 |
return result, roc_auc,fpr, tpr, class_0, class_1
|
| 358 |
-
|
| 359 |
elif downstream_model == "SVR":
|
| 360 |
-
|
|
|
|
|
|
|
|
|
|
| 361 |
model = TransformedTargetRegressor(regressor= regressor,
|
| 362 |
transformer = MinMaxScaler(feature_range=(-1, 1))
|
| 363 |
).fit(x_batch,y_batch)
|
| 364 |
-
|
| 365 |
y_prob = model.predict(x_batch_test)
|
| 366 |
RMSE_score = np.sqrt(mean_squared_error(y_batch_test, y_prob))
|
| 367 |
-
|
| 368 |
print(f"RMSE Score: {RMSE_score:.4f}")
|
| 369 |
result = f"RMSE Score: {RMSE_score:.4f}"
|
| 370 |
|
|
@@ -372,20 +479,28 @@ def single_modal(model,dataset, downstream_model,params):
|
|
| 372 |
reducer = umap.UMAP(metric='euclidean', n_neighbors=10, n_components=2, low_memory=True, min_dist=0.1,
|
| 373 |
verbose=False)
|
| 374 |
n_samples = np.minimum(1000, len(x_batch))
|
| 375 |
-
|
| 376 |
-
try:x = y_batch.values[:n_samples]
|
| 377 |
-
except:x = y_batch[:n_samples]
|
| 378 |
#index_0 = [index for index in range(len(x)) if x[index] == 0]
|
| 379 |
#index_1 = [index for index in range(len(x)) if x[index] == 1]
|
| 380 |
|
| 381 |
-
|
| 382 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
print("Generating latent plots : Done")
|
| 384 |
-
|
| 385 |
return result, RMSE_score,y_batch_test, y_prob, class_0, class_1
|
| 386 |
|
| 387 |
elif downstream_model == "Kernel Ridge":
|
| 388 |
-
|
|
|
|
|
|
|
|
|
|
| 389 |
model = TransformedTargetRegressor(regressor=regressor,
|
| 390 |
transformer=MinMaxScaler(feature_range=(-1, 1))
|
| 391 |
).fit(x_batch, y_batch)
|
|
@@ -401,8 +516,8 @@ def single_modal(model,dataset, downstream_model,params):
|
|
| 401 |
verbose=False)
|
| 402 |
n_samples = np.minimum(1000, len(x_batch))
|
| 403 |
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
| 404 |
-
try:x = y_batch.values[:n_samples]
|
| 405 |
-
except:x = y_batch[:n_samples]
|
| 406 |
# index_0 = [index for index in range(len(x)) if x[index] == 0]
|
| 407 |
# index_1 = [index for index in range(len(x)) if x[index] == 1]
|
| 408 |
|
|
@@ -414,7 +529,10 @@ def single_modal(model,dataset, downstream_model,params):
|
|
| 414 |
|
| 415 |
|
| 416 |
elif downstream_model == "Linear Regression":
|
| 417 |
-
|
|
|
|
|
|
|
|
|
|
| 418 |
model = TransformedTargetRegressor(regressor=regressor,
|
| 419 |
transformer=MinMaxScaler(feature_range=(-1, 1))
|
| 420 |
).fit(x_batch, y_batch)
|
|
@@ -431,7 +549,7 @@ def single_modal(model,dataset, downstream_model,params):
|
|
| 431 |
n_samples = np.minimum(1000, len(x_batch))
|
| 432 |
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
| 433 |
try:x = y_batch.values[:n_samples]
|
| 434 |
-
except:x = y_batch[:n_samples]
|
| 435 |
# index_0 = [index for index in range(len(x)) if x[index] == 0]
|
| 436 |
# index_1 = [index for index in range(len(x)) if x[index] == 1]
|
| 437 |
|
|
@@ -460,7 +578,7 @@ def single_modal(model,dataset, downstream_model,params):
|
|
| 460 |
n_samples = np.minimum(1000, len(x_batch))
|
| 461 |
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
| 462 |
try:x = y_batch.values[:n_samples]
|
| 463 |
-
except:x = y_batch[:n_samples]
|
| 464 |
# index_0 = [index for index in range(len(x)) if x[index] == 0]
|
| 465 |
# index_1 = [index for index in range(len(x)) if x[index] == 1]
|
| 466 |
|
|
@@ -469,10 +587,10 @@ def single_modal(model,dataset, downstream_model,params):
|
|
| 469 |
print("Generating latent plots : Done")
|
| 470 |
|
| 471 |
return result, RMSE_score, y_batch_test, y_prob, class_0, class_1
|
|
|
|
| 472 |
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
print(model_list)
|
| 476 |
data = avail_datasets()
|
| 477 |
df = pd.DataFrame(data)
|
| 478 |
list(df["Dataset"].values)
|
|
@@ -480,7 +598,7 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
| 480 |
if dataset in list(df["Dataset"].values):
|
| 481 |
task = dataset
|
| 482 |
predefined = True
|
| 483 |
-
|
| 484 |
predefined = False
|
| 485 |
components = dataset.split(",")
|
| 486 |
train_data = pd.read_csv(components[0])[components[2]]
|
|
@@ -490,13 +608,18 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
| 490 |
y_batch_test = pd.read_csv(components[1])[components[3]]
|
| 491 |
|
| 492 |
print("Custom Dataset loaded")
|
| 493 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
|
| 495 |
data = avail_models(raw=True)
|
| 496 |
df = pd.DataFrame(data)
|
| 497 |
list(df["Name"].values)
|
| 498 |
|
| 499 |
-
alias = {"MHG-GED":"mhg", "SELFIES-TED": "bart", "MolFormer":"mol-xl", "SMI-TED":"smi-ted"}
|
| 500 |
#if set(model_list).issubset(list(df["Name"].values)):
|
| 501 |
if set(model_list).issubset(list(alias.keys())):
|
| 502 |
for i, model in enumerate(model_list):
|
|
@@ -507,7 +630,7 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
| 507 |
|
| 508 |
if i == 0:
|
| 509 |
if predefined:
|
| 510 |
-
with open(f"
|
| 511 |
x_batch, y_batch, x_batch_test, y_batch_test = pickle.load(f1)
|
| 512 |
print(f" Loaded representation/{task}_{model_type}.pkl")
|
| 513 |
else:
|
|
@@ -517,7 +640,7 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
| 517 |
|
| 518 |
else:
|
| 519 |
if predefined:
|
| 520 |
-
with open(f"
|
| 521 |
x_batch_1, y_batch_1, x_batch_test_1, y_batch_test_1 = pickle.load(f1)
|
| 522 |
print(f" Loaded representation/{task}_{model_type}.pkl")
|
| 523 |
else:
|
|
@@ -528,7 +651,6 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
| 528 |
x_batch = pd.concat([x_batch, x_batch_1], axis=1)
|
| 529 |
x_batch_test = pd.concat([x_batch_test, x_batch_test_1], axis=1)
|
| 530 |
|
| 531 |
-
|
| 532 |
else:
|
| 533 |
print("Model not available")
|
| 534 |
return
|
|
@@ -538,11 +660,31 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
| 538 |
|
| 539 |
num_columns = x_batch.shape[1]
|
| 540 |
x_batch.columns = [f'{i + 1}' for i in range(num_columns)]
|
| 541 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 542 |
|
| 543 |
print(f"Representations loaded successfully")
|
| 544 |
try:
|
| 545 |
-
with open(f"
|
| 546 |
class_0, class_1 = pickle.load(f1)
|
| 547 |
except:
|
| 548 |
print("Generating latent plots")
|
|
@@ -552,7 +694,7 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
| 552 |
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
| 553 |
|
| 554 |
if "Classifier" in downstream_model:
|
| 555 |
-
try:x = y_batch.values[:n_samples]
|
| 556 |
except: x = y_batch[:n_samples]
|
| 557 |
index_0 = [index for index in range(len(x)) if x[index] == 0]
|
| 558 |
index_1 = [index for index in range(len(x)) if x[index] == 1]
|
|
@@ -570,7 +712,10 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
| 570 |
|
| 571 |
|
| 572 |
if downstream_model == "XGBClassifier":
|
| 573 |
-
|
|
|
|
|
|
|
|
|
|
| 574 |
xgb_predict_concat.fit(x_batch, y_batch)
|
| 575 |
|
| 576 |
y_prob = xgb_predict_concat.predict_proba(x_batch_test)[:, 1]
|
|
@@ -608,21 +753,27 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
| 608 |
return result, roc_auc,fpr, tpr, class_0, class_1
|
| 609 |
|
| 610 |
elif downstream_model == "SVR":
|
| 611 |
-
|
|
|
|
|
|
|
|
|
|
| 612 |
model = TransformedTargetRegressor(regressor= regressor,
|
| 613 |
transformer = MinMaxScaler(feature_range=(-1, 1))
|
| 614 |
).fit(x_batch,y_batch)
|
| 615 |
-
|
| 616 |
y_prob = model.predict(x_batch_test)
|
| 617 |
RMSE_score = np.sqrt(mean_squared_error(y_batch_test, y_prob))
|
| 618 |
-
|
| 619 |
print(f"RMSE Score: {RMSE_score:.4f}")
|
| 620 |
result = f"RMSE Score: {RMSE_score:.4f}"
|
| 621 |
-
|
| 622 |
return result, RMSE_score,y_batch_test, y_prob, class_0, class_1
|
| 623 |
|
| 624 |
elif downstream_model == "Linear Regression":
|
| 625 |
-
|
|
|
|
|
|
|
|
|
|
| 626 |
model = TransformedTargetRegressor(regressor=regressor,
|
| 627 |
transformer=MinMaxScaler(feature_range=(-1, 1))
|
| 628 |
).fit(x_batch, y_batch)
|
|
@@ -636,7 +787,10 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
| 636 |
return result, RMSE_score, y_batch_test, y_prob, class_0, class_1
|
| 637 |
|
| 638 |
elif downstream_model == "Kernel Ridge":
|
| 639 |
-
|
|
|
|
|
|
|
|
|
|
| 640 |
model = TransformedTargetRegressor(regressor=regressor,
|
| 641 |
transformer=MinMaxScaler(feature_range=(-1, 1))
|
| 642 |
).fit(x_batch, y_batch)
|
|
@@ -665,6 +819,144 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
| 665 |
|
| 666 |
|
| 667 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 668 |
|
| 669 |
|
| 670 |
|
|
|
|
| 25 |
import torch
|
| 26 |
from transformers import AutoTokenizer, AutoModel
|
| 27 |
|
| 28 |
+
import sys
|
| 29 |
+
sys.path.append("models/")
|
| 30 |
+
|
| 31 |
+
from models.selfies_ted.load import SELFIES as bart
|
| 32 |
+
from models.mhg_model import load as mhg
|
| 33 |
+
from models.smi_ted.smi_ted_light.load import load_smi_ted
|
| 34 |
+
|
| 35 |
+
import mordred
|
| 36 |
+
from mordred import Calculator, descriptors
|
| 37 |
+
from rdkit import Chem
|
| 38 |
+
from rdkit.Chem import AllChem
|
| 39 |
|
| 40 |
datasets = {}
|
| 41 |
models = {}
|
|
|
|
| 56 |
|
| 57 |
|
| 58 |
models = [{"Name": "bart","Model Name": "SELFIES-TED","Description": "BART model for string based SELFIES modality", "Timestamp": "2024-06-21 12:32:20"},
|
| 59 |
+
{"Name": "mol-xl","Model Name": "MolFormer", "Description": "MolFormer model for string based SMILES modality", "Timestamp": "2024-06-21 12:35:56"},
|
| 60 |
{"Name": "mhg", "Model Name": "MHG-GED","Description": "Molecular hypergraph model", "Timestamp": "2024-07-10 00:09:42"},
|
| 61 |
{"Name": "smi-ted", "Model Name": "SMI-TED","Description": "SMILES based encoder decoder model", "Timestamp": "2024-07-10 00:09:42"}]
|
| 62 |
|
|
|
|
| 66 |
|
| 67 |
models = [{"Name": "smi-ted", "Model Name": "SMI-TED","Description": "SMILES based encoder decoder model"},
|
| 68 |
{"Name": "bart","Model Name": "SELFIES-TED","Description": "BART model for string based SELFIES modality"},
|
| 69 |
+
{"Name": "mol-xl","Model Name": "MolFormer", "Description": "MolFormer model for string based SMILES modality"},
|
| 70 |
{"Name": "mhg", "Model Name": "MHG-GED","Description": "Molecular hypergraph model"},
|
| 71 |
+
{"Name": "Mordred", "Model Name": "Mordred","Description": "Baseline: A descriptor-calculation software application that can calculate more than 1800 two- and three-dimensional descriptors"},
|
| 72 |
+
{"Name": "MorganFingerprint", "Model Name": "MorganFingerprint","Description": "Baseline: Circular atom environments based descriptor"}
|
| 73 |
]
|
| 74 |
|
| 75 |
|
|
|
|
| 80 |
|
| 81 |
return models
|
| 82 |
|
| 83 |
+
def avail_downstream_models(raw=False):
|
| 84 |
global downstream_models
|
| 85 |
|
| 86 |
+
downstream_models = [{"Name": "XGBClassifier", "Task Type": "Classfication"},
|
| 87 |
+
{"Name": "DefaultClassifier", "Task Type": "Classfication"},
|
| 88 |
+
{"Name": "SVR", "Task Type": "Regression"},
|
| 89 |
+
{"Name": "Kernel Ridge", "Task Type": "Regression"},
|
| 90 |
+
{"Name": "Linear Regression", "Task Type": "Regression"},
|
| 91 |
+
{"Name": "DefaultRegressor", "Task Type": "Regression"},
|
| 92 |
+
]
|
| 93 |
+
|
| 94 |
+
if raw: return downstream_models
|
| 95 |
+
else:
|
| 96 |
+
return pd.DataFrame(downstream_models)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
|
| 100 |
def avail_datasets():
|
| 101 |
global datasets
|
|
|
|
| 198 |
|
| 199 |
avail_models_data()
|
| 200 |
|
| 201 |
+
|
| 202 |
+
|
| 203 |
def get_representation(train_data,test_data,model_type, return_tensor=True):
|
| 204 |
alias = {"MHG-GED": "mhg", "SELFIES-TED": "bart", "MolFormer": "mol-xl", "Molformer": "mol-xl", "SMI-TED": "smi-ted"}
|
| 205 |
if model_type in alias.keys():
|
| 206 |
model_type = alias[model_type]
|
| 207 |
|
| 208 |
if model_type == "mhg":
|
| 209 |
+
model = mhg.load("../models/mhg_model/pickles/mhggnn_pretrained_model_0724_2023.pickle")
|
| 210 |
with torch.no_grad():
|
| 211 |
train_emb = model.encode(train_data)
|
| 212 |
x_batch = torch.stack(train_emb)
|
|
|
|
| 218 |
x_batch_test = pd.DataFrame(x_batch_test)
|
| 219 |
|
| 220 |
|
|
|
|
| 221 |
elif model_type == "bart":
|
| 222 |
model = bart()
|
| 223 |
model.load()
|
|
|
|
| 225 |
x_batch_test = model.encode(test_data, return_tensor=return_tensor)
|
| 226 |
|
| 227 |
elif model_type == "smi-ted":
|
| 228 |
+
model = load_smi_ted(folder='../models/smi_ted/smi_ted_light', ckpt_filename='smi-ted-Light_40.pt')
|
| 229 |
with torch.no_grad():
|
| 230 |
x_batch = model.encode(train_data, return_torch=return_tensor)
|
| 231 |
x_batch_test = model.encode(test_data, return_torch=return_tensor)
|
|
|
|
| 258 |
if not return_tensor:
|
| 259 |
x_batch = pd.DataFrame(x_batch)
|
| 260 |
x_batch_test = pd.DataFrame(x_batch_test)
|
| 261 |
+
|
| 262 |
+
elif model_type == 'Mordred':
|
| 263 |
+
all_data = train_data + test_data
|
| 264 |
+
calc = Calculator(descriptors, ignore_3D=True)
|
| 265 |
+
mol_list = [Chem.MolFromSmiles(sm) for sm in all_data]
|
| 266 |
+
x_all = calc.pandas(mol_list)
|
| 267 |
+
print (f'original mordred fv dim: {x_all.shape}')
|
| 268 |
+
|
| 269 |
+
for j in x_all.columns:
|
| 270 |
+
for k in range(len(x_all[j])):
|
| 271 |
+
i = x_all.loc[k, j]
|
| 272 |
+
if type(i) is mordred.error.Missing or type(i) is mordred.error.Error:
|
| 273 |
+
x_all.loc[k, j] = np.nan
|
| 274 |
+
|
| 275 |
+
x_all.dropna(how="any", axis = 1, inplace=True)
|
| 276 |
+
print (f'Nan excluded mordred fv dim: {x_all.shape}')
|
| 277 |
+
|
| 278 |
+
x_batch = x_all.iloc[:len(train_data)]
|
| 279 |
+
x_batch_test = x_all.iloc[len(train_data):]
|
| 280 |
+
# print(f'x_batch: {len(x_batch)}, x_batch_test: {len(x_batch_test)}')
|
| 281 |
+
|
| 282 |
+
elif model_type == 'MorganFingerprint':
|
| 283 |
+
params = {'radius':2, 'nBits':1024}
|
| 284 |
+
|
| 285 |
+
mol_train = [Chem.MolFromSmiles(sm) for sm in train_data]
|
| 286 |
+
mol_test = [Chem.MolFromSmiles(sm) for sm in test_data]
|
| 287 |
+
|
| 288 |
+
x_batch = []
|
| 289 |
+
for mol in mol_train:
|
| 290 |
+
info = {}
|
| 291 |
+
fp = AllChem.GetMorganFingerprintAsBitVect(mol, **params, bitInfo=info)
|
| 292 |
+
vector = list(fp)
|
| 293 |
+
x_batch.append(vector)
|
| 294 |
+
x_batch = pd.DataFrame(x_batch)
|
| 295 |
+
|
| 296 |
+
x_batch_test = []
|
| 297 |
+
for mol in mol_test:
|
| 298 |
+
info = {}
|
| 299 |
+
fp = AllChem.GetMorganFingerprintAsBitVect(mol, **params, bitInfo=info)
|
| 300 |
+
vector = list(fp)
|
| 301 |
+
x_batch_test.append(vector)
|
| 302 |
+
x_batch_test = pd.DataFrame(x_batch_test)
|
| 303 |
|
| 304 |
return x_batch, x_batch_test
|
| 305 |
|
| 306 |
+
def single_modal(model,dataset=None, downstream_model=None, params=None, x_train=None, x_test=None, y_train=None, y_test=None):
|
| 307 |
print(model)
|
| 308 |
+
alias = {"MHG-GED":"mhg", "SELFIES-TED": "bart", "MolFormer":"mol-xl", "Molformer": "mol-xl", "SMI-TED": "smi-ted"}
|
| 309 |
data = avail_models(raw=True)
|
| 310 |
df = pd.DataFrame(data)
|
| 311 |
+
#print(list(df["Name"].values))
|
| 312 |
+
|
| 313 |
+
if model in list(df["Name"].values):
|
| 314 |
+
model_type = model
|
| 315 |
+
elif alias[model] in list(df["Name"].values):
|
| 316 |
model_type = alias[model]
|
|
|
|
|
|
|
| 317 |
else:
|
| 318 |
print("Model not available")
|
| 319 |
return
|
| 320 |
+
|
| 321 |
|
| 322 |
data = avail_datasets()
|
| 323 |
df = pd.DataFrame(data)
|
| 324 |
+
#print(list(df["Dataset"].values))
|
| 325 |
|
| 326 |
if dataset in list(df["Dataset"].values):
|
| 327 |
task = dataset
|
| 328 |
+
with open(f"representation/{task}_{model_type}.pkl", "rb") as f1:
|
| 329 |
x_batch, y_batch, x_batch_test, y_batch_test = pickle.load(f1)
|
| 330 |
print(f" Representation loaded successfully")
|
| 331 |
+
|
| 332 |
+
elif x_train==None:
|
| 333 |
|
| 334 |
print("Custom Dataset")
|
| 335 |
#return
|
|
|
|
| 347 |
|
| 348 |
print(f" Representation loaded successfully")
|
| 349 |
|
| 350 |
+
else:
|
| 351 |
|
| 352 |
+
y_batch = y_train
|
| 353 |
+
y_batch_test = y_test
|
| 354 |
+
x_batch, x_batch_test = get_representation(x_train, x_test, model_type)
|
| 355 |
+
|
| 356 |
+
# exclude row containing Nan value
|
| 357 |
+
if isinstance(x_batch, torch.Tensor):
|
| 358 |
+
x_batch = pd.DataFrame(x_batch)
|
| 359 |
+
nan_indices = x_batch.index[x_batch.isna().any(axis=1)]
|
| 360 |
+
if len(nan_indices) > 0:
|
| 361 |
+
x_batch.dropna(inplace = True)
|
| 362 |
+
for index in sorted(nan_indices, reverse=True):
|
| 363 |
+
del y_batch[index]
|
| 364 |
+
print(f'x_batch Nan index: {nan_indices}')
|
| 365 |
+
print(f'x_batch shape: {x_batch.shape}, y_batch len: {len(y_batch)}')
|
| 366 |
+
|
| 367 |
+
if isinstance(x_batch_test, torch.Tensor):
|
| 368 |
+
x_batch_test = pd.DataFrame(x_batch_test)
|
| 369 |
+
nan_indices = x_batch_test.index[x_batch_test.isna().any(axis=1)]
|
| 370 |
+
if len(nan_indices) > 0:
|
| 371 |
+
x_batch_test.dropna(inplace = True)
|
| 372 |
+
for index in sorted(nan_indices, reverse=True):
|
| 373 |
+
del y_batch_test[index]
|
| 374 |
+
print(f'x_batch_test Nan index: {nan_indices}')
|
| 375 |
+
print(f'x_batch_test shape: {x_batch_test.shape}, y_batch_test len: {len(y_batch_test)}')
|
| 376 |
|
| 377 |
print(f" Calculating ROC AUC Score ...")
|
| 378 |
|
| 379 |
if downstream_model == "XGBClassifier":
|
| 380 |
+
if params == None:
|
| 381 |
+
xgb_predict_concat = XGBClassifier()
|
| 382 |
+
else:
|
| 383 |
+
xgb_predict_concat = XGBClassifier(**params) # n_estimators=5000, learning_rate=0.01, max_depth=10
|
| 384 |
xgb_predict_concat.fit(x_batch, y_batch)
|
| 385 |
|
| 386 |
y_prob = xgb_predict_concat.predict_proba(x_batch_test)[:, 1]
|
|
|
|
| 390 |
print(f"ROC-AUC Score: {roc_auc:.4f}")
|
| 391 |
|
| 392 |
try:
|
| 393 |
+
with open(f"plot_emb/{task}_{model_type}.pkl", "rb") as f1:
|
| 394 |
class_0,class_1 = pickle.load(f1)
|
| 395 |
except:
|
| 396 |
print("Generating latent plots")
|
| 397 |
reducer = umap.UMAP(metric='euclidean', n_neighbors=10, n_components=2, low_memory=True, min_dist=0.1,
|
| 398 |
verbose=False)
|
| 399 |
n_samples = np.minimum(1000, len(x_batch))
|
| 400 |
+
|
| 401 |
try:x = y_batch.values[:n_samples]
|
| 402 |
+
except: x = y_batch[:n_samples]
|
| 403 |
index_0 = [index for index in range(len(x)) if x[index] == 0]
|
| 404 |
index_1 = [index for index in range(len(x)) if x[index] == 1]
|
| 405 |
|
| 406 |
+
try:
|
| 407 |
+
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
| 408 |
+
class_0 = features_umap[index_0]
|
| 409 |
+
class_1 = features_umap[index_1]
|
| 410 |
+
except:
|
| 411 |
+
class_0 = []
|
| 412 |
+
class_1 = []
|
| 413 |
print("Generating latent plots : Done")
|
| 414 |
|
| 415 |
#vizualize(roc_auc,fpr, tpr, x_batch, y_batch )
|
|
|
|
| 429 |
print(f"ROC-AUC Score: {roc_auc:.4f}")
|
| 430 |
|
| 431 |
try:
|
| 432 |
+
with open(f"plot_emb/{task}_{model_type}.pkl", "rb") as f1:
|
| 433 |
class_0,class_1 = pickle.load(f1)
|
| 434 |
except:
|
| 435 |
print("Generating latent plots")
|
| 436 |
reducer = umap.UMAP(metric='euclidean', n_neighbors= 10, n_components=2, low_memory=True, min_dist=0.1, verbose=False)
|
| 437 |
n_samples = np.minimum(1000,len(x_batch))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
|
| 439 |
+
try:
|
| 440 |
+
x = y_batch.values[:n_samples]
|
| 441 |
+
except:
|
| 442 |
+
x = y_batch[:n_samples]
|
| 443 |
+
|
| 444 |
+
try:
|
| 445 |
+
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
| 446 |
+
index_0 = [index for index in range(len(x)) if x[index] == 0]
|
| 447 |
+
index_1 = [index for index in range(len(x)) if x[index] == 1]
|
| 448 |
+
|
| 449 |
+
class_0 = features_umap[index_0]
|
| 450 |
+
class_1 = features_umap[index_1]
|
| 451 |
+
except:
|
| 452 |
+
class_0 = []
|
| 453 |
+
class_1 = []
|
| 454 |
+
|
| 455 |
print("Generating latent plots : Done")
|
| 456 |
|
| 457 |
#vizualize(roc_auc,fpr, tpr, x_batch, y_batch )
|
|
|
|
| 459 |
result = f"ROC-AUC Score: {roc_auc:.4f}"
|
| 460 |
|
| 461 |
return result, roc_auc,fpr, tpr, class_0, class_1
|
| 462 |
+
|
| 463 |
elif downstream_model == "SVR":
|
| 464 |
+
if params == None:
|
| 465 |
+
regressor = SVR()
|
| 466 |
+
else:
|
| 467 |
+
regressor = SVR(**params)
|
| 468 |
model = TransformedTargetRegressor(regressor= regressor,
|
| 469 |
transformer = MinMaxScaler(feature_range=(-1, 1))
|
| 470 |
).fit(x_batch,y_batch)
|
| 471 |
+
|
| 472 |
y_prob = model.predict(x_batch_test)
|
| 473 |
RMSE_score = np.sqrt(mean_squared_error(y_batch_test, y_prob))
|
| 474 |
+
|
| 475 |
print(f"RMSE Score: {RMSE_score:.4f}")
|
| 476 |
result = f"RMSE Score: {RMSE_score:.4f}"
|
| 477 |
|
|
|
|
| 479 |
reducer = umap.UMAP(metric='euclidean', n_neighbors=10, n_components=2, low_memory=True, min_dist=0.1,
|
| 480 |
verbose=False)
|
| 481 |
n_samples = np.minimum(1000, len(x_batch))
|
| 482 |
+
|
| 483 |
+
try: x = y_batch.values[:n_samples]
|
| 484 |
+
except: x = y_batch[:n_samples]
|
| 485 |
#index_0 = [index for index in range(len(x)) if x[index] == 0]
|
| 486 |
#index_1 = [index for index in range(len(x)) if x[index] == 1]
|
| 487 |
|
| 488 |
+
try:
|
| 489 |
+
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
| 490 |
+
class_0 = features_umap#[index_0]
|
| 491 |
+
class_1 = features_umap#[index_1]
|
| 492 |
+
except:
|
| 493 |
+
class_0 = []
|
| 494 |
+
class_1 = []
|
| 495 |
print("Generating latent plots : Done")
|
| 496 |
+
|
| 497 |
return result, RMSE_score,y_batch_test, y_prob, class_0, class_1
|
| 498 |
|
| 499 |
elif downstream_model == "Kernel Ridge":
|
| 500 |
+
if params == None:
|
| 501 |
+
regressor = KernelRidge()
|
| 502 |
+
else:
|
| 503 |
+
regressor = KernelRidge(**params)
|
| 504 |
model = TransformedTargetRegressor(regressor=regressor,
|
| 505 |
transformer=MinMaxScaler(feature_range=(-1, 1))
|
| 506 |
).fit(x_batch, y_batch)
|
|
|
|
| 516 |
verbose=False)
|
| 517 |
n_samples = np.minimum(1000, len(x_batch))
|
| 518 |
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
| 519 |
+
try: x = y_batch.values[:n_samples]
|
| 520 |
+
except: x = y_batch[:n_samples]
|
| 521 |
# index_0 = [index for index in range(len(x)) if x[index] == 0]
|
| 522 |
# index_1 = [index for index in range(len(x)) if x[index] == 1]
|
| 523 |
|
|
|
|
| 529 |
|
| 530 |
|
| 531 |
elif downstream_model == "Linear Regression":
|
| 532 |
+
if params == None:
|
| 533 |
+
regressor = LinearRegression()
|
| 534 |
+
else:
|
| 535 |
+
regressor = LinearRegression(**params)
|
| 536 |
model = TransformedTargetRegressor(regressor=regressor,
|
| 537 |
transformer=MinMaxScaler(feature_range=(-1, 1))
|
| 538 |
).fit(x_batch, y_batch)
|
|
|
|
| 549 |
n_samples = np.minimum(1000, len(x_batch))
|
| 550 |
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
| 551 |
try:x = y_batch.values[:n_samples]
|
| 552 |
+
except: x = y_batch[:n_samples]
|
| 553 |
# index_0 = [index for index in range(len(x)) if x[index] == 0]
|
| 554 |
# index_1 = [index for index in range(len(x)) if x[index] == 1]
|
| 555 |
|
|
|
|
| 578 |
n_samples = np.minimum(1000, len(x_batch))
|
| 579 |
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
| 580 |
try:x = y_batch.values[:n_samples]
|
| 581 |
+
except: x = y_batch[:n_samples]
|
| 582 |
# index_0 = [index for index in range(len(x)) if x[index] == 0]
|
| 583 |
# index_1 = [index for index in range(len(x)) if x[index] == 1]
|
| 584 |
|
|
|
|
| 587 |
print("Generating latent plots : Done")
|
| 588 |
|
| 589 |
return result, RMSE_score, y_batch_test, y_prob, class_0, class_1
|
| 590 |
+
|
| 591 |
|
| 592 |
+
def multi_modal(model_list,dataset=None, downstream_model=None,params=None, x_train=None, x_test=None, y_train=None, y_test=None):
|
| 593 |
+
#print(model_list)
|
|
|
|
| 594 |
data = avail_datasets()
|
| 595 |
df = pd.DataFrame(data)
|
| 596 |
list(df["Dataset"].values)
|
|
|
|
| 598 |
if dataset in list(df["Dataset"].values):
|
| 599 |
task = dataset
|
| 600 |
predefined = True
|
| 601 |
+
elif x_train==None:
|
| 602 |
predefined = False
|
| 603 |
components = dataset.split(",")
|
| 604 |
train_data = pd.read_csv(components[0])[components[2]]
|
|
|
|
| 608 |
y_batch_test = pd.read_csv(components[1])[components[3]]
|
| 609 |
|
| 610 |
print("Custom Dataset loaded")
|
| 611 |
+
else:
|
| 612 |
+
predefined = False
|
| 613 |
+
y_batch = y_train
|
| 614 |
+
y_batch_test = y_test
|
| 615 |
+
train_data = x_train
|
| 616 |
+
test_data = x_test
|
| 617 |
|
| 618 |
data = avail_models(raw=True)
|
| 619 |
df = pd.DataFrame(data)
|
| 620 |
list(df["Name"].values)
|
| 621 |
|
| 622 |
+
alias = {"MHG-GED":"mhg", "SELFIES-TED": "bart", "MolFormer":"mol-xl", "Molformer": "mol-xl","SMI-TED":"smi-ted", "Mordred": "Mordred", "MorganFingerprint": "MorganFingerprint"}
|
| 623 |
#if set(model_list).issubset(list(df["Name"].values)):
|
| 624 |
if set(model_list).issubset(list(alias.keys())):
|
| 625 |
for i, model in enumerate(model_list):
|
|
|
|
| 630 |
|
| 631 |
if i == 0:
|
| 632 |
if predefined:
|
| 633 |
+
with open(f"representation/{task}_{model_type}.pkl", "rb") as f1:
|
| 634 |
x_batch, y_batch, x_batch_test, y_batch_test = pickle.load(f1)
|
| 635 |
print(f" Loaded representation/{task}_{model_type}.pkl")
|
| 636 |
else:
|
|
|
|
| 640 |
|
| 641 |
else:
|
| 642 |
if predefined:
|
| 643 |
+
with open(f"representation/{task}_{model_type}.pkl", "rb") as f1:
|
| 644 |
x_batch_1, y_batch_1, x_batch_test_1, y_batch_test_1 = pickle.load(f1)
|
| 645 |
print(f" Loaded representation/{task}_{model_type}.pkl")
|
| 646 |
else:
|
|
|
|
| 651 |
x_batch = pd.concat([x_batch, x_batch_1], axis=1)
|
| 652 |
x_batch_test = pd.concat([x_batch_test, x_batch_test_1], axis=1)
|
| 653 |
|
|
|
|
| 654 |
else:
|
| 655 |
print("Model not available")
|
| 656 |
return
|
|
|
|
| 660 |
|
| 661 |
num_columns = x_batch.shape[1]
|
| 662 |
x_batch.columns = [f'{i + 1}' for i in range(num_columns)]
|
| 663 |
+
|
| 664 |
+
# exclude row containing Nan value
|
| 665 |
+
if isinstance(x_batch, torch.Tensor):
|
| 666 |
+
x_batch = pd.DataFrame(x_batch)
|
| 667 |
+
nan_indices = x_batch.index[x_batch.isna().any(axis=1)]
|
| 668 |
+
if len(nan_indices) > 0:
|
| 669 |
+
x_batch.dropna(inplace = True)
|
| 670 |
+
for index in sorted(nan_indices, reverse=True):
|
| 671 |
+
del y_batch[index]
|
| 672 |
+
print(f'x_batch Nan index: {nan_indices}')
|
| 673 |
+
print(f'x_batch shape: {x_batch.shape}, y_batch len: {len(y_batch)}')
|
| 674 |
+
|
| 675 |
+
if isinstance(x_batch_test, torch.Tensor):
|
| 676 |
+
x_batch_test = pd.DataFrame(x_batch_test)
|
| 677 |
+
nan_indices = x_batch_test.index[x_batch_test.isna().any(axis=1)]
|
| 678 |
+
if len(nan_indices) > 0:
|
| 679 |
+
x_batch_test.dropna(inplace = True)
|
| 680 |
+
for index in sorted(nan_indices, reverse=True):
|
| 681 |
+
del y_batch_test[index]
|
| 682 |
+
print(f'x_batch_test Nan index: {nan_indices}')
|
| 683 |
+
print(f'x_batch_test shape: {x_batch_test.shape}, y_batch_test len: {len(y_batch_test)}')
|
| 684 |
|
| 685 |
print(f"Representations loaded successfully")
|
| 686 |
try:
|
| 687 |
+
with open(f"plot_emb/{task}_multi.pkl", "rb") as f1:
|
| 688 |
class_0, class_1 = pickle.load(f1)
|
| 689 |
except:
|
| 690 |
print("Generating latent plots")
|
|
|
|
| 694 |
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
| 695 |
|
| 696 |
if "Classifier" in downstream_model:
|
| 697 |
+
try: x = y_batch.values[:n_samples]
|
| 698 |
except: x = y_batch[:n_samples]
|
| 699 |
index_0 = [index for index in range(len(x)) if x[index] == 0]
|
| 700 |
index_1 = [index for index in range(len(x)) if x[index] == 1]
|
|
|
|
| 712 |
|
| 713 |
|
| 714 |
if downstream_model == "XGBClassifier":
|
| 715 |
+
if params == None:
|
| 716 |
+
xgb_predict_concat = XGBClassifier()
|
| 717 |
+
else:
|
| 718 |
+
xgb_predict_concat = XGBClassifier(**params)#n_estimators=5000, learning_rate=0.01, max_depth=10)
|
| 719 |
xgb_predict_concat.fit(x_batch, y_batch)
|
| 720 |
|
| 721 |
y_prob = xgb_predict_concat.predict_proba(x_batch_test)[:, 1]
|
|
|
|
| 753 |
return result, roc_auc,fpr, tpr, class_0, class_1
|
| 754 |
|
| 755 |
elif downstream_model == "SVR":
|
| 756 |
+
if params == None:
|
| 757 |
+
regressor = SVR()
|
| 758 |
+
else:
|
| 759 |
+
regressor = SVR(**params)
|
| 760 |
model = TransformedTargetRegressor(regressor= regressor,
|
| 761 |
transformer = MinMaxScaler(feature_range=(-1, 1))
|
| 762 |
).fit(x_batch,y_batch)
|
| 763 |
+
|
| 764 |
y_prob = model.predict(x_batch_test)
|
| 765 |
RMSE_score = np.sqrt(mean_squared_error(y_batch_test, y_prob))
|
| 766 |
+
|
| 767 |
print(f"RMSE Score: {RMSE_score:.4f}")
|
| 768 |
result = f"RMSE Score: {RMSE_score:.4f}"
|
| 769 |
+
|
| 770 |
return result, RMSE_score,y_batch_test, y_prob, class_0, class_1
|
| 771 |
|
| 772 |
elif downstream_model == "Linear Regression":
|
| 773 |
+
if params == None:
|
| 774 |
+
regressor = LinearRegression()
|
| 775 |
+
else:
|
| 776 |
+
regressor = LinearRegression(**params)
|
| 777 |
model = TransformedTargetRegressor(regressor=regressor,
|
| 778 |
transformer=MinMaxScaler(feature_range=(-1, 1))
|
| 779 |
).fit(x_batch, y_batch)
|
|
|
|
| 787 |
return result, RMSE_score, y_batch_test, y_prob, class_0, class_1
|
| 788 |
|
| 789 |
elif downstream_model == "Kernel Ridge":
|
| 790 |
+
if params == None:
|
| 791 |
+
regressor = KernelRidge()
|
| 792 |
+
else:
|
| 793 |
+
regressor = KernelRidge(**params)
|
| 794 |
model = TransformedTargetRegressor(regressor=regressor,
|
| 795 |
transformer=MinMaxScaler(feature_range=(-1, 1))
|
| 796 |
).fit(x_batch, y_batch)
|
|
|
|
| 819 |
|
| 820 |
|
| 821 |
|
| 822 |
+
def finetune_optuna(x_batch, y_batch, x_batch_test, y_test):
    """Set up an Optuna objective that scores an XGBoost booster by ROC AUC.

    Args:
        x_batch: Training features; must expose ``.values``.
        y_batch: Training labels; must expose ``.values``.
        x_batch_test: Test features; must expose ``.values``.
        y_test: Test labels; must expose ``.values``.

    Returns:
        None. NOTE(review): the nested ``objective`` is only defined here; it
        is neither returned nor passed to an Optuna study inside this
        function. Confirm how the search is supposed to be launched.
    """
    print(" Finetuning with Optuna and calculating ROC AUC Score ...")
    X_train = x_batch.values
    y_train = y_batch.values
    X_test = x_batch_test.values
    y_test = y_test.values

    def objective(trial):
        """Train one booster with the trial's hyper-parameters; return test ROC AUC."""
        # The native xgb.train API takes the number of boosting rounds as the
        # `num_boost_round` argument; an 'n_estimators' entry inside `params`
        # is a scikit-learn wrapper option and would not control the rounds.
        # The Optuna parameter keeps its original name 'n_estimators'.
        num_boost_round = trial.suggest_int('n_estimators', 1000, 10000)

        # Define parameters to be optimized
        params = {
            'eval_metric': 'auc',
            'verbosity': 0,
            # suggest_float(..., log=True) supersedes the deprecated
            # suggest_loguniform and samples from the same log-uniform range.
            'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
            'max_depth': trial.suggest_int('max_depth', 1, 12),
        }

        # Train XGBoost model
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)
        model = xgb.train(params, dtrain, num_boost_round=num_boost_round)

        # Predict scores for the test set
        y_pred = model.predict(dtest)

        # Calculate ROC AUC score
        roc_auc = roc_auc_score(y_test, y_pred)
        print("ROC_AUC : ", roc_auc)

        return roc_auc
|
| 860 |
+
|
| 861 |
+
def add_new_model():
    """Display a widget form that registers a new model entry in ``models.json``.

    The form collects a name, a description and a file path. On submit, a
    record with the name, description and current timestamp is appended to
    the list returned by ``avail_models(raw=True)``, the whole list is written
    to ``models.json``, and ``list_models()`` is called.

    NOTE(review): the selected path is shown in the form but is not stored in
    the saved record.
    """
    models = avail_models(raw=True)

    def display_models():
        """Print name, description and timestamp of every known model."""
        for model in models:
            model_display = f"Name: {model['Name']}, Description: {model['Description']}, Timestamp: {model['Timestamp']}"
            print(model_display)

    def update_models(new_name, new_description, new_path):
        """Append a new model record and rewrite ``models.json``.

        ``new_path`` is accepted but not persisted.
        """
        new_model = {
            "Name": new_name,
            "Description": new_description,
            "Timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }
        models.append(new_model)
        with open("models.json", "w") as outfile:
            json.dump(models, outfile)

        print("Model uploaded and updated successfully!")
        list_models()

    # Widgets
    name_text = widgets.Text(description="Name:", layout=Layout(width='50%'))
    description_text = widgets.Text(description="Description:", layout=Layout(width='50%'))
    path_text = widgets.Text(description="Path:", layout=Layout(width='50%'))

    def browse_callback(b):
        """Open a native file dialog and copy the chosen path into the form."""
        root = tk.Tk()
        try:
            root.withdraw()  # Hide the main window
            file_path = filedialog.askopenfilename(title="Select a Model File")
        finally:
            # Without this, every click leaves a hidden Tk root alive.
            root.destroy()
        if file_path:
            path_text.value = file_path

    browse_button = widgets.Button(description="Browse")
    browse_button.on_click(browse_callback)

    def submit_callback(b):
        """Forward the current form values to ``update_models``."""
        update_models(name_text.value, description_text.value, path_text.value)

    submit_button = widgets.Button(description="Submit")
    submit_button.on_click(submit_callback)

    # Display widgets
    display(VBox([name_text, description_text, path_text, browse_button, submit_button]))
|
| 909 |
+
|
| 910 |
+
|
| 911 |
+
def add_new_dataset():
    """Display a widget form that registers a new dataset entry in ``datasets.json``.

    The form collects a dataset name, input column, output column and a file
    path. On submit, a record is appended to the list returned by
    ``avail_datasets()``, the whole list is written to ``datasets.json``, and
    ``list_data()`` is called. Only the base name of the path is stored.
    """
    datasets = avail_datasets()

    def display_datasets():
        """Print the stored fields of every known dataset."""
        for dataset in datasets:
            dataset_display = f"Name: {dataset['Dataset']}, Input: {dataset['Input']},Output: {dataset['Output']},Path: {dataset['Path']}, Timestamp: {dataset['Timestamp']}"
            # The summary line was previously built and then discarded.
            print(dataset_display)

    def update_datasets(new_dataset, new_input, new_output, new_path):
        """Append a new dataset record and rewrite ``datasets.json``."""
        new_entry = {
            "Dataset": new_dataset,
            "Input": new_input,
            "Output": new_output,
            "Timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "Path": os.path.basename(new_path)
        }
        datasets.append(new_entry)
        with open("datasets.json", "w") as outfile:
            json.dump(datasets, outfile)

        print("Dataset uploaded and updated successfully!")
        list_data()

    # Widgets
    dataset_text = widgets.Text(description="Dataset:", layout=Layout(width='50%'))
    input_text = widgets.Text(description="Input:", layout=Layout(width='50%'))
    output_text = widgets.Text(description="Output:", layout=Layout(width='50%'))
    path_text = widgets.Text(description="Path:", layout=Layout(width='50%'))

    def browse_callback(b):
        """Open a native file dialog and copy the chosen path into the form."""
        root = tk.Tk()
        try:
            root.withdraw()  # Hide the main window
            file_path = filedialog.askopenfilename(title="Select a Dataset File")
        finally:
            # Without this, every click leaves a hidden Tk root alive.
            root.destroy()
        if file_path:
            path_text.value = file_path

    browse_button = widgets.Button(description="Browse")
    browse_button.on_click(browse_callback)

    def submit_callback(b):
        """Forward the current form values to ``update_datasets``."""
        update_datasets(dataset_text.value, input_text.value, output_text.value, path_text.value)

    submit_button = widgets.Button(description="Submit")
    submit_button.on_click(submit_callback)

    display(VBox([dataset_text, input_text, output_text, path_text, browse_button, submit_button]))
|
| 960 |
|
| 961 |
|
| 962 |
|
models/mhg_model/README.md
CHANGED
|
@@ -27,7 +27,7 @@ In addition, the decoder inherits the theoretical guarantee of MHG on always gen
|
|
| 27 |
|
| 28 |
### Pretrained Models and Training Logs
|
| 29 |
|
| 30 |
-
We provide checkpoints of the MHG-GNN model pre-trained on a dataset of ~1.34M molecules curated from PubChem. (later) For model weights: [HuggingFace Link]()
|
| 31 |
|
| 32 |
Add the MHG-GNN `pre-trained weights.pt` to the `models/` directory according to your needs.
|
| 33 |
|
|
|
|
| 27 |
|
| 28 |
### Pretrained Models and Training Logs
|
| 29 |
|
| 30 |
+
We provide checkpoints of the MHG-GNN model pre-trained on a dataset of ~1.34M molecules curated from PubChem. Model weights are available here: [HuggingFace Link](https://huggingface.co/ibm/materials.mhg-ged/blob/main/mhggnn_pretrained_model_0724_2023.pickle)
|
| 31 |
|
| 32 |
Add the MHG-GNN `pre-trained weights.pt` to the `models/` directory according to your needs.
|
| 33 |
|
models/mhg_model/images/mhg_example.png
CHANGED
|
|
Git LFS Details
|
models/mhg_model/images/mhg_example1.png
CHANGED
|
|
Git LFS Details
|
models/mhg_model/images/mhg_example2.png
CHANGED
|
|
Git LFS Details
|
models/mhg_model/load.py
CHANGED
|
@@ -17,6 +17,7 @@ from typing_extensions import Self
|
|
| 17 |
|
| 18 |
from .graph_grammar.io.smi import hg_to_mol
|
| 19 |
from .models.mhgvae import GrammarGINVAE
|
|
|
|
| 20 |
from huggingface_hub import hf_hub_download
|
| 21 |
|
| 22 |
|
|
@@ -73,12 +74,30 @@ class PretrainedModelWrapper:
|
|
| 73 |
return output
|
| 74 |
|
| 75 |
|
| 76 |
-
def load(model_name: str = "
|
| 77 |
PretrainedModelWrapper]:
|
|
|
|
| 78 |
repo_id = "ibm/materials.mhg-ged"
|
| 79 |
filename = "pytorch_model.bin" #"mhggnn_pretrained_model_0724_2023.pickle"
|
| 80 |
file_path = hf_hub_download(repo_id=repo_id, filename=filename)
|
| 81 |
with open(file_path, "rb") as f:
|
| 82 |
model_dict = torch.load(f)
|
| 83 |
return PretrainedModelWrapper(model_dict)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
return None
|
|
|
|
| 17 |
|
| 18 |
from .graph_grammar.io.smi import hg_to_mol
|
| 19 |
from .models.mhgvae import GrammarGINVAE
|
| 20 |
+
|
| 21 |
from huggingface_hub import hf_hub_download
|
| 22 |
|
| 23 |
|
|
|
|
| 74 |
return output
|
| 75 |
|
| 76 |
|
| 77 |
+
def load(model_name: str = "mhg_model/pickles/mhggnn_pretrained_model_0724_2023.pickle") -> Optional[
    PretrainedModelWrapper]:
    """Download the pretrained MHG weights from the Hugging Face Hub and wrap them.

    Args:
        model_name: Kept for backward compatibility. It is not read: the
            weights always come from ``pytorch_model.bin`` in the
            ``ibm/materials.mhg-ged`` Hub repository.

    Returns:
        A ``PretrainedModelWrapper`` built from the loaded object. Download or
        deserialization failures propagate as exceptions.
    """
    repo_id = "ibm/materials.mhg-ged"
    filename = "pytorch_model.bin"
    file_path = hf_hub_download(repo_id=repo_id, filename=filename)
    # NOTE(security): torch.load unpickles the downloaded file; only use it
    # with a repository you trust.
    with open(file_path, "rb") as f:
        model_dict = torch.load(f)
    return PretrainedModelWrapper(model_dict)
|
models/mhg_model/paper/MHG-GNN_Combination of Molecular Hypergraph Grammar with Graph Neural Network.pdf
CHANGED
|
Binary files a/models/mhg_model/paper/MHG-GNN_Combination of Molecular Hypergraph Grammar with Graph Neural Network.pdf and b/models/mhg_model/paper/MHG-GNN_Combination of Molecular Hypergraph Grammar with Graph Neural Network.pdf differ
|
|
|
models/selfies_model/selfies-ted.png
CHANGED
|
|
Git LFS Details
|
models/selfies_ted/README.md
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
library_name: transformers
|
| 4 |
+
pipeline_tag: feature-extraction
|
| 5 |
+
tags:
|
| 6 |
+
- chemistry
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
# selfies-ted
|
| 10 |
+
|
| 11 |
+
selfies-ted is a project for converting SMILES (Simplified Molecular-Input Line-Entry System) strings into SELFIES (SELF-referencIng Embedded Strings) and generating embeddings from them for use as molecular representations.
|
| 12 |
+
|
| 13 |
+
![selfies-ted](selfies-ted.png)
|
| 14 |
+
## Model Architecture
|
| 15 |
+
|
| 16 |
+
Configuration details
|
| 17 |
+
|
| 18 |
+
Encoder and Decoder FFN dimensions: 256
|
| 19 |
+
Number of attention heads: 4
|
| 20 |
+
Number of encoder and decoder layers: 2
|
| 21 |
+
Total number of hidden layers: 6
|
| 22 |
+
Maximum position embeddings: 128
|
| 23 |
+
Model dimension (d_model): 256
|
| 24 |
+
|
| 25 |
+
## Pretrained Models and Training Logs
|
| 26 |
+
We provide checkpoints of the selfies-ted model pre-trained on a dataset of molecules curated from PubChem. The pre-trained model shows competitive performance on molecular representation tasks. Model weights are available here: [HuggingFace Link](https://huggingface.co/ibm/materials.selfies-ted).
|
| 27 |
+
|
| 28 |
+
To install and use the pre-trained model:
|
| 29 |
+
|
| 30 |
+
Download the `selfies_ted_model.pkl` file from the [HuggingFace repository](https://huggingface.co/ibm/materials.selfies-ted).
|
| 31 |
+
Place `selfies_ted_model.pkl` in the `models/` directory. The directory structure should look like the following:
|
| 32 |
+
|
| 33 |
+
```
|
| 34 |
+
models/
|
| 35 |
+
└── selfies_ted_model.pkl
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
## Installation
|
| 39 |
+
|
| 40 |
+
To use this project, you'll need to install the required dependencies. We recommend using a virtual environment:
|
| 41 |
+
|
| 42 |
+
```bash
|
| 43 |
+
python -m venv venv
|
| 44 |
+
source venv/bin/activate # On Windows use `venv\Scripts\activate`
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
Install the required dependencies
|
| 48 |
+
|
| 49 |
+
```
|
| 50 |
+
pip install -r requirements.txt
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
## Usage
|
| 55 |
+
|
| 56 |
+
### Import
|
| 57 |
+
|
| 58 |
+
```
|
| 59 |
+
import load
|
| 60 |
+
```
|
| 61 |
+
### Training the Model
|
| 62 |
+
|
| 63 |
+
To train the model, use the train.py script:
|
| 64 |
+
|
| 65 |
+
```
|
| 66 |
+
python train.py -f <path_to_your_data_file>
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
Note: The actual usage may depend on the specific implementation in load.py. Please refer to the source code for detailed functionality.
|
| 71 |
+
|
| 72 |
+
### Load the model and tokenizer
|
| 73 |
+
```
|
| 74 |
+
model = load.SELFIES()
model.load()  # tokenizer and weights are fetched from the Hugging Face Hub
|
| 75 |
+
```
|
| 76 |
+
### Encode SMILES strings
|
| 77 |
+
```
|
| 78 |
+
smiles_list = ["COC", "CCO"]
|
| 79 |
+
```
|
| 80 |
+
```
|
| 81 |
+
embeddings = model.encode(smiles_list)  # `model` is a loaded `load.SELFIES` instance
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
## Example Notebook
|
| 86 |
+
|
| 87 |
+
An example notebook for this project is provided in `selfies-ted-example.ipynb`.
|
models/selfies_ted/load.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import torch
|
| 4 |
+
import selfies as sf # selfies>=2.1.1
|
| 5 |
+
import pickle
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import numpy as np
|
| 8 |
+
from datasets import Dataset
|
| 9 |
+
from rdkit import Chem
|
| 10 |
+
from transformers import AutoTokenizer, AutoModel
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class SELFIES(torch.nn.Module):
    """Turn SMILES strings into SELFIES strings and then into embeddings.

    ``load`` must be called before ``encode`` or ``get_embedding`` so that
    ``self.tokenizer`` and ``self.model`` are populated.
    """

    def __init__(self):
        super().__init__()
        self.model = None
        self.tokenizer = None
        # Positions (in the most recent input list) that could not be
        # converted to SELFIES.
        self.invalid = []

    def get_selfies(self, smiles_list):
        """Convert SMILES strings to space-separated SELFIES token strings.

        Each string is first encoded as-is. If that fails, it is round-tripped
        through RDKit and encoded again. If that also fails, the placeholder
        ``"[]"`` is used and the position is recorded in ``self.invalid``.

        Args:
            smiles_list: Iterable of SMILES strings.

        Returns:
            A list with one SELFIES string per input, with ``"] ["`` inserted
            between adjacent tokens.
        """
        self.invalid = []
        spaced_selfies_batch = []
        for i, smiles in enumerate(smiles_list):
            try:
                selfies = sf.encoder(smiles.rstrip())
            except Exception:
                try:
                    # Second attempt: let RDKit rewrite the SMILES first.
                    smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles.rstrip()))
                    selfies = sf.encoder(smiles)
                except Exception:
                    selfies = "[]"
                    self.invalid.append(i)

            spaced_selfies_batch.append(selfies.replace('][', '] ['))

        return spaced_selfies_batch

    def get_embedding(self, selfies):
        """Embed a batch of SELFIES strings by masked mean pooling.

        Args:
            selfies: Mapping with a ``"selfies"`` entry holding the batch of
                strings to tokenize.

        Returns:
            The tokenizer output with ``input_ids`` and ``attention_mask``
            removed and an ``"embedding"`` entry added: the mean of the
            encoder's last hidden state over unmasked positions.
        """
        encoding = self.tokenizer(selfies["selfies"], return_tensors='pt', max_length=128, truncation=True, padding='max_length')
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        outputs = self.model.encoder(input_ids=input_ids, attention_mask=attention_mask)
        model_output = outputs.last_hidden_state

        # Zero out padded positions, then divide by the number of real
        # tokens; the clamp guards against division by zero.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(model_output.size()).float()
        sum_embeddings = torch.sum(model_output * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        model_output = sum_embeddings / sum_mask

        del encoding['input_ids']
        del encoding['attention_mask']

        encoding["embedding"] = model_output

        return encoding

    def load(self, checkpoint="bart-2908.pickle"):
        """Load the tokenizer and model from ``ibm/materials.selfies-ted``.

        Args:
            checkpoint: Kept for backward compatibility. It is not read; both
                objects are obtained via ``from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained("ibm/materials.selfies-ted")
        self.model = AutoModel.from_pretrained("ibm/materials.selfies-ted")

    # TODO: remove `use_gpu` argument in validation pipeline
    def encode(self, smiles_list=None, use_gpu=False, return_tensor=False):
        """Compute one embedding per SMILES string.

        Args:
            smiles_list: List of SMILES strings; ``None`` is treated as an
                empty list.
            use_gpu: Accepted but not used by this method.
            return_tensor: If true, return a ``torch.Tensor`` instead of a
                ``pandas.DataFrame``.

        Returns:
            The embeddings, one row per input. Rows for inputs that could not
            be converted to SELFIES are filled with NaN.
        """
        # A None default avoids sharing one list object across calls.
        smiles_list = [] if smiles_list is None else smiles_list
        selfies = self.get_selfies(smiles_list)
        selfies_df = pd.DataFrame(selfies, columns=["selfies"])
        data = Dataset.from_pandas(selfies_df)
        embedding = data.map(self.get_embedding, batched=True, num_proc=1, batch_size=128)
        emb = np.asarray(embedding["embedding"].copy())

        for idx in self.invalid:
            emb[idx] = np.nan
            print("Cannot encode {0} to selfies and embedding replaced by NaN".format(smiles_list[idx]))

        if return_tensor:
            return torch.tensor(emb)
        return pd.DataFrame(emb)
|
models/selfies_ted/requirements.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch>=2.1.0
|
| 2 |
+
transformers>=4.38
|
| 3 |
+
numpy>=1.26.1
|
| 4 |
+
datasets>=2.13.1
|
| 5 |
+
evaluate>=0.4.0
|
| 6 |
+
selfies>=2.1.0
|
| 7 |
+
scikit-learn>=1.2.1
|
| 8 |
+
pyarrow>=14.0.1
|
| 9 |
+
requests>=2.31.0
|
| 10 |
+
urllib3>=2.0.7
|
| 11 |
+
aiohttp>=3.9.0
|
| 12 |
+
zipp>=3.17.0
|
models/selfies_ted/selfies-ted-example.ipynb
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"id": "9d9b6eb8-9edb-44bd-9e5a-3a6ea67f5117",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"### Import library"
|
| 9 |
+
]
|
| 10 |
+
},
|
| 11 |
+
{
|
| 12 |
+
"cell_type": "code",
|
| 13 |
+
"execution_count": 1,
|
| 14 |
+
"id": "c3ac4418",
|
| 15 |
+
"metadata": {},
|
| 16 |
+
"outputs": [],
|
| 17 |
+
"source": [
|
| 18 |
+
"from load import SELFIES"
|
| 19 |
+
]
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
"cell_type": "markdown",
|
| 23 |
+
"id": "790061cf-5470-4564-987e-aa2e492337db",
|
| 24 |
+
"metadata": {},
|
| 25 |
+
"source": [
|
| 26 |
+
"### Initialize and load"
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"cell_type": "code",
|
| 31 |
+
"execution_count": 2,
|
| 32 |
+
"id": "85847f26-e2f4-475a-a88e-41fd9cccfc0f",
|
| 33 |
+
"metadata": {},
|
| 34 |
+
"outputs": [],
|
| 35 |
+
"source": [
|
| 36 |
+
"model = SELFIES()"
|
| 37 |
+
]
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"cell_type": "code",
|
| 41 |
+
"execution_count": 3,
|
| 42 |
+
"id": "095e864c",
|
| 43 |
+
"metadata": {
|
| 44 |
+
"scrolled": true
|
| 45 |
+
},
|
| 46 |
+
"outputs": [],
|
| 47 |
+
"source": [
|
| 48 |
+
"model.load(checkpoint=\"bart-2908.pickle\")"
|
| 49 |
+
]
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"cell_type": "markdown",
|
| 53 |
+
"id": "55f1a68c-c462-4dee-9139-9befb469f176",
|
| 54 |
+
"metadata": {},
|
| 55 |
+
"source": [
|
| 56 |
+
"### Example to get embeddings"
|
| 57 |
+
]
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"cell_type": "code",
|
| 61 |
+
"execution_count": 4,
|
| 62 |
+
"id": "2357ef0a",
|
| 63 |
+
"metadata": {},
|
| 64 |
+
"outputs": [
|
| 65 |
+
{
|
| 66 |
+
"data": {
|
| 67 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 68 |
+
"model_id": "b494cbf9878a4f5c8f4093e38fb82fd5",
|
| 69 |
+
"version_major": 2,
|
| 70 |
+
"version_minor": 0
|
| 71 |
+
},
|
| 72 |
+
"text/plain": [
|
| 73 |
+
"Map: 0%| | 0/3 [00:00<?, ? examples/s]"
|
| 74 |
+
]
|
| 75 |
+
},
|
| 76 |
+
"metadata": {},
|
| 77 |
+
"output_type": "display_data"
|
| 78 |
+
}
|
| 79 |
+
],
|
| 80 |
+
"source": [
|
| 81 |
+
"smiles_list = [\"CCO\", \"O=C=O\", \"OC(=O)c1ccccc1C(=O)O\"]\n",
|
| 82 |
+
"embeddings = model.encode(smiles_list)"
|
| 83 |
+
]
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"cell_type": "code",
|
| 87 |
+
"execution_count": 5,
|
| 88 |
+
"id": "3871c513-d0a9-4e70-9c18-3f0b491e07b2",
|
| 89 |
+
"metadata": {},
|
| 90 |
+
"outputs": [
|
| 91 |
+
{
|
| 92 |
+
"data": {
|
| 93 |
+
"text/plain": [
|
| 94 |
+
"(3, 1024)"
|
| 95 |
+
]
|
| 96 |
+
},
|
| 97 |
+
"execution_count": 5,
|
| 98 |
+
"metadata": {},
|
| 99 |
+
"output_type": "execute_result"
|
| 100 |
+
}
|
| 101 |
+
],
|
| 102 |
+
"source": [
|
| 103 |
+
"embeddings.shape"
|
| 104 |
+
]
|
| 105 |
+
},
|
| 106 |
+
{
|
| 107 |
+
"cell_type": "code",
|
| 108 |
+
"execution_count": null,
|
| 109 |
+
"id": "289a8795-d6d8-4828-b2b2-b4d4a97a4604",
|
| 110 |
+
"metadata": {},
|
| 111 |
+
"outputs": [],
|
| 112 |
+
"source": []
|
| 113 |
+
}
|
| 114 |
+
],
|
| 115 |
+
"metadata": {
|
| 116 |
+
"kernelspec": {
|
| 117 |
+
"display_name": "Python 3 (ipykernel)",
|
| 118 |
+
"language": "python",
|
| 119 |
+
"name": "python3"
|
| 120 |
+
},
|
| 121 |
+
"language_info": {
|
| 122 |
+
"codemirror_mode": {
|
| 123 |
+
"name": "ipython",
|
| 124 |
+
"version": 3
|
| 125 |
+
},
|
| 126 |
+
"file_extension": ".py",
|
| 127 |
+
"mimetype": "text/x-python",
|
| 128 |
+
"name": "python",
|
| 129 |
+
"nbconvert_exporter": "python",
|
| 130 |
+
"pygments_lexer": "ipython3",
|
| 131 |
+
"version": "3.10.8"
|
| 132 |
+
}
|
| 133 |
+
},
|
| 134 |
+
"nbformat": 4,
|
| 135 |
+
"nbformat_minor": 5
|
| 136 |
+
}
|
models/selfies_ted/selfies-ted.png
ADDED
|
Git LFS Details
|
models/smi_ted/.gitignore
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Model weights
|
| 2 |
+
inference/smi_ted_light/smi-ted-Light_40.pt
|
| 3 |
+
|
| 4 |
+
# pyenv
|
| 5 |
+
.python-version
|
| 6 |
+
|
| 7 |
+
# Environments
|
| 8 |
+
.env
|
| 9 |
+
.venv
|
| 10 |
+
env/
|
| 11 |
+
venv/
|
| 12 |
+
ENV/
|
| 13 |
+
env.bak/
|
| 14 |
+
venv.bak/
|
| 15 |
+
|
| 16 |
+
# editor files
|
| 17 |
+
.vscode/
|
| 18 |
+
.DS_Store
|
models/smi_ted/README.md
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SMILES-based Transformer Encoder-Decoder (SMI-TED)
|
| 2 |
+
|
| 3 |
+
This repository provides PyTorch source code associated with our publication, "A Large Encoder-Decoder Family of Foundation Models for Chemical Language".
|
| 4 |
+
|
| 5 |
+
**Paper:** [Arxiv Link](https://arxiv.org/abs/2407.20267)
|
| 6 |
+
|
| 7 |
+
**HuggingFace:** [HuggingFace Link](https://huggingface.co/ibm/materials.smi-ted)
|
| 8 |
+
|
| 9 |
+
For more information contact: [email protected] or [email protected].
|
| 10 |
+
|
| 11 |
+
![ted-smi](images/smi-ted.png)
|
| 12 |
+
|
| 13 |
+
## Introduction
|
| 14 |
+
|
| 15 |
+
We present a large encoder-decoder chemical foundation model, SMILES-based Transformer Encoder-Decoder (SMI-TED), pre-trained on a curated dataset of 91 million SMILES samples sourced from PubChem, equivalent to 4 billion molecular tokens. SMI-TED supports various complex tasks, including quantum property prediction, with two main variants ($289M$ and $8 \times 289M$). Our experiments across multiple benchmark datasets demonstrate state-of-the-art performance for various tasks. Model weights are available at: [HuggingFace Link](https://huggingface.co/ibm/materials.smi-ted).
|
| 16 |
+
|
| 17 |
+
## Table of Contents
|
| 18 |
+
|
| 19 |
+
1. [Getting Started](#getting-started)
|
| 20 |
+
1. [Pretrained Models and Training Logs](#pretrained-models-and-training-logs)
|
| 21 |
+
2. [Replicating Conda Environment](#replicating-conda-environment)
|
| 22 |
+
2. [Pretraining](#pretraining)
|
| 23 |
+
3. [Finetuning](#finetuning)
|
| 24 |
+
4. [Feature Extraction](#feature-extraction)
|
| 25 |
+
5. [Citations](#citations)
|
| 26 |
+
|
| 27 |
+
## Getting Started
|
| 28 |
+
|
| 29 |
+
**This code and environment have been tested on Nvidia V100s and Nvidia A100s**
|
| 30 |
+
|
| 31 |
+
### Pretrained Models and Training Logs
|
| 32 |
+
|
| 33 |
+
We provide checkpoints of the SMI-TED model pre-trained on a dataset of ~91M molecules curated from PubChem. The pre-trained model shows competitive performance on classification and regression benchmarks from MoleculeNet. For model weights: [HuggingFace Link](https://huggingface.co/ibm/materials.smi-ted)
|
| 34 |
+
|
| 35 |
+
Add the SMI-TED `pre-trained weights.pt` to the `inference/` or `finetune/` directory according to your needs. The directory structure should look like the following:
|
| 36 |
+
|
| 37 |
+
```
|
| 38 |
+
inference/
|
| 39 |
+
├── smi_ted_light
|
| 40 |
+
│ ├── smi_ted_light.pt
|
| 41 |
+
│ ├── bert_vocab_curated.txt
|
| 42 |
+
│ └── load.py
|
| 43 |
+
```
|
| 44 |
+
and/or:
|
| 45 |
+
|
| 46 |
+
```
|
| 47 |
+
finetune/
|
| 48 |
+
├── smi_ted_light
|
| 49 |
+
│ ├── smi_ted_light.pt
|
| 50 |
+
│ ├── bert_vocab_curated.txt
|
| 51 |
+
│ └── load.py
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
### Replicating Conda Environment
|
| 55 |
+
|
| 56 |
+
Follow these steps to replicate our Conda environment and install the necessary libraries:
|
| 57 |
+
|
| 58 |
+
#### Create and Activate Conda Environment
|
| 59 |
+
|
| 60 |
+
```
|
| 61 |
+
conda create --name smi-ted-env python=3.10
|
| 62 |
+
conda activate smi-ted-env
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
#### Install Packages with Conda
|
| 66 |
+
|
| 67 |
+
```
|
| 68 |
+
conda install pytorch=2.1.0 pytorch-cuda=11.8 -c pytorch -c nvidia
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
#### Install Packages with Pip
|
| 72 |
+
|
| 73 |
+
```
|
| 74 |
+
pip install -r requirements.txt
|
| 75 |
+
pip install pytorch-fast-transformers
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
## Pretraining
|
| 79 |
+
|
| 80 |
+
For pretraining, we use two strategies: the masked language model method to train the encoder part and an encoder-decoder strategy to refine SMILES reconstruction and improve the generated latent space.
|
| 81 |
+
|
| 82 |
+
SMI-TED is pre-trained on canonicalized and curated 91M SMILES from PubChem with the following constraints:
|
| 83 |
+
|
| 84 |
+
- Compounds are filtered to a maximum length of 202 tokens during preprocessing.
|
| 85 |
+
- A 95/5/0 split is used for encoder training, with 5% of the data for decoder pretraining.
|
| 86 |
+
- A 100/0/0 split is also used to train the encoder and decoder directly, enhancing model performance.
|
| 87 |
+
|
| 88 |
+
The pretraining code provides examples of data processing and model training on a smaller dataset, requiring 8 A100 GPUs.
|
| 89 |
+
|
| 90 |
+
To pre-train the two variants of the SMI-TED model, run:
|
| 91 |
+
|
| 92 |
+
```
|
| 93 |
+
bash training/run_model_light_training.sh
|
| 94 |
+
```
|
| 95 |
+
or
|
| 96 |
+
```
|
| 97 |
+
bash training/run_model_large_training.sh
|
| 98 |
+
```
|
| 99 |
+
|
| 100 |
+
Use `train_model_D.py` to train only the decoder or `train_model_ED.py` to train both the encoder and decoder.
|
| 101 |
+
|
| 102 |
+
## Finetuning
|
| 103 |
+
|
| 104 |
+
The finetuning datasets and environment can be found in the [finetune](finetune/) directory. After setting up the environment, you can run a finetuning task with:
|
| 105 |
+
|
| 106 |
+
```
|
| 107 |
+
bash finetune/smi_ted_light/esol/run_finetune_esol.sh
|
| 108 |
+
```
|
| 109 |
+
|
| 110 |
+
Finetuning training/checkpointing resources will be available in directories named `checkpoint_<measure_name>`.
|
| 111 |
+
|
| 112 |
+
## Feature Extraction
|
| 113 |
+
|
| 114 |
+
The example notebook [smi_ted_encoder_decoder_example.ipynb](notebooks/smi_ted_encoder_decoder_example.ipynb) contains code to load checkpoint files and use the pre-trained model for encoder and decoder tasks. It also includes examples of classification and regression tasks. For model weights: [HuggingFace Link](https://huggingface.co/ibm/materials.smi-ted)
|
| 115 |
+
|
| 116 |
+
To load smi-ted, you can simply use:
|
| 117 |
+
|
| 118 |
+
```python
|
| 119 |
+
model = load_smi_ted(
|
| 120 |
+
folder='../inference/smi_ted_light',
|
| 121 |
+
ckpt_filename='smi_ted_light.pt'
|
| 122 |
+
)
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
To encode SMILES into embeddings, you can use:
|
| 126 |
+
|
| 127 |
+
```python
|
| 128 |
+
with torch.no_grad():
|
| 129 |
+
    encoded_embeddings = model.encode(df['SMILES'], return_torch=True)
|
| 130 |
+
```
|
| 131 |
+
To decode embeddings back into SMILES strings, you can use:
|
| 132 |
+
|
| 133 |
+
```python
|
| 134 |
+
with torch.no_grad():
|
| 135 |
+
    decoded_smiles = model.decode(encoded_embeddings)
|
| 136 |
+
```
|
| 137 |
+
|
| 138 |
+
|
models/smi_ted/finetune/args.py
ADDED
|
@@ -0,0 +1,337 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def get_parser(parser=None):
    """Build the command-line parser shared by the SMI-TED fine-tuning scripts.

    Args:
        parser: An existing ``argparse.ArgumentParser`` to extend. When
            ``None`` a fresh parser is created.

    Returns:
        The parser (the one passed in, or the newly created one) with all
        model, training, I/O and fine-tuning options registered.

    Note:
        ``--checkpoints_folder`` is the only required option.
    """
    if parser is None:
        parser = argparse.ArgumentParser()

    # ------------------------------------------------------------------ Model
    parser.add_argument("--n_head", type=int, default=8, help="GPT number of heads")
    parser.add_argument("--n_layer", type=int, default=12, help="GPT number of layers")
    parser.add_argument(
        "--q_dropout", type=float, default=0.5, help="Encoder layers dropout"
    )
    parser.add_argument(
        "--d_dropout", type=float, default=0.1, help="Decoder layers dropout"
    )
    parser.add_argument(
        "--n_embd", type=int, default=768, help="Latent vector dimensionality"
    )
    parser.add_argument(
        "--fc_h", type=int, default=512, help="Fully connected hidden dimensionality"
    )
    parser.add_argument("--n_output", type=int, default=1)

    # ------------------------------------------------------------------ Train
    parser.add_argument("--n_batch", type=int, default=512, help="Batch size")
    parser.add_argument(
        "--unlike_alpha", type=float, default=1.0, help="unlikelihood loss alpha weight"
    )
    parser.add_argument(
        "--from_scratch",
        action="store_true",
        default=False,
        help="train on qm9 from scratch",
    )
    parser.add_argument(
        "--unlikelihood",
        action="store_true",
        default=False,
        help="use unlikelihood loss with gpt pretrain",
    )
    parser.add_argument(
        "--grad_acc",
        type=int,
        default=1,
        help="number of batches to accumulate gradients",
    )
    parser.add_argument(
        "--checkpoint_every",
        type=int,
        default=1000,
        help="save checkpoint every x iterations",
    )
    parser.add_argument(
        "--clip_grad", type=int, default=50, help="Clip gradients to this value"
    )
    # NOTE: ``3 * 1e-4`` is kept verbatim; it is not bit-identical to ``3e-4``.
    parser.add_argument(
        "--lr_start", type=float, default=3 * 1e-4, help="Initial lr value"
    )
    parser.add_argument(
        "--lr_end", type=float, default=3 * 1e-4, help="Maximum lr weight value"
    )
    parser.add_argument(
        "--lr_multiplier", type=int, default=1, help="lr weight multiplier"
    )
    parser.add_argument(
        "--n_last", type=int, default=1000, help="Number of iters to smooth loss calc"
    )
    parser.add_argument("--n_jobs", type=int, default=1, help="Number of threads")
    parser.add_argument(
        "--accelerator",
        type=str,
        default="ddp",
        help="The accelerator backend to use (previously known as distributed_backend)",
    )
    parser.add_argument(
        "--num_nodes",
        type=int,
        default=1,
        help="number of GPU nodes for distributed training",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
        help='Device to run: "cpu" or "cuda:<device number>"',
    )
    parser.add_argument("--seed", type=int, default=12345, help="Seed")
    parser.add_argument(
        "--init_params_from",
        type=str,
        default="",
        help="Path to a ckpt used to initialize the parameters if no restart_path is provided",
    )
    parser.add_argument(
        "--train_decoder_every",
        type=int,
        default=10,
        help="Optimize decoder params every n batches",
    )
    parser.add_argument(
        "--lr_decoder", type=float, default=1e-4, help="Learning rate for decoder part"
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="local_rank for distributed training on gpus",
    )
    parser.add_argument("--gpu", default=None, type=int, help="GPU id to use.")
    parser.add_argument(
        "--dist-backend", default="nccl", type=str, help="distributed backend"
    )
    parser.add_argument(
        "--tensorboard_path", default="./runs/deepspeed", help="tensorboard log dir"
    )

    # ----------------------------------------------------------------- Common
    parser.add_argument(
        "--vocab_load", type=str, required=False, help="Where to load the vocab"
    )
    parser.add_argument(
        "--n_samples", type=int, required=False, help="Number of samples to sample"
    )
    parser.add_argument(
        "--gen_save", type=str, required=False, help="Where to save the gen molecules"
    )
    parser.add_argument(
        "--max_len", type=int, default=100, help="Max length of SMILES"
    )
    # NOTE(review): the help text of --train_load / --val_load looks copy-pasted
    # from --model_load; presumably these point at data, not a model -- confirm.
    parser.add_argument(
        "--train_load", type=str, required=False, help="Where to load the model"
    )
    parser.add_argument(
        "--val_load", type=str, required=False, help="Where to load the model"
    )
    parser.add_argument(
        "--n_workers",
        type=int,
        required=False,
        default=1,
        help="Number of workers",
    )

    # Beam search hyper parameters
    parser.add_argument(
        "--beam_size", type=int, default=0, help="Number of beams to generate"
    )
    parser.add_argument(
        "--num_seq_returned",
        type=int,
        default=0,
        help="number of beams to be returned (must be <= beam_size)",
    )
    parser.add_argument(
        "--min_len", type=int, default=1, help="minimum length to be generated"
    )
    parser.add_argument(
        "--nucleus_thresh", type=float, default=0.9, help="nucleus sampling threshold"
    )
    parser.add_argument(
        "--finetune_path",
        type=str,
        default="",
        help="path to trainer file to continue training",
    )
    parser.add_argument(
        "--restart_path",
        type=str,
        default="",
        help="path to trainer file to continue training",
    )
    parser.add_argument(
        "--data_path", type=str, default="", help="path to pubchem file"
    )
    parser.add_argument(
        "--pretext_size", type=int, default=0, help="number of k-mers to pretext"
    )
    parser.add_argument(
        "--model_save_dir",
        type=str,
        required=False,
        default="./models_dump/",
        help="Where to save the models/log/config/vocab",
    )
    parser.add_argument(
        "--model_save",
        type=str,
        required=False,
        default="model.pt",
        help="Where to save the model",
    )
    parser.add_argument(
        "--num_epoch", type=int, default=1, help="number of epochs to train"
    )
    parser.add_argument(
        "--log_file", type=str, required=False, help="Where to save the log"
    )
    parser.add_argument(
        "--tb_loc",
        type=str,
        required=False,
        help="Where to save the tensorflow location",
    )
    parser.add_argument(
        "--config_save", type=str, required=False, help="Where to save the config"
    )
    parser.add_argument("--vocab_save", type=str, help="Where to save the vocab")

    # ----------------------------------------------------------------- Resume
    parser.add_argument(
        "--debug",
        default=False,
        action="store_true",
        help="do not erase cache at end of program",
    )
    # NOTE(review): no ``type``/``action`` is set, so any value given on the
    # command line arrives as a (truthy) string, including "False" and "0".
    parser.add_argument(
        "--fast_dev_run",
        default=False,
        help="This flag runs a “unit test” by running n if set to n (int) else 1 if set to True training and validation batch(es).",
    )
    parser.add_argument(
        "--freeze_model",
        default=False,
        action="store_true",
        help="freeze weights of bert model during fine tuning",
    )
    parser.add_argument(
        "--resume", default=False, action="store_true", help="Resume from a saved model"
    )
    parser.add_argument(
        "--rotate",
        default=False,
        action="store_true",
        help="use rotational relative embedding",
    )
    parser.add_argument(
        "--model_load", type=str, required=False, help="Where to load the model"
    )
    parser.add_argument(
        "--root_dir", type=str, required=False, default=".", help="location of root dir"
    )
    parser.add_argument(
        "--config_load", type=str, required=False, help="Where to load the config"
    )
    parser.add_argument(
        "--gpus", type=int, required=False, default=1, help="number of gpus to use"
    )
    parser.add_argument(
        "--model_arch",
        type=str,
        required=False,
        help="used to track model arch in params",
    )
    parser.add_argument(
        "--eval_every",
        type=int,
        default=50000,
        help="run evaluation every x iterations",
    )
    parser.add_argument(
        "--num_feats",
        type=int,
        required=False,
        default=32,
        help="number of random features for FAVOR+",
    )
    parser.add_argument(
        "--max_epochs", type=int, required=False, default=1, help="max number of epochs"
    )

    # ------------------------------------------------------------ Fine-tuning
    parser.add_argument(
        "--mode", type=str, default="cls", help="type of pooling to use"
    )
    parser.add_argument("--dataset_length", type=int, default=None, required=False)
    parser.add_argument("--num_workers", type=int, default=0, required=False)
    parser.add_argument("--dropout", type=float, default=0.1, required=False)
    # NOTE(review): this default (and --data_root below) is a site-specific
    # absolute path; callers elsewhere are expected to override it.
    parser.add_argument(
        "--smiles_embedding",
        type=str,
        default="/dccstor/medscan7/smallmolecule/runs/ba-predictor/small-data/embeddings/protein/ba_embeddings_tanh_512_2986138_2.pt",
    )
    parser.add_argument("--dataset_name", type=str, required=False, default="sol")
    parser.add_argument("--measure_name", type=str, required=False, default="measure")
    parser.add_argument("--checkpoints_folder", type=str, required=True)
    parser.add_argument("--model_path", type=str, default="./smi_ted/")
    parser.add_argument("--ckpt_filename", type=str, default="smi_ted_Light_40.pt")
    parser.add_argument("--restart_filename", type=str, default="")
    parser.add_argument("--save_every_epoch", type=int, default=0)
    parser.add_argument("--save_ckpt", type=int, default=1)
    parser.add_argument("--start_seed", type=int, default=0)
    parser.add_argument("--smi_ted_version", type=str, default="v1")
    parser.add_argument("--train_decoder", type=int, default=1)
    parser.add_argument("--target_metric", type=str, default="rmse")
    parser.add_argument("--loss_fn", type=str, default="mae")

    parser.add_argument(
        "--data_root",
        type=str,
        required=False,
        default="/dccstor/medscan7/smallmolecule/runs/ba-predictor/small-data/affinity",
    )
    parser.add_argument("--use_linear", type=int, default=0)

    parser.add_argument("--lr", type=float, default=0.001)
    parser.add_argument("--batch_size", type=int, default=64)

    return parser
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
def parse_args():
    """Parse the process command line with the parser from ``get_parser``.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()``.
    """
    return get_parser().parse_args()
|
models/smi_ted/finetune/finetune_classification.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Deep learning
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from torch import optim
|
| 5 |
+
from trainers import TrainerClassifier
|
| 6 |
+
from utils import get_optim_groups
|
| 7 |
+
|
| 8 |
+
# Data
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import numpy as np
|
| 11 |
+
|
| 12 |
+
# Standard library
|
| 13 |
+
import args
|
| 14 |
+
import os
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def main(config):
    """Fine-tune SMI-TED on a single-target classification dataset.

    Reads ``train.csv``, ``valid.csv`` and ``test.csv`` from
    ``config.data_root``, loads the checkpoint selected by
    ``config.smi_ted_version``, then fits and evaluates a
    ``TrainerClassifier``.

    Args:
        config: Parsed command-line namespace (see ``args.get_parser``).

    Raises:
        ValueError: If ``config.loss_fn`` or ``config.smi_ted_version`` is not
            supported. Raised before any data or checkpoint is loaded.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Validate the configuration up front so that a bad value fails fast with
    # a clear message instead of an UnboundLocalError after data/model loading.
    if config.loss_fn == 'crossentropy':
        loss_function = nn.CrossEntropyLoss()
    else:
        raise ValueError(
            f"Unsupported loss_fn {config.loss_fn!r} for classification; "
            "expected 'crossentropy'."
        )
    if config.smi_ted_version not in ('v1', 'v2'):
        raise ValueError(
            f"Unsupported smi_ted_version {config.smi_ted_version!r}; "
            "expected 'v1' or 'v2'."
        )

    # load dataset
    df_train = pd.read_csv(f"{config.data_root}/train.csv")
    df_valid = pd.read_csv(f"{config.data_root}/valid.csv")
    df_test = pd.read_csv(f"{config.data_root}/test.csv")

    # load model ('v1' -> light variant, 'v2' -> large variant)
    if config.smi_ted_version == 'v1':
        from smi_ted_light.load import load_smi_ted
    else:
        from smi_ted_large.load import load_smi_ted

    model = load_smi_ted(folder=config.model_path, ckpt_filename=config.ckpt_filename, n_output=config.n_output, eval=False)
    # Apply the model's own initialiser to every submodule of ``model.net``.
    model.net.apply(model._init_weights)
    print(model.net)

    lr = config.lr_start*config.lr_multiplier
    optim_groups = get_optim_groups(model, keep_decoder=bool(config.train_decoder))

    # init trainer
    trainer = TrainerClassifier(
        raw_data=(df_train, df_valid, df_test),
        dataset_name=config.dataset_name,
        target=config.measure_name,
        batch_size=config.n_batch,
        hparams=config,
        target_metric=config.target_metric,
        seed=config.start_seed,
        smi_ted_version=config.smi_ted_version,
        checkpoints_folder=config.checkpoints_folder,
        restart_filename=config.restart_filename,
        device=device,
        save_every_epoch=bool(config.save_every_epoch),
        save_ckpt=bool(config.save_ckpt)
    )
    trainer.compile(
        model=model,
        optimizer=optim.AdamW(optim_groups, lr=lr, betas=(0.9, 0.99)),
        loss_fn=loss_function
    )
    trainer.fit(max_epochs=config.max_epochs)
    trainer.evaluate()


if __name__ == '__main__':
    parser = args.get_parser()
    config = parser.parse_args()
    main(config)
|
models/smi_ted/finetune/finetune_classification_multitask.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Deep learning
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from torch import optim
|
| 5 |
+
from trainers import TrainerClassifierMultitask
|
| 6 |
+
from utils import get_optim_groups
|
| 7 |
+
|
| 8 |
+
# Data
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import numpy as np
|
| 11 |
+
|
| 12 |
+
# Standard library
|
| 13 |
+
import args
|
| 14 |
+
import os
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Target columns for each supported multitask dataset.
_DATASET_TARGETS = {
    'tox21': [
        'NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD',
        'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'
    ],
    'clintox': ['FDA_APPROVED', 'CT_TOX'],
    'sider': [
        'Hepatobiliary disorders', 'Metabolism and nutrition disorders',
        'Product issues', 'Eye disorders', 'Investigations',
        'Musculoskeletal and connective tissue disorders',
        'Gastrointestinal disorders', 'Social circumstances',
        'Immune system disorders', 'Reproductive system and breast disorders',
        'Neoplasms benign, malignant and unspecified (incl cysts and polyps)',
        'General disorders and administration site conditions',
        'Endocrine disorders', 'Surgical and medical procedures',
        'Vascular disorders', 'Blood and lymphatic system disorders',
        'Skin and subcutaneous tissue disorders',
        'Congenital, familial and genetic disorders', 'Infections and infestations',
        'Respiratory, thoracic and mediastinal disorders', 'Psychiatric disorders',
        'Renal and urinary disorders',
        'Pregnancy, puerperium and perinatal conditions',
        'Ear and labyrinth disorders', 'Cardiac disorders',
        'Nervous system disorders', 'Injury, poisoning and procedural complications'
    ],
    'muv': [
        'MUV-466', 'MUV-548', 'MUV-600', 'MUV-644', 'MUV-652', 'MUV-689',
        'MUV-692', 'MUV-712', 'MUV-713', 'MUV-733', 'MUV-737', 'MUV-810',
        'MUV-832', 'MUV-846', 'MUV-852', 'MUV-858', 'MUV-859'
    ],
}


def main(config):
    """Fine-tune SMI-TED on a multitask classification dataset.

    Selects the target columns from ``config.dataset_name``, overwrites
    ``config.n_output`` with their count, reads ``train.csv``, ``valid.csv``
    and ``test.csv`` from ``config.data_root``, loads the checkpoint selected
    by ``config.smi_ted_version``, then fits and evaluates a
    ``TrainerClassifierMultitask``.

    Args:
        config: Parsed command-line namespace (see ``args.get_parser``).
            Mutated in place: ``n_output`` is set to the number of targets.

    Raises:
        ValueError: If ``config.dataset_name``, ``config.loss_fn`` or
            ``config.smi_ted_version`` is not supported. Raised before any
            data or checkpoint is loaded.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Define targets; an unknown dataset used to surface later as an
    # UnboundLocalError on ``targets``.
    try:
        targets = _DATASET_TARGETS[config.dataset_name]
    except KeyError:
        raise ValueError(
            f"Unsupported dataset_name {config.dataset_name!r}; "
            f"expected one of {sorted(_DATASET_TARGETS)}."
        ) from None
    config.n_output = len(targets)

    # Validate the remaining options before any expensive I/O.
    if config.loss_fn == 'bceloss':
        loss_function = nn.BCELoss()
    else:
        raise ValueError(
            f"Unsupported loss_fn {config.loss_fn!r} for multitask "
            "classification; expected 'bceloss'."
        )
    if config.smi_ted_version not in ('v1', 'v2'):
        raise ValueError(
            f"Unsupported smi_ted_version {config.smi_ted_version!r}; "
            "expected 'v1' or 'v2'."
        )

    # load dataset
    df_train = pd.read_csv(f"{config.data_root}/train.csv")
    df_valid = pd.read_csv(f"{config.data_root}/valid.csv")
    df_test = pd.read_csv(f"{config.data_root}/test.csv")

    # load model ('v1' -> light variant, 'v2' -> large variant)
    if config.smi_ted_version == 'v1':
        from smi_ted_light.load import load_smi_ted
    else:
        from smi_ted_large.load import load_smi_ted

    model = load_smi_ted(folder=config.model_path, ckpt_filename=config.ckpt_filename, n_output=len(targets), eval=False)
    # Apply the model's own initialiser to every submodule of ``model.net``.
    model.net.apply(model._init_weights)
    print(model.net)

    lr = config.lr_start*config.lr_multiplier
    optim_groups = get_optim_groups(model, keep_decoder=bool(config.train_decoder))

    # init trainer
    trainer = TrainerClassifierMultitask(
        raw_data=(df_train, df_valid, df_test),
        dataset_name=config.dataset_name,
        target=targets,
        batch_size=config.n_batch,
        hparams=config,
        target_metric=config.target_metric,
        seed=config.start_seed,
        smi_ted_version=config.smi_ted_version,
        checkpoints_folder=config.checkpoints_folder,
        restart_filename=config.restart_filename,
        device=device,
        save_every_epoch=bool(config.save_every_epoch),
        save_ckpt=bool(config.save_ckpt)
    )
    trainer.compile(
        model=model,
        optimizer=optim.AdamW(optim_groups, lr=lr, betas=(0.9, 0.99)),
        loss_fn=loss_function
    )
    trainer.fit(max_epochs=config.max_epochs)
    trainer.evaluate()


if __name__ == '__main__':
    parser = args.get_parser()
    config = parser.parse_args()
    main(config)
|
models/smi_ted/finetune/finetune_regression.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Deep learning
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from torch import optim
|
| 5 |
+
from trainers import TrainerRegressor
|
| 6 |
+
from utils import RMSELoss, get_optim_groups
|
| 7 |
+
|
| 8 |
+
# Data
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import numpy as np
|
| 11 |
+
|
| 12 |
+
# Standard library
|
| 13 |
+
import args
|
| 14 |
+
import os
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def main(config):
    """Fine-tune SMI-TED on a regression dataset.

    Reads ``train.csv``, ``valid.csv`` and ``test.csv`` from
    ``config.data_root``, loads the checkpoint selected by
    ``config.smi_ted_version``, then fits and evaluates a
    ``TrainerRegressor``.

    Args:
        config: Parsed command-line namespace (see ``args.get_parser``).

    Raises:
        ValueError: If ``config.loss_fn`` or ``config.smi_ted_version`` is not
            supported. Raised before any data or checkpoint is loaded.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Validate the configuration up front so that a bad value fails fast with
    # a clear message instead of an UnboundLocalError after data/model loading.
    if config.loss_fn == 'rmse':
        loss_function = RMSELoss()
    elif config.loss_fn == 'mae':
        loss_function = nn.L1Loss()
    else:
        raise ValueError(
            f"Unsupported loss_fn {config.loss_fn!r} for regression; "
            "expected 'rmse' or 'mae'."
        )
    if config.smi_ted_version not in ('v1', 'v2'):
        raise ValueError(
            f"Unsupported smi_ted_version {config.smi_ted_version!r}; "
            "expected 'v1' or 'v2'."
        )

    # load dataset
    df_train = pd.read_csv(f"{config.data_root}/train.csv")
    df_valid = pd.read_csv(f"{config.data_root}/valid.csv")
    df_test = pd.read_csv(f"{config.data_root}/test.csv")

    # load model ('v1' -> light variant, 'v2' -> large variant)
    if config.smi_ted_version == 'v1':
        from smi_ted_light.load import load_smi_ted
    else:
        from smi_ted_large.load import load_smi_ted

    model = load_smi_ted(folder=config.model_path, ckpt_filename=config.ckpt_filename, n_output=config.n_output, eval=False)
    # Apply the model's own initialiser to every submodule of ``model.net``.
    model.net.apply(model._init_weights)
    print(model.net)

    lr = config.lr_start*config.lr_multiplier
    optim_groups = get_optim_groups(model, keep_decoder=bool(config.train_decoder))

    # init trainer
    trainer = TrainerRegressor(
        raw_data=(df_train, df_valid, df_test),
        dataset_name=config.dataset_name,
        target=config.measure_name,
        batch_size=config.n_batch,
        hparams=config,
        target_metric=config.target_metric,
        seed=config.start_seed,
        smi_ted_version=config.smi_ted_version,
        checkpoints_folder=config.checkpoints_folder,
        restart_filename=config.restart_filename,
        device=device,
        save_every_epoch=bool(config.save_every_epoch),
        save_ckpt=bool(config.save_ckpt)
    )
    trainer.compile(
        model=model,
        optimizer=optim.AdamW(optim_groups, lr=lr, betas=(0.9, 0.99)),
        loss_fn=loss_function
    )
    trainer.fit(max_epochs=config.max_epochs)
    trainer.evaluate()


if __name__ == '__main__':
    parser = args.get_parser()
    config = parser.parse_args()
    main(config)
|
models/smi_ted/finetune/moleculenet/bace/test.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e3af97c680375dd09349c63b4779b35166212302e79e4fc7a1752ef5d71cf35b
|
| 3 |
+
size 400436
|
models/smi_ted/finetune/moleculenet/bace/train.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b5b3426e84dc7e2f40f2cf9d15d4d38328126c07f49c215cfb4fb657f69200de
|
| 3 |
+
size 3109699
|
models/smi_ted/finetune/moleculenet/bace/valid.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:813c8f2af5a1058568cf60b7021b8b2cd818a17944afd0b09f9d838e36ee985d
|
| 3 |
+
size 397085
|
models/smi_ted/finetune/moleculenet/bbbp/test.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cca4161c44535fd0f8ff917cc68d26703da7fbce19ddecb7dc5f7ae4b4d241a6
|
| 3 |
+
size 14874
|
models/smi_ted/finetune/moleculenet/bbbp/train.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7300807bf21ea1177efd81c218e43275ed00b6c3006b5dae7625f774edb6b1a6
|
| 3 |
+
size 115549
|
models/smi_ted/finetune/moleculenet/bbbp/valid.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:af39cc3735a356010a072e1e196a64eca6e0d88f0b2a023d4dc1adba7030ce40
|
| 3 |
+
size 15655
|
models/smi_ted/finetune/moleculenet/biodegradability/biodeg_example.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c98992c1c22ae7468a41fb7bc86c775ccc30fa29e50053bb148ffc2f2d95551e
|
| 3 |
+
size 6352
|
models/smi_ted/finetune/moleculenet/biodegradability/biodegradability.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8ec61887444a0e8925b16cca48433c3b3bff1ac5cf08f448d6b64bbdbc14a318
|
| 3 |
+
size 416181
|
models/smi_ted/finetune/moleculenet/biodegradability/test.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:86c2f7f39add0fff77358454c0f1b289a233e4a78d50b7f005ec2dc1c632d473
|
| 3 |
+
size 84488
|
models/smi_ted/finetune/moleculenet/biodegradability/train.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1a4a94ae0f8c134ce10f2d853eced84d031a4e7b394662344a9141e7567b3eb2
|
| 3 |
+
size 252230
|
models/smi_ted/finetune/moleculenet/biodegradability/valid.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:09e827ee7e55544f5b327d5e2ef2d9fe09e3f62024e1316b6e71d1fc9be275a1
|
| 3 |
+
size 85290
|
models/smi_ted/finetune/moleculenet/clintox/test.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:963a05e8eeaaa38fd3688f448dfc28cd0917ea280b1b9cb5b4297244f7f68fe2
|
| 3 |
+
size 10219
|
models/smi_ted/finetune/moleculenet/clintox/train.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:04bbee4a0d7fb4942292c9581f318909d06508d529a4a3a76590e6749417c1a7
|
| 3 |
+
size 74357
|
models/smi_ted/finetune/moleculenet/clintox/valid.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f3e2b9ab566ffc184c0590002bfbd6a42e6522209e6d6271968262844dde2905
|
| 3 |
+
size 10255
|
models/smi_ted/finetune/moleculenet/esol/test.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7da41a7eab447fdfd163292b4a5eb8ef09a747fc82b0f1cc5c468e46b1b2ef5a
|
| 3 |
+
size 9999
|
models/smi_ted/finetune/moleculenet/esol/train.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:784ba31de05a43ecab98260c94a47e2c807f4d65c0f93d9a88fbd962515976c5
|
| 3 |
+
size 77154
|
models/smi_ted/finetune/moleculenet/esol/valid.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bc30e7fa1f774e27ed56de7cfd77e21f07a5a2c38fcc6d928c0084a9a99181e5
|
| 3 |
+
size 9892
|
models/smi_ted/finetune/moleculenet/freesolv/test.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c8212c391ccbff3722a11d1bd3752b3a9dd187f2a7b33f8b9d2d594950b188d7
|
| 3 |
+
size 3223
|
models/smi_ted/finetune/moleculenet/freesolv/train.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f3b781e5d03dbd7d272347288161f92e8e66c628da50e3e2bc06de12225de22d
|
| 3 |
+
size 25053
|
models/smi_ted/finetune/moleculenet/freesolv/valid.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3b35d9c13a02291eefe85bd4b048ccc28f5326a3b018beb937aba12067b072d2
|
| 3 |
+
size 3151
|