Spaces:
Running
Running
| # Deep learning | |
| import torch | |
| from torch.utils.data import Dataset | |
| from sklearn.metrics import confusion_matrix | |
| # Data | |
| import pandas as pd | |
| import numpy as np | |
| # Standard library | |
| import os | |
| # Chemistry | |
| from rdkit import Chem | |
| from rdkit.Chem import PandasTools | |
| from rdkit.Chem import Descriptors | |
| # Import-time side effect: calls RDKit's PandasTools.RenderImagesInAllDataFrames(True); presumably turns on molecule image rendering for every DataFrame (confirm against RDKit docs). | |
| PandasTools.RenderImagesInAllDataFrames(True) | |
def normalize_smiles(smi, canonical=True, isomeric=False):
    """Return a normalized SMILES string for ``smi``, or ``None`` on failure.

    The input is parsed with ``Chem.MolFromSmiles`` and written back out with
    ``Chem.MolToSmiles``.

    Args:
        smi: SMILES string to normalize.
        canonical: Forwarded to ``Chem.MolToSmiles`` as ``canonical``.
        isomeric: Forwarded to ``Chem.MolToSmiles`` as ``isomericSmiles``.

    Returns:
        The re-written SMILES string, or ``None`` when ``smi`` cannot be
        parsed or written.
    """
    try:
        mol = Chem.MolFromSmiles(smi)
        # RDKit reports an unparsable SMILES by returning None rather than
        # raising, so handle that case explicitly.
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=canonical, isomericSmiles=isomeric)
    except Exception:
        # Best-effort helper: any parsing/argument error maps to None.
        # Unlike a bare ``except:``, KeyboardInterrupt and SystemExit still
        # propagate to the caller.
        return None
class RMSELoss:
    """Callable that computes the root-mean-square error between two tensors."""

    def __init__(self):
        # Stateless: there is nothing to configure.
        pass

    def __call__(self, yhat, y):
        """Return ``sqrt(mean((yhat - y) ** 2))`` computed with torch ops."""
        residual = yhat - y
        mean_squared = torch.mean(torch.pow(residual, 2))
        return torch.sqrt(mean_squared)
def RMSE(predictions, targets):
    """Return the root-mean-square error between ``predictions`` and ``targets``.

    The inputs must support element-wise subtraction and the result must
    expose a ``.mean()`` method (e.g. NumPy arrays).
    """
    residual = predictions - targets
    mean_squared = (residual ** 2).mean()
    return np.sqrt(mean_squared)
def sensitivity(y_true, y_pred):
    """Return the true-positive rate, ``tp / (tp + fn)``.

    The counts come from unpacking ``confusion_matrix(y_true, y_pred).ravel()``
    as ``tn, fp, fn, tp``, so the matrix must contain exactly four cells.
    """
    counts = confusion_matrix(y_true, y_pred).ravel()
    _tn, _fp, false_neg, true_pos = counts
    return true_pos / (true_pos + false_neg)
def specificity(y_true, y_pred):
    """Return the true-negative rate, ``tn / (tn + fp)``.

    The counts come from unpacking ``confusion_matrix(y_true, y_pred).ravel()``
    as ``tn, fp, fn, tp``, so the matrix must contain exactly four cells.
    """
    counts = confusion_matrix(y_true, y_pred).ravel()
    true_neg, false_pos, _fn, _tp = counts
    return true_neg / (true_neg + false_pos)
def get_optim_groups(module, keep_decoder=False, *, weight_decay=0.0):
    """Split ``module``'s parameters into two optimizer parameter groups.

    Parameters are classified by name and by the type of a module that
    exposes them:

    * names ending in ``bias`` go to the no-decay group;
    * names ending in ``weight`` on a ``torch.nn.Linear`` go to the decay
      group;
    * names ending in ``weight`` on a ``torch.nn.LayerNorm`` or
      ``torch.nn.Embedding`` go to the no-decay group.

    Parameters matching none of these rules are left out of both groups; no
    check is made that every parameter was classified.

    Args:
        module: The ``torch.nn.Module`` whose parameters are grouped.
        keep_decoder: When false, every parameter whose full name contains
            the substring ``'decoder'`` is skipped.
        weight_decay: Value stored as ``"weight_decay"`` on the decay group.
            The default of ``0.0`` reproduces the previous hard-coded
            behaviour, in which neither group is decayed.

    Returns:
        A two-element list of dicts with ``"params"`` and ``"weight_decay"``
        keys: the decay group first, then the no-decay group. Parameters in
        each group are ordered by their sorted full names.
    """
    decay = set()
    no_decay = set()
    whitelist_weight_modules = (torch.nn.Linear,)
    blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)

    for mn, m in module.named_modules():
        # named_parameters() recurses, so a parameter is seen once per
        # ancestor module; the sets absorb the repeated names.
        for pn, _ in m.named_parameters():
            fpn = f"{mn}.{pn}" if mn else pn  # full param name
            if not keep_decoder and 'decoder' in fpn:
                # exclude decoder components
                continue
            if pn.endswith('bias'):
                # all biases will not be decayed
                no_decay.add(fpn)
            elif pn.endswith('weight'):
                if isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules go to the decay group
                    decay.add(fpn)
                elif isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

    # Map full names back to the parameter objects.
    param_dict = dict(module.named_parameters())

    return [
        {
            "params": [param_dict[pn] for pn in sorted(decay)],
            "weight_decay": weight_decay,
        },
        {
            "params": [param_dict[pn] for pn in sorted(no_decay)],
            "weight_decay": 0.0,
        },
    ]
class CustomDataset(Dataset):
    """Single-target dataset yielding ``(smiles, label)`` pairs.

    ``dataset`` is indexed by column name and then positionally via ``.iloc``
    (presumably a pandas DataFrame; confirm with callers). It must provide a
    ``'canon_smiles'`` column and the column named by ``target``.
    """

    def __init__(self, dataset, target):
        self.target = target
        self.dataset = dataset

    def __len__(self):
        """Return the number of rows in the wrapped table."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the SMILES entry and the target value at position ``idx``."""
        table = self.dataset
        return table['canon_smiles'].iloc[idx], table[self.target].iloc[idx]
class CustomDatasetMultitask(Dataset):
    """Multi-target dataset yielding ``(smiles, labels, mask)`` triples.

    ``dataset`` is indexed by column name(s) and then positionally via
    ``.iloc`` (presumably a pandas DataFrame; confirm with callers). It must
    provide a ``'canon_smiles'`` column and the columns listed in ``targets``.
    """

    def __init__(self, dataset, targets):
        self.targets = targets
        self.dataset = dataset

    def __len__(self):
        """Return the number of rows in the wrapped table."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the SMILES entry, label tensor and mask tensor at ``idx``.

        NaN target values become ``0.0`` in the label tensor and ``0.0`` in
        the mask; every other position has mask ``1.0``. Labels are returned
        as ``torch.float32``.
        """
        table = self.dataset
        smiles = table['canon_smiles'].iloc[idx]
        row = table[self.targets].iloc[idx].to_numpy()

        labels = []
        mask = []
        for value in row:
            missing = np.isnan(value)
            labels.append(0.0 if missing else value)
            mask.append(0.0 if missing else 1.0)

        return smiles, torch.tensor(labels, dtype=torch.float32), torch.tensor(mask)