biocynthia-demo / data_utils.py
latticetower's picture
fix typos
ca699c2
"""Temporary placeholder to load
"""
from dataclasses import dataclass
from typing import List
import pandas as pd
import numpy as np
@dataclass
class Domain:
    """A named domain together with its sequence string."""

    name: str
    sequence: str

    def to_dict(self):
        """Return the domain as a plain dict with 'name' and 'sequence' keys."""
        return {key: getattr(self, key) for key in ('name', 'sequence')}
# Domain columns looked up on every row by NRPSModule.parse, in this order.
DOMAIN_NAMES = [
    'Condensation',
    'Heterocyclization',
    'AMP-binding',
    'PCP',
    'Epimerization',
    'Thioesterase',
]
@dataclass
class NRPSModule:
    """One NRPS module: an id, its domains, and start/final flags."""

    module_id: int
    domains_list: List[Domain]
    is_start: bool = False
    is_final: bool = False

    @classmethod
    def parse(cls, row):
        """Build a module from a table row.

        ``row`` must be indexable by every name in ``DOMAIN_NAMES`` and expose
        ``module_id`` as an attribute. Only cells holding a ``str`` become
        domains; any other value (presumably NaN for a missing domain) is
        skipped.
        """
        cells = ((column, row[column]) for column in DOMAIN_NAMES)
        found = [
            Domain(column, value)
            for column, value in cells
            if isinstance(value, str)
        ]
        present = {domain.name for domain in found}
        # A module carrying either of these domains is an elongation module,
        # so it cannot be a start module.
        is_start = present.isdisjoint(('Condensation', 'Heterocyclization'))
        is_final = "Thioesterase" in present
        return cls(row.module_id, found, is_start, is_final)

    def to_list(self):
        """Return the domains as a list of dicts, in stored order."""
        return [item.to_dict() for item in self.domains_list]

    def to_start_module(self):
        """Return a start-module version of this module.

        A module already flagged as start is returned unchanged (same object).
        Otherwise a new module is built with the same id and with the
        Condensation, Heterocyclization and Thioesterase domains removed.
        """
        if self.is_start:
            return self
        excluded = ('Condensation', 'Heterocyclization', 'Thioesterase')
        kept = [item for item in self.domains_list if item.name not in excluded]
        return NRPSModule(self.module_id, kept, is_start=True, is_final=False)
def load_monomers(filename: str = "./data/monomers_list.txt") -> list[str]:
    """
    Read monomer names from a text file, one per line.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are dropped. File order is preserved.
    """
    with open(filename, 'r') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [entry for entry in stripped_lines if len(entry) > 0]
# Login using e.g. `huggingface-cli login` to access this dataset
class PseudoGenerationWrapper:
    """
    Stand-in generator: for a requested monomer it returns one of the known
    modules, picked at random from a predefined collection.
    """

    def __init__(self) -> None:
        """Read the modules table and index it per monomer."""
        modules_table = pd.read_csv(
            "hf://datasets/latticetower/nrps_modules_asdb4.0/nrps_modules_info_cleaned.csv"
        )
        self.monomer2modules_list = self.prepare_data(modules_table)

    def prepare_data(self, df: pd.DataFrame) -> dict:
        """Group parsed modules by ``monomer_name`` and by role.

        Returns a dict mapping each monomer to
        ``{'start': [...], 'elongation': [...], 'final': [...]}``.
        A module flagged both start and final is stored under 'start'.
        """
        grouped = {}
        for monomer, monomer_rows in df.groupby("monomer_name"):
            # Roles are kept apart on the assumption that start, elongation and
            # final modules may differ in ways beyond which domains are
            # present; what those differences are is not yet known.
            buckets = {'start': [], 'elongation': [], 'final': []}
            for _, row in monomer_rows.iterrows():
                parsed = NRPSModule.parse(row)
                if parsed.is_start:
                    role = 'start'
                elif parsed.is_final:
                    role = 'final'
                else:
                    role = 'elongation'
                buckets[role].append(parsed)
            grouped[monomer] = buckets
        return grouped

    def suggest_module(self, monomer: str, is_start=False, is_final=False):
        """Return a randomly chosen known module for ``monomer`` as a list of domain dicts.

        ``is_start`` / ``is_final`` select the start or final pool; when both
        are set, ``is_start`` wins. If the requested pool is empty, an
        elongation module is used instead (converted with ``to_start_module()``
        in the start case, returned as-is in the final case).

        Raises AssertionError if the monomer is unknown or no suitable module
        exists.
        """
        assert monomer in self.monomer2modules_list, f"{monomer} is unknown"
        pools = self.monomer2modules_list[monomer]
        # Start modules could in principle always be derived (e.g. by omitting
        # the condensation domain), but for simplicity the start, elongation
        # and final pools are kept distinct. Peptides made of a single module
        # are not handled here; they are considered uninteresting and ignored.
        if is_start:
            start_pool = pools['start']
            if len(start_pool) == 0:
                fallback = pools['elongation']
                assert len(fallback) > 0, f"{monomer} has no known information on starting modules"
                derived = np.random.choice(fallback, 1)[0].to_start_module()
                return derived.to_list()
            drawn = np.random.choice(start_pool, 1)
            return drawn[0].to_list()

        if is_final:
            final_pool = pools['final']
            if len(final_pool) == 0:
                fallback = pools['elongation']
                assert len(fallback) > 0, f"{monomer} has no known information on final modules"
                substitute = np.random.choice(fallback, 1)[0]
                return substitute.to_list()
            drawn = np.random.choice(final_pool, 1)
            return drawn[0].to_list()

        elongation_pool = pools['elongation']
        assert len(elongation_pool) > 0, f"{monomer} has no known information on elongation modules"
        drawn = np.random.choice(elongation_pool, 1)
        return drawn[0].to_list()
# Shared module-level instance. Constructing it runs PseudoGenerationWrapper.__init__,
# which reads the modules CSV from an `hf://` dataset path, so importing this
# file triggers that load.
module_generator = PseudoGenerationWrapper()