"""Temporary placeholder to load """ from dataclasses import dataclass from typing import List import pandas as pd import numpy as np @dataclass class Domain: name: str sequence: str def to_dict(self): return {'name': self.name, 'sequence': self.sequence} DOMAIN_NAMES = [ 'Condensation', 'Heterocyclization', 'AMP-binding', 'PCP', 'Epimerization', 'Thioesterase' ] @dataclass class NRPSModule: module_id: int domains_list: List[Domain] is_start: bool = False is_final: bool = False @classmethod def parse(cls, row): # print(row) domains_list = [] is_start = True is_final = False for domain_name in DOMAIN_NAMES: seq = row[domain_name] if isinstance(seq, str): domain = Domain(domain_name, seq) domains_list.append(domain) if domain_name in ('Condensation', 'Heterocyclization'): # elongation module is_start = False if domain_name == "Thioesterase": is_final = True return cls(row.module_id, domains_list, is_start, is_final) def to_list(self): return [domain.to_dict() for domain in self.domains_list] def to_start_module(self): if self.is_start: return self domains_list = self.domains_list domains_list = [domain for domain in domains_list if not domain.name in ('Condensation', 'Heterocyclization')] domains_list = [domain for domain in domains_list if domain.name != 'Thioesterase'] return NRPSModule(self.module_id, domains_list, is_start=True, is_final=False) def load_monomers(filename: str = "./data/monomers_list.txt") -> list[str]: """ Loads monomer names from a file. """ monomers = [] with open(filename, 'r') as file: for line in file: line = line.strip() if len(line) > 0: monomers.append(line) return monomers # Login using e.g. `huggingface-cli login` to access this dataset class PseudoGenerationWrapper: """ Let's start with the wrapper which picks one of the modules randomly from the predefined collection. """ def __init__(self) -> None: nrp_modules_df = pd.read_csv("hf://datasets/latticetower/nrps_modules_asdb4.0/nrps_modules_info_cleaned.csv") self.monomer2modules_list = self.prepare_data(nrp_modules_df) def prepare_data(self, df: pd.DataFrame) -> dict: monomer2modules = dict() for monomer, row_df in df.groupby("monomer_name"): modules_lists = {'start': [], 'elongation': [], 'final': []} # let's suppose that there is a difference between module composition aside from presence/absence of specific domains, # which is yet unknown for row_index, row in row_df.iterrows(): nrps_module = NRPSModule.parse(row) if nrps_module.is_start: modules_lists['start'].append(nrps_module) elif nrps_module.is_final: modules_lists['final'].append(nrps_module) else: modules_lists['elongation'].append(nrps_module) monomer2modules[monomer] = modules_lists return monomer2modules def suggest_module(self, monomer: str, is_start=False, is_final=False): """for some monomer simply returns random module (that's a dirty hack you didn't expect!) from the list of known ones. Flags is_start and is_final are needed to take into account specific structure of start and final modules """ assert monomer in self.monomer2modules_list, f"{monomer} is unknown" monomer_dict = self.monomer2modules_list[monomer] # in fact the following might be simply generated # (we can try to omin condensation domain and call it a starting module, for example), # but for simplicity distinguish start, elongation and final parts. # i didn't processed the case where there is only 1 module present, let's ignore this for now # (we consider peptides consisted from one monomer to be uninteresting ones and simply omit) if is_start: candidate_modules = monomer_dict['start'] if len(candidate_modules) > 0: candidate_module = np.random.choice(candidate_modules, 1) return candidate_module[0].to_list() candidate_modules = monomer_dict['elongation'] assert len(candidate_modules) > 0, f"{monomer} has no known information on starting modules" candidate_module = np.random.choice(candidate_modules, 1)[0].to_start_module() return candidate_module.to_list() if is_final: candidate_modules = monomer_dict['final'] if len(candidate_modules) > 0: candidate_module = np.random.choice(candidate_modules, 1) return candidate_module[0].to_list() candidate_modules = monomer_dict['elongation'] assert len(candidate_modules) > 0, f"{monomer} has no known information on final modules" candidate_module = np.random.choice(candidate_modules, 1)[0] return candidate_module.to_list() candidate_modules = monomer_dict['elongation'] assert len(candidate_modules) > 0, f"{monomer} has no known information on elongation modules" return np.random.choice(candidate_modules, 1)[0].to_list() module_generator = PseudoGenerationWrapper()