|
"""Temporary placeholder to load |
|
""" |
|
from dataclasses import dataclass |
|
from typing import List |
|
|
|
import pandas as pd |
|
import numpy as np |
|
|
|
|
|
@dataclass
class Domain:
    """A single named domain of an NRPS module together with its sequence."""

    name: str
    sequence: str

    def to_dict(self):
        """Return the domain as a plain dict with 'name' and 'sequence' keys."""
        return dict(name=self.name, sequence=self.sequence)
|
|
|
|
|
# Domain column names, in the order in which NRPSModule.parse scans a row.
DOMAIN_NAMES = [
    'Condensation',
    'Heterocyclization',
    'AMP-binding',
    'PCP',
    'Epimerization',
    'Thioesterase',
]
|
|
|
@dataclass
class NRPSModule:
    """One NRPS module: an id, its domains, and start/final position flags."""

    module_id: int
    domains_list: List[Domain]
    is_start: bool = False
    is_final: bool = False

    @classmethod
    def parse(cls, row):
        """Build a module from a row with one entry per ``DOMAIN_NAMES`` name.

        A domain is recorded only when the row's value for that name is a
        ``str``; any other value is skipped. The module is flagged as a start
        module when neither 'Condensation' nor 'Heterocyclization' was
        recorded, and as final when 'Thioesterase' was recorded. The id is
        read from ``row.module_id``.
        """
        found = []
        for column in DOMAIN_NAMES:
            value = row[column]
            if not isinstance(value, str):
                continue
            found.append(Domain(column, value))

        found_names = {domain.name for domain in found}
        starts_chain = found_names.isdisjoint(('Condensation', 'Heterocyclization'))
        ends_chain = "Thioesterase" in found_names
        return cls(row.module_id, found, starts_chain, ends_chain)

    def to_list(self):
        """Return the module's domains as a list of dicts, in stored order."""
        return [entry.to_dict() for entry in self.domains_list]

    def to_start_module(self):
        """Return a start-module version of this module.

        A module already flagged as start is returned unchanged (the same
        object). Otherwise a new module is built with the same id, without
        the 'Condensation', 'Heterocyclization' and 'Thioesterase' domains,
        flagged ``is_start=True`` and ``is_final=False``.
        """
        if self.is_start:
            return self
        removed = ('Condensation', 'Heterocyclization', 'Thioesterase')
        remaining = [entry for entry in self.domains_list if entry.name not in removed]
        return NRPSModule(self.module_id, remaining, is_start=True, is_final=False)
|
|
|
|
|
def load_monomers(filename: str = "./data/monomers_list.txt") -> list[str]:
    """Read monomer names from a text file, one name per line.

    Every line is stripped of surrounding whitespace, and lines that are
    empty after stripping are dropped. File order is preserved.
    """
    with open(filename, 'r') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        # Consume the generator while the file is still open.
        return [name for name in stripped_lines if name]
|
|
|
|
|
|
|
|
|
class PseudoGenerationWrapper:
    """
    Baseline "generator": picks one module at random from a predefined
    collection of known modules for the requested monomer.
    """

    def __init__(self) -> None:
        """Read the modules CSV and index its rows by monomer name."""
        source = "hf://datasets/latticetower/nrps_modules_asdb4.0/nrps_modules_info_cleaned.csv"
        self.monomer2modules_list = self.prepare_data(pd.read_csv(source))

    def prepare_data(self, df: pd.DataFrame) -> dict:
        """Group the rows of ``df`` by 'monomer_name' and bucket the parsed modules.

        Returns a dict mapping each monomer to
        ``{'start': [...], 'elongation': [...], 'final': [...]}``. A module
        flagged as start goes to 'start' (even if it is also flagged final),
        otherwise a final one goes to 'final', and everything else goes to
        'elongation'.
        """
        index = {}
        for monomer, monomer_rows in df.groupby("monomer_name"):
            buckets = {'start': [], 'elongation': [], 'final': []}
            for _, row in monomer_rows.iterrows():
                module = NRPSModule.parse(row)
                if module.is_start:
                    role = 'start'
                elif module.is_final:
                    role = 'final'
                else:
                    role = 'elongation'
                buckets[role].append(module)
            index[monomer] = buckets
        return index

    @staticmethod
    def _draw(candidates):
        """Return one randomly chosen element of ``candidates``."""
        return np.random.choice(candidates, 1)[0]

    def suggest_module(self, monomer: str, is_start=False, is_final=False):
        """Return a randomly chosen known module for ``monomer`` as a list of domain dicts.

        ``is_start`` and ``is_final`` select the dedicated start/final
        collections; ``is_start`` wins when both are set. When the dedicated
        collection is empty an elongation module is used instead: it is
        converted with ``to_start_module()`` for a start request and used
        as-is for a final request.

        Raises AssertionError when the monomer is unknown or when no
        suitable module is available.
        """
        assert monomer in self.monomer2modules_list, f"{monomer} is unknown"
        by_role = self.monomer2modules_list[monomer]

        if is_start:
            starters = by_role['start']
            if len(starters) > 0:
                return self._draw(starters).to_list()
            # No dedicated start modules: adapt an elongation module.
            fallback = by_role['elongation']
            assert len(fallback) > 0, f"{monomer} has no known information on starting modules"
            return self._draw(fallback).to_start_module().to_list()

        if is_final:
            finishers = by_role['final']
            if len(finishers) > 0:
                return self._draw(finishers).to_list()
            # No dedicated final modules: use an elongation module unchanged.
            fallback = by_role['elongation']
            assert len(fallback) > 0, f"{monomer} has no known information on final modules"
            return self._draw(fallback).to_list()

        middle = by_role['elongation']
        assert len(middle) > 0, f"{monomer} has no known information on elongation modules"
        return self._draw(middle).to_list()
|
|
|
|
|
|
|
|
|
# Shared module-level instance. NOTE: constructing it runs
# PseudoGenerationWrapper.__init__, which reads the modules CSV with
# pd.read_csv, so that load happens when this module is imported.
module_generator = PseudoGenerationWrapper()