Spaces:

Agents-MCP-Hackathon
/

biocynthia-demo

Sleeping

App Files Files Community

biocynthia-demo / data_utils.py

latticetower

fix typos

ca699c2 19 days ago

raw

history blame contribute delete

5.61 kB

	"""Temporary placeholder to load
	"""
	from dataclasses import dataclass
	from typing import List

	import pandas as pd
	import numpy as np


	@dataclass
	class Domain:
	    """A single named domain together with its sequence string."""

	    name: str
	    sequence: str

	    def to_dict(self):
	        """Return the domain as a plain dict with 'name' and 'sequence' keys."""
	        return dict(name=self.name, sequence=self.sequence)


	# Domain columns looked up for every module row. NRPSModule.parse walks this
	# list in order, so it also fixes the order of a parsed module's domains.
	DOMAIN_NAMES = [
	    'Condensation',
	    'Heterocyclization',
	    'AMP-binding',
	    'PCP',
	    'Epimerization',
	    'Thioesterase',
	]

	@dataclass
	class NRPSModule:
	    """One NRPS module: an identifier plus the domains found in it.

	    ``is_start`` and ``is_final`` record how the module was classified by
	    ``parse``; both default to False when the class is built directly.
	    """

	    module_id: int
	    domains_list: List[Domain]
	    is_start: bool = False
	    is_final: bool = False

	    @classmethod
	    def parse(cls, row):
	        """Build a module from a row indexable by the names in DOMAIN_NAMES.

	        Only columns whose value is a ``str`` become domains; any other
	        value is skipped (presumably NaN marks an absent domain -- confirm
	        against the dataset). The module is a start module unless it holds
	        a Condensation or Heterocyclization domain, and a final module when
	        it holds a Thioesterase domain. ``row.module_id`` is the identifier.
	        """
	        found = []
	        for name in DOMAIN_NAMES:
	            value = row[name]
	            if not isinstance(value, str):
	                continue
	            found.append(Domain(name, value))

	        present = {domain.name for domain in found}
	        # Either of these domains marks an elongation (non-start) module.
	        has_elongation_domain = bool(present & {'Condensation', 'Heterocyclization'})
	        return cls(
	            row.module_id,
	            found,
	            not has_elongation_domain,
	            'Thioesterase' in present,
	        )

	    def to_list(self):
	        """Return the domains as a list of dicts, one ``to_dict()`` per domain."""
	        return [entry.to_dict() for entry in self.domains_list]

	    def to_start_module(self):
	        """Return a start-module version of this module.

	        A module already flagged ``is_start`` is returned as-is (same
	        object). Otherwise a new module is built with the same id and the
	        Condensation, Heterocyclization and Thioesterase domains removed.
	        """
	        if self.is_start:
	            return self
	        dropped = ('Condensation', 'Heterocyclization', 'Thioesterase')
	        kept = [domain for domain in self.domains_list if domain.name not in dropped]
	        return NRPSModule(self.module_id, kept, is_start=True, is_final=False)


	def load_monomers(filename: str = "./data/monomers_list.txt") -> list[str]:
	    """
	    Read monomer names from a text file, one name per line.

	    Surrounding whitespace is stripped from every line and lines that are
	    empty after stripping are skipped. File order is preserved.
	    """
	    with open(filename, 'r') as handle:
	        stripped_lines = (raw.strip() for raw in handle)
	        return [name for name in stripped_lines if name]



	# Login using e.g. `huggingface-cli login` to access this dataset
	class PseudoGenerationWrapper:
	    """Suggest NRPS modules for a monomer by random lookup.

	    Let's start with the wrapper which picks one of the modules randomly
	    from the predefined collection. Nothing is generated: the modules table
	    is grouped by monomer and split into start / elongation / final lists
	    once, and ``suggest_module`` samples from those lists.
	    """

	    def __init__(
	        self,
	        dataset_path: str = "hf://datasets/latticetower/nrps_modules_asdb4.0/nrps_modules_info_cleaned.csv",
	    ) -> None:
	        """Read the modules table and index it by monomer.

	        Args:
	            dataset_path: location handed to ``pandas.read_csv``. The
	                default is the dataset that used to be hard-coded here, so
	                ``PseudoGenerationWrapper()`` behaves as before.
	        """
	        nrp_modules_df = pd.read_csv(dataset_path)
	        self.monomer2modules_list = self.prepare_data(nrp_modules_df)

	    def prepare_data(self, df: pd.DataFrame) -> dict:
	        """Group parsed modules by monomer and by role.

	        Args:
	            df: table with a ``monomer_name`` column plus whatever
	                ``NRPSModule.parse`` reads from each row.

	        Returns:
	            ``{monomer: {'start': [...], 'elongation': [...], 'final': [...]}}``
	            where each list holds ``NRPSModule`` objects. A module flagged
	            both start and final is filed under 'start'.
	        """
	        monomer2modules = {}
	        for monomer, monomer_rows in df.groupby("monomer_name"):
	            # Let's suppose that module composition may differ in ways other
	            # than the presence/absence of specific domains (not yet known),
	            # so the three roles are kept in separate lists.
	            buckets = {'start': [], 'elongation': [], 'final': []}
	            for _, row in monomer_rows.iterrows():
	                nrps_module = NRPSModule.parse(row)
	                if nrps_module.is_start:
	                    role = 'start'
	                elif nrps_module.is_final:
	                    role = 'final'
	                else:
	                    role = 'elongation'
	                buckets[role].append(nrps_module)
	            monomer2modules[monomer] = buckets
	        return monomer2modules

	    @staticmethod
	    def _pick_module(candidates, monomer, role):
	        """Return one random entry of *candidates*; fail if the list is empty."""
	        if not candidates:
	            # Raised explicitly (not via `assert`) so the check survives
	            # `python -O`; the type stays AssertionError for existing callers.
	            raise AssertionError(f"{monomer} has no known information on {role} modules")
	        return np.random.choice(candidates, 1)[0]

	    def suggest_module(self, monomer: str, is_start=False, is_final=False):
	        """Return a randomly chosen known module for *monomer*.

	        This is a deliberate shortcut: no module is generated, one is simply
	        drawn from the known ones. ``is_start`` and ``is_final`` account for
	        the specific structure of start and final modules; ``is_start``
	        wins when both are set.

	        Args:
	            monomer: key into the monomer index built by ``prepare_data``.
	            is_start: ask for a start module. Falls back to a random
	                elongation module converted with ``to_start_module()``.
	            is_final: ask for a final module. Falls back to a random
	                elongation module returned unchanged.

	        Returns:
	            The chosen module's ``to_list()`` result.

	        Raises:
	            AssertionError: *monomer* is unknown, or neither the requested
	                role nor the elongation fallback has any module.
	        """
	        if monomer not in self.monomer2modules_list:
	            # Explicit raise instead of `assert`: see _pick_module.
	            raise AssertionError(f"{monomer} is unknown")
	        monomer_dict = self.monomer2modules_list[monomer]

	        # In fact the following might simply be generated (e.g. omit the
	        # condensation domain and call the result a starting module), but for
	        # simplicity start, elongation and final parts are distinguished.
	        # Peptides made of a single module are not handled here.
	        if is_start:
	            if monomer_dict['start']:
	                return self._pick_module(monomer_dict['start'], monomer, "starting").to_list()
	            fallback = self._pick_module(monomer_dict['elongation'], monomer, "starting")
	            return fallback.to_start_module().to_list()

	        if is_final:
	            if monomer_dict['final']:
	                return self._pick_module(monomer_dict['final'], monomer, "final").to_list()
	            return self._pick_module(monomer_dict['elongation'], monomer, "final").to_list()

	        return self._pick_module(monomer_dict['elongation'], monomer, "elongation").to_list()




	# Shared module-level instance. Constructing it calls pd.read_csv on the
	# hf:// dataset path in PseudoGenerationWrapper.__init__, so the dataset is
	# read as soon as this module is imported.
	module_generator = PseudoGenerationWrapper()