|
from typing import List, Dict |
|
import numpy as np |
|
import gradio as gr |
|
|
|
import data_utils |
|
|
|
|
|
|
|
def smiles2monomers(smiles: str) -> list[str]:
    """Split a molecule given as SMILES into its monomers (each also SMILES).

    Placeholder: the decomposition is not implemented yet, so the input is
    ignored and an empty list is always returned.

    Args:
        smiles: SMILES string of the molecule (currently unused).

    Returns:
        An empty list.
    """
    monomers: list[str] = []
    return monomers
|
|
|
|
|
def generate_monomers(num_monomers: int, monomers_vocab: List[str]) -> list[str]:
    """Pick a random sequence of monomers from a vocabulary.

    Monomers are drawn with ``np.random.choice`` using its defaults, i.e.
    uniformly and with replacement, so the same monomer may appear more than
    once in the result.

    Args:
        num_monomers: Number of monomers to draw.
        monomers_vocab: Candidate monomers to draw from.

    Returns:
        A list with ``num_monomers`` entries taken from ``monomers_vocab``.

    Raises:
        ValueError: Propagated from NumPy, e.g. when ``monomers_vocab`` is
            empty while ``num_monomers`` is positive.
    """
    # np.random.choice returns an ndarray of NumPy scalars; convert it so the
    # function really returns a list of plain Python values, as annotated.
    return np.random.choice(monomers_vocab, num_monomers).tolist()
|
|
|
|
|
def monomer2domains(monomer: str, is_start: bool = False, is_final: bool = False) -> List[Dict[str, str]]:
    """Look up the NRPS module (list of domains) suggested for one monomer.

    This is a thin wrapper around
    ``data_utils.module_generator.suggest_module``; its result is returned
    unchanged.

    Args:
        monomer: The monomer to find a module for.
        is_start: Whether the monomer is the first one in the peptide.
        is_final: Whether the monomer is the last one in the peptide.

    Returns:
        Whatever ``suggest_module`` returns. Other code in this file treats
        it as a list of domains, each a dict with ``'name'`` and
        ``'sequence'`` keys -- NOTE(review): confirm against ``data_utils``.
    """
    return data_utils.module_generator.suggest_module(
        monomer,
        is_start=is_start,
        is_final=is_final,
    )
|
|
|
|
|
def monomers2modules(monomer_list: list[str], is_cyclic: bool=False) -> List[List[Dict[str, str]]]:
    """Map every monomer of a peptide to its NRPS module.

    Each monomer is passed to ``monomer2domains`` together with flags telling
    whether it is the first and/or the last monomer of the sequence. For a
    single-monomer sequence both flags are set.

    Args:
        monomer_list: Monomers in peptide order.
        is_cyclic: Currently ignored; it is not yet known how cyclicity is
            encoded in the NRPS domain sequence.

    Returns:
        One entry per monomer, in input order, each being the result of
        ``monomer2domains`` for that monomer. Empty input gives an empty list.
    """
    # The last monomer sits at index len - 1; comparing against len itself
    # would never match and the final module would never be flagged.
    last_index = len(monomer_list) - 1
    return [
        monomer2domains(
            monomer,
            is_start=(index == 0),
            is_final=(index == last_index),
        )
        for index, monomer in enumerate(monomer_list)
    ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def convert_to_fasta(modules_list):
    """Render modules and their domains as FASTA-formatted text.

    Every domain contributes two lines: a header of the form
    ``>module_XX_domain_YY_<name>`` (zero-padded, zero-based positions) and
    the domain's sequence.

    Args:
        modules_list: Iterable of modules; each module is an iterable of
            mappings providing ``'name'`` and ``'sequence'``.

    Returns:
        All lines joined with newlines, without a trailing newline. Empty
        input gives an empty string.
    """
    records = []
    for module_idx, domains in enumerate(modules_list):
        for domain_idx, domain in enumerate(domains):
            domain_name, residues = domain['name'], domain['sequence']
            header = f">module_{module_idx:02d}_domain_{domain_idx:02d}_{domain_name}"
            records += [header, residues]
    return '\n'.join(records)
|
|
|
|
|
def generate_peptide_monomers(num_monomers: int):
    """Generate a random peptide and the domain sequences of its NRPS modules.

    The monomers are drawn at random from the vocabulary returned by
    ``data_utils.load_monomers()``; each one is then mapped to a module and
    the modules are rendered as FASTA text (intended for later blastp
    searches).

    Args:
        num_monomers: The number of monomer fragments in the resulting
            'peptide'.

    Returns:
        A pair ``(peptide, fasta)``: the chosen monomers joined by commas,
        and the FASTA text with the corresponding domain sequences.
    """
    vocabulary = data_utils.load_monomers()
    chosen = generate_monomers(num_monomers, vocabulary)
    modules = monomers2modules(chosen)

    peptide = ",".join(chosen)
    fasta = convert_to_fasta(modules)
    return peptide, fasta
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Build and serve the Gradio UI only when this file is run as a script,
    # so importing the module has no UI side effects.
    with gr.Blocks(title="NRPS domains 'generator'") as demo:
        # Page heading followed by a short verse in a fenced code block.
        gr.Markdown("""# BioCynthia
```
There are bacteria in soil and sea
They have what is called a B-G-C
These genes produce some complex peptides
And they might save our lives!
```
""")

        gr.Markdown("For more details on project goals and motivation, please refer to the README.md")

        # The slider value is the single input of the interface below, i.e.
        # it is passed to generate_peptide_monomers as num_monomers.
        frequency_slider = gr.Slider(
            minimum=2,
            maximum=10,
            step=1,
            value=3,
            label="Number of monomers in the target peptide"
        )
        # Two text outputs match the (peptide, fasta) pair returned by
        # generate_peptide_monomers.
        gr.Interface(
            fn=generate_peptide_monomers,
            inputs=[frequency_slider],
            outputs=["text", "text"],
        )

    # mcp_server=True additionally asks Gradio to expose the app through its
    # MCP server support -- see the Gradio launch() documentation.
    demo.launch(mcp_server=True)
|
|