biocynthia-demo / app.py
latticetower's picture
fix typos
ca699c2
from typing import List, Dict
import numpy as np
import gradio as gr
import data_utils
def smiles2monomers(smiles: str) -> list[str]:
    """Split a molecule given as SMILES into its monomers (placeholder).

    The decomposition is not implemented yet: the argument is ignored and a
    new empty list is returned on every call.

    Args:
        smiles: SMILES string of the molecule (currently unused).

    Returns:
        An empty list; eventually one SMILES string per monomer.
    """
    return list()
def generate_monomers(num_monomers: int, monomers_vocab: List[str]) -> list[str]:
    """Draw a random sequence of monomers from a vocabulary.

    Elements are picked with ``np.random.choice`` using its default settings,
    so the same monomer may appear more than once in the result.

    Args:
        num_monomers: How many monomers to draw.
        monomers_vocab: The monomers to choose from.

    Returns:
        A plain Python list with ``num_monomers`` entries taken from
        ``monomers_vocab``.
    """
    # np.random.choice returns an ndarray; convert it so the result matches
    # the declared ``list[str]`` return type (plain ``str`` items, not np.str_).
    return np.random.choice(monomers_vocab, num_monomers).tolist()
def monomer2domains(monomer: str, is_start=False, is_final=False) -> list[str]:
    """Return the NRPS module suggested for a single monomer.

    This is a thin wrapper: the work is done by
    ``data_utils.module_generator.suggest_module`` and its result is returned
    unchanged.

    Args:
        monomer: The monomer to look up.
        is_start: Forwarded to ``suggest_module``; callers set it for the
            first monomer of a sequence.
        is_final: Forwarded to ``suggest_module``; callers set it for the
            last monomer of a sequence.

    Returns:
        The module produced by ``suggest_module``.
        NOTE(review): ``convert_to_fasta`` reads ``'name'`` and ``'sequence'``
        keys from each element, so the ``list[str]`` annotation looks
        outdated -- confirm against ``data_utils``.
    """
    generator = data_utils.module_generator
    return generator.suggest_module(monomer, is_start=is_start, is_final=is_final)
def monomers2modules(monomer_list: list[str], is_cyclic: bool=False) -> List[List[Dict[str, str]]]:
    """Build one NRPS module (a list of domains) per monomer.

    The first monomer is converted with ``is_start=True`` and the last one
    with ``is_final=True``; for a single-monomer sequence both flags are set.

    Args:
        monomer_list: Monomers in peptide order.
        is_cyclic: Currently ignored, because it is not yet clear how
            cyclicity is encoded in the NRPS domain sequence.

    Returns:
        A list with one entry per monomer, each being the result of
        ``monomer2domains`` for that monomer.
    """
    # enumerate() indices stop at len - 1, so the final element must be
    # compared against that value (comparing with len never matches).
    last_index = len(monomer_list) - 1
    return [
        monomer2domains(
            monomer,
            is_start=(index == 0),
            is_final=(index == last_index),
        )
        for index, monomer in enumerate(monomer_list)
    ]
# def find_bacteria(monomers: list[str]) -> list[str]:
# """
# Finds bacteria which might produce the target peptide.
# Input: sequence of possible domains. Each domain is represented as a protein sequence.
# Output: possible hits from blastp search.
# """
# return []
# def letter_counter(word, letter):
# """Count the occurrences of a specific letter in a word.
# Args:
# word: The word or phrase to analyze
# letter: The letter to count occurrences of
# Returns:
# The number of times the letter appears in the word
# """
# return word.lower().count(letter.lower())
def convert_to_fasta(modules_list):
    """Render modules and their domains as FASTA-formatted text.

    Every domain becomes a two-line record: a header of the form
    ``>module_XX_domain_YY_<name>`` (zero-padded, zero-based positions)
    followed by the domain's sequence.

    Args:
        modules_list: Iterable of modules; each module is an iterable of
            mappings with ``'name'`` and ``'sequence'`` entries.

    Returns:
        All records joined by newlines, without a trailing newline. An empty
        input yields an empty string.
    """
    def _fasta_lines():
        # Emit header and sequence lines in module order, then domain order.
        for module_pos, domains in enumerate(modules_list):
            for domain_pos, domain in enumerate(domains):
                label = domain['name']
                residues = domain['sequence']
                yield f">module_{module_pos:02d}_domain_{domain_pos:02d}_{label}"
                yield residues

    return '\n'.join(_fasta_lines())
def generate_peptide_monomers(num_monomers: int):
    """Generates a peptide made of the requested number of monomer fragments.

    For now the monomers are drawn at random from the predefined collection
    returned by data_utils.load_monomers() (amino acids and their D- isomers).

    Args:
        num_monomers: The number of monomer fragments in the resulting 'peptide'
    Returns:
        A pair of strings: the chosen monomer fragments separated by commas,
        and the corresponding domain sequences in .fasta format (intended for
        later blastp searches).
    """
    vocabulary = data_utils.load_monomers()
    peptide = generate_monomers(num_monomers, vocabulary)
    modules = monomers2modules(peptide)
    peptide_text = ",".join(peptide)
    return peptide_text, convert_to_fasta(modules)
if __name__ == "__main__":
    # Static page texts; the lines inside the triple-quoted string stay at
    # column 0 so the rendered Markdown is exactly what the literal contains.
    header_md = """# BioCynthia
```
There are bacteria in soil and sea
They have what is called a B-G-C
These genes produce some complex peptides
And they might save our lives!
```
"""
    readme_hint = "For more details on project goals and motivation, please refer to the README.md"

    with gr.Blocks(title="NRPS domains 'generator'") as demo:
        gr.Markdown(header_md)
        gr.Markdown(readme_hint)

        # Single input: how many monomers the generated peptide should have.
        monomer_count_slider = gr.Slider(
            label="Number of monomers in the target peptide",
            minimum=2,
            maximum=10,
            step=1,
            value=3,
        )

        # Two text outputs: the comma-separated monomers and the FASTA text.
        gr.Interface(
            fn=generate_peptide_monomers,
            inputs=[monomer_count_slider],
            outputs=["text", "text"],
        )

    # mcp_server=True asks Gradio to expose the app over MCP as well as
    # serving the web UI (see the Gradio MCP documentation).
    demo.launch(mcp_server=True)