Spaces:
Sleeping
Sleeping
File size: 5,690 Bytes
f8080fc b93c8a7 19dfa7a b93c8a7 19dfa7a f8080fc 19dfa7a f8080fc 83811e8 f8080fc 19dfa7a f8080fc 19dfa7a b93c8a7 f8080fc b93c8a7 f8080fc 19dfa7a b93c8a7 19dfa7a b93c8a7 19dfa7a f8080fc 19dfa7a f8080fc c45ba32 f8080fc b93c8a7 f8080fc 19dfa7a b93c8a7 f8080fc b93c8a7 f8080fc b93c8a7 f8080fc b93c8a7 f8080fc b93c8a7 19dfa7a f8080fc b93c8a7 f8080fc 19dfa7a f8080fc 19dfa7a f8080fc b93c8a7 f8080fc b93c8a7 f8080fc 19dfa7a f8080fc f98cc68 f8080fc b93c8a7 19dfa7a f8080fc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
import gradio as gr
import torch
from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp
from mammal.keys import (
CLS_PRED,
ENCODER_INPUTS_ATTENTION_MASK,
ENCODER_INPUTS_STR,
ENCODER_INPUTS_TOKENS,
SCORES,
)
from mammal.model import Mammal
from mammal_demo.demo_framework import MammalObjectBroker, MammalTask
class PpiTask(MammalTask):
    """Demo task that asks a Mammal model whether two proteins interact.

    The task builds a binding-affinity-class prompt from two amino-acid
    sequences, tokenizes it, runs generation and reports the decoded output
    together with a score read from the generation scores.
    """

    def __init__(self, model_dict):
        """Register the task and set its description, examples and intro text.

        Args:
            model_dict: mapping from model name to the object holding that
                model and its tokenizer (looked up by name when a prompt is
                run).
        """
        super().__init__(name="Protein-Protein Interaction", model_dict=model_dict)
        self.description = "Protein-Protein Interaction (PPI)"
        # Default sequences pre-filled into the two input text boxes.
        self.examples = {
            "protein_calmodulin": "MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMISELDQDGFIDKEDLHDGDGKISFEEFLNLVNKEMTADVDGDGQVNYEEFVTMMTSK",
            "protein_calcineurin": "MSSKLLLAGLDIERVLAEKNFYKEWDTWIIEAMNVGDEEVDRIKEFKEDEIFEEAKTLGTAEMQEYKKQKLEEAIEGAFDIFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIRQMWDQNGDWDRIKELKFGEIKKLSAKDTRGTIFIKVFENLGTGVDSEYEDVSKYMLKHQ",
        }
        # Markdown shown at the top of the demo panel.
        self.markup_text = (
            "\n"
            f"# Mammal based {self.description} demonstration\n"
            "Given two protein sequences, estimate if the proteins interact or not."
        )

    def generate_prompt(self, protein_seq_1, protein_seq_2):
        """Format the two sequences into a prompt matching the pre-training syntax.

        Args:
            protein_seq_1 (str): sequence of protein number 1
            protein_seq_2 (str): sequence of protein number 2

        Returns:
            str: prompt
        """
        entity_header = "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
        pieces = ["<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"]
        # Both proteins are wrapped in the same entity/sequence markers.
        for sequence in (protein_seq_1, protein_seq_2):
            pieces.append(
                f"{entity_header}<SEQUENCE_NATURAL_START>{sequence}<SEQUENCE_NATURAL_END>"
            )
        pieces.append("<EOS>")
        return "".join(pieces)

    def crate_sample_dict(self, sample_inputs: dict, model_holder: MammalObjectBroker):
        """Build a tokenized sample dictionary for one pair of proteins.

        Args:
            sample_inputs: keyword arguments forwarded to ``generate_prompt``.
            model_holder: object exposing the ``tokenizer_op`` used to
                tokenize the prompt.

        Returns:
            The sample dictionary holding the prompt string plus the token ids
            and attention mask, both converted to ``torch`` tensors.
        """
        # Start from a dictionary that only holds the prompt string.
        sample = {ENCODER_INPUTS_STR: self.generate_prompt(**sample_inputs)}

        # Tokenize the prompt; the op returns the dictionary it filled in.
        sample = model_holder.tokenizer_op(
            sample_dict=sample,
            key_in=ENCODER_INPUTS_STR,
            key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
            key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
        )

        # Convert the tokenizer outputs to tensors.
        for key in (ENCODER_INPUTS_TOKENS, ENCODER_INPUTS_ATTENTION_MASK):
            sample[key] = torch.tensor(sample[key])
        return sample

    def run_model(self, sample_dict, model: Mammal):
        """Run generation on a single sample and return the model's batch output.

        Args:
            sample_dict: tokenized sample as produced by ``crate_sample_dict``.
            model: the Mammal model used for generation.

        Returns:
            Whatever ``model.generate`` returns for a one-sample batch.
        """
        generation_options = {
            "output_scores": True,
            "return_dict_in_generate": True,
            "max_new_tokens": 5,
        }
        return model.generate([sample_dict], **generation_options)

    def decode_output(self, batch_dict, tokenizer_op: ModularTokenizerOp) -> list:
        """Turn the generation output into a decoded string and a score.

        Args:
            batch_dict: output of ``run_model``.
            tokenizer_op: tokenizer operator used to decode the prediction and
                to look up the positive token id.

        Returns:
            list: ``[decoded_text, score]``.
        """
        decoded_text = tokenizer_op._tokenizer.decode(batch_dict[CLS_PRED][0])
        # NOTE(review): presumably index 0 selects the only sample and index 1
        # the decoding step that carries the class token - confirm.
        step_scores = batch_dict[SCORES][0][1]
        # positive_token_id is not defined in this class; presumably inherited
        # from MammalTask.
        score = step_scores[self.positive_token_id(tokenizer_op)].item()
        return [decoded_text, score]

    def create_and_run_prompt(self, model_name, protein_seq_1, protein_seq_2):
        """Run the full pipeline for one pair of sequences.

        Args:
            model_name: key into ``self.model_dict`` selecting the model holder.
            protein_seq_1: sequence of protein number 1.
            protein_seq_2: sequence of protein number 2.

        Returns:
            tuple: the prompt followed by the items of ``decode_output``.
        """
        holder = self.model_dict[model_name]
        sample = self.crate_sample_dict(
            sample_inputs={
                "protein_seq_1": protein_seq_1,
                "protein_seq_2": protein_seq_2,
            },
            model_holder=holder,
        )
        prompt_text = sample[ENCODER_INPUTS_STR]
        model_output = self.run_model(sample_dict=sample, model=holder.model)
        decoded_items = self.decode_output(
            model_output, tokenizer_op=holder.tokenizer_op
        )
        return (prompt_text, *decoded_items)

    def create_demo(self, model_name_widget: gr.component):
        """Build the (initially hidden) Gradio panel for this task.

        Args:
            model_name_widget: Gradio component whose value is passed as the
                model name when the run button is clicked.

        Returns:
            The ``gr.Group`` containing the panel, with ``visible`` set to False.
        """

        def sequence_box(label, example_key):
            # Editable three-line text box pre-filled with an example sequence.
            return gr.Textbox(
                label=label,
                interactive=True,
                lines=3,
                value=self.examples[example_key],
            )

        with gr.Group() as panel:
            gr.Markdown(self.markup_text)
            with gr.Row():
                first_sequence = sequence_box(
                    "Protein 1 sequence", "protein_calmodulin"
                )
                second_sequence = sequence_box(
                    "Protein 2 sequence", "protein_calcineurin"
                )
            with gr.Row():
                run_button = gr.Button(
                    "Run Mammal prompt for Protein-Protein Interaction",
                    variant="primary",
                )
            with gr.Row():
                prompt_output = gr.Textbox(label="Mammal prompt", lines=5)
            with gr.Row():
                decoded_output = gr.Textbox(label="Mammal output")
                score_output = gr.Number(label="PPI score")
                # Clicking the button runs the pipeline and fills the three outputs.
                run_button.click(
                    fn=self.create_and_run_prompt,
                    inputs=[model_name_widget, first_sequence, second_sequence],
                    outputs=[prompt_output, decoded_output, score_output],
                )
            with gr.Row():
                gr.Markdown(
                    "```<SENTINEL_ID_0>``` contains the binding affinity class, which is ```<1>``` for interacting and ```<0>``` for non-interacting"
                )
        # The panel starts hidden.
        panel.visible = False
        return panel
|