biomed-multi-alignment

Runtime error

App Files Files Community

biomed-multi-alignment / mammal_demo /ppi_task.py

matanninio

spelling, wording, and a bug in the PPI prompt builder

c45ba32 11 months ago

raw

history blame

5.83 kB

	import gradio as gr
	import torch
	from mammal.keys import (
	CLS_PRED,
	ENCODER_INPUTS_ATTENTION_MASK,
	ENCODER_INPUTS_STR,
	ENCODER_INPUTS_TOKENS,
	)
	from mammal.model import Mammal

	from mammal_demo.demo_framework import MammalObjectBroker, MammalTask


	class PpiTask(MammalTask):
	    """Protein-Protein Interaction (PPI) demo task.

	    Builds the PPI prompt for two protein sequences, tokenizes it, runs
	    generation on a Mammal model, decodes the result and lays out the
	    Gradio widgets that drive those steps.
	    """

	    def __init__(self, model_dict):
	        """Register the task and set its description, examples and intro text.

	        Args:
	            model_dict: passed through unchanged to ``MammalTask.__init__``.
	        """
	        super().__init__(
	            name="Protein-Protein Interaction",
	            model_dict=model_dict,
	        )
	        self.description = "Protein-Protein Interaction (PPI)"
	        # Example sequences used as the initial values of the two input boxes.
	        self.examples = dict(
	            protein_calmodulin="MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMISELDQDGFIDKEDLHDGDGKISFEEFLNLVNKEMTADVDGDGQVNYEEFVTMMTSK",
	            protein_calcineurin="MSSKLLLAGLDIERVLAEKNFYKEWDTWIIEAMNVGDEEVDRIKEFKEDEIFEEAKTLGTAEMQEYKKQKLEEAIEGAFDIFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIRQMWDQNGDWDRIKELKFGEIKKLSAKDTRGTIFIKVFENLGTGVDSEYEDVSKYMLKHQ",
	        )
	        # Markdown rendered at the top of the task panel (see create_demo).
	        self.markup_text = f"""
	# Mammal based {self.description} demonstration

	Given two protein sequences, estimate if the proteins interact or not."""

	    @staticmethod
	    def positive_token_id(model_holder: MammalObjectBroker):
	        """Look up the tokenizer id of the ``<1>`` (positive class) token.

	        Args:
	            model_holder (MammalObjectBroker): object exposing ``tokenizer_op``

	        Returns:
	            whatever ``tokenizer_op.get_token_id("<1>")`` returns
	        """
	        tokenizer_op = model_holder.tokenizer_op
	        return tokenizer_op.get_token_id("<1>")

	    def generate_prompt(self, prot1, prot2):
	        """Assemble the PPI query string for a pair of protein sequences.

	        Args:
	            prot1 (str): sequence of protein number 1
	            prot2 (str): sequence of protein number 2

	        Returns:
	            str: task header followed by one entity section per protein,
	                terminated by ``<EOS>``
	        """
	        segments = [
	            "<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>",
	            "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>",
	            f"<SEQUENCE_NATURAL_START>{prot1}<SEQUENCE_NATURAL_END>",
	            "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>",
	            f"<SEQUENCE_NATURAL_START>{prot2}<SEQUENCE_NATURAL_END><EOS>",
	        ]
	        return "".join(segments)

	    def crate_sample_dict(self, sample_inputs: dict, model_holder: MammalObjectBroker):
	        """Turn raw inputs into a tokenized sample ready for ``run_model``.

	        Args:
	            sample_inputs (dict): keyword arguments for ``generate_prompt``
	                (``prot1`` and ``prot2``)
	            model_holder (MammalObjectBroker): object exposing ``tokenizer_op``

	        Returns:
	            the sample dict returned by the tokenizer, with the token ids
	            and the attention mask wrapped in ``torch.tensor``
	        """
	        # Start from a sample that holds only the prompt string.
	        sample_dict = {ENCODER_INPUTS_STR: self.generate_prompt(**sample_inputs)}

	        # The tokenizer reads the prompt and writes ids + attention mask.
	        sample_dict = model_holder.tokenizer_op(
	            sample_dict=sample_dict,
	            key_in=ENCODER_INPUTS_STR,
	            key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
	            key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
	        )

	        # Convert both tokenizer outputs to tensors, ids first.
	        for tensor_key in (ENCODER_INPUTS_TOKENS, ENCODER_INPUTS_ATTENTION_MASK):
	            sample_dict[tensor_key] = torch.tensor(sample_dict[tensor_key])
	        return sample_dict

	    def run_model(self, sample_dict, model: Mammal):
	        """Run generation on a single sample.

	        Args:
	            sample_dict: tokenized sample, as built by ``crate_sample_dict``
	            model (Mammal): model whose ``generate`` method is called

	        Returns:
	            the object returned by ``model.generate`` for a one-sample batch,
	            requested with scores and at most 5 new tokens
	        """
	        return model.generate(
	            [sample_dict],
	            output_scores=True,
	            return_dict_in_generate=True,
	            max_new_tokens=5,
	        )

	    def decode_output(self, batch_dict, model_holder: MammalObjectBroker):
	        """Extract the decoded text and the positive-token score.

	        Args:
	            batch_dict: output of ``run_model``
	            model_holder (MammalObjectBroker): object exposing ``tokenizer_op``

	        Returns:
	            tuple: (decoded text of the first ``CLS_PRED`` entry, score)
	        """
	        raw_tokenizer = model_holder.tokenizer_op._tokenizer
	        decoded_text = raw_tokenizer.decode(batch_dict[CLS_PRED][0])

	        # NOTE(review): [0][1] presumably selects sample 0 / output position 1
	        # of the scores - confirm against Mammal.generate's output layout.
	        score_row = batch_dict["model.out.scores"][0][1]
	        positive_score = score_row[self.positive_token_id(model_holder)].item()

	        return decoded_text, positive_score

	    def create_and_run_prompt(self, model_name, protein1, protein2):
	        """Callback of the run button: go from raw inputs to displayed outputs.

	        Args:
	            model_name: key into ``self.model_dict`` selecting the model holder
	            protein1 (str): sequence of protein number 1
	            protein2 (str): sequence of protein number 2

	        Returns:
	            tuple: the prompt followed by the values of ``decode_output``
	        """
	        holder = self.model_dict[model_name]
	        tokenized = self.crate_sample_dict(
	            sample_inputs=dict(prot1=protein1, prot2=protein2),
	            model_holder=holder,
	        )
	        prompt_text = tokenized[ENCODER_INPUTS_STR]
	        generated = self.run_model(sample_dict=tokenized, model=holder.model)
	        return (prompt_text, *self.decode_output(generated, model_holder=holder))

	    def create_demo(self, model_name_widget: gr.component):
	        """Build the Gradio group holding the PPI task widgets.

	        Args:
	            model_name_widget: component whose value is passed to
	                ``create_and_run_prompt`` as the model name

	        Returns:
	            the ``gr.Group`` containing the task UI, with ``visible`` set
	            to ``False``
	        """

	        def sequence_box(label, example_key):
	            # Editable three-line textbox pre-filled with an example sequence.
	            return gr.Textbox(
	                label=label,
	                interactive=True,
	                lines=3,
	                value=self.examples[example_key],
	            )

	        with gr.Group() as demo:
	            gr.Markdown(self.markup_text)

	            # Inputs: the two protein sequences.
	            with gr.Row():
	                first_seq_box = sequence_box(
	                    "Protein 1 sequence", "protein_calmodulin"
	                )
	                second_seq_box = sequence_box(
	                    "Protein 2 sequence", "protein_calcineurin"
	                )

	            with gr.Row():
	                run_button: gr.Button = gr.Button(
	                    "Run Mammal prompt for Protein-Protein Interaction",
	                    variant="primary",
	                )

	            # Outputs: the prompt sent to the model, then its answer and score.
	            with gr.Row():
	                prompt_view = gr.Textbox(label="Mammal prompt", lines=5)
	            with gr.Row():
	                answer_view = gr.Textbox(label="Mammal output")
	                score_view = gr.Number(label="PPI score")
	                run_button.click(
	                    fn=self.create_and_run_prompt,
	                    inputs=[model_name_widget, first_seq_box, second_seq_box],
	                    outputs=[prompt_view, answer_view, score_view],
	                )

	            # Legend explaining how to read the model output.
	            with gr.Row():
	                gr.Markdown(
	                    "```<SENTINEL_ID_0>``` contains the binding affinity class, which is ```<1>``` for interacting and ```<0>``` for non-interacting"
	                )

	        demo.visible = False
	        return demo