import os
from functools import partial

import gradio as gr
import spaces  # Hugging Face Spaces helper; must be imported for GPU Spaces
import torch
from transformers import pipeline

model_name = "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
token = os.environ['hf_token']

# Build the generation pipeline once at startup; the token is passed in case
# the model repo requires authentication.
pipe = pipeline("text-generation", model=model_name, device="cuda", token=token)
generate_kwargs = {'max_new_tokens': 20}
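# Note: loading a GPTQ checkpoint through transformers typically requires the
# optimum and auto-gptq packages to be installed as well (an assumption about
# this Space's requirements, which are not shown here).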
system_prompt = '''You are given a partial input text for a chat interface. Propose auto-completions for the text. You have several roles:
- Fight under-specification.
- Complete text to save the user time.

Don't suggest anything if there are no good suggestions.
Make sure the suggestions are valid completions of the text! They do not need to complete the text fully.
Suggest only up to 5 words ahead. The scheme of your answer should be "answer1;answer2;answer3" (return between 0 to 4 answers).
Answers should be only the completions themselves.

Examples:
(1)
User: "Help me write a sentiment analysis pipeline"
Assistant: "using huggingface;using NLTK;using python"
(2)
User: "My name is"
Assistant: "" (nothing much to contribute at this point. return nothing)
(3)
User: "Help me find a present for my"
Assistant: "girlfriend;mother;father;friend"

You will now get a blank message from the user, and after your answer the user will give you the text to complete.
'''
start_messages = [
    {'role': 'system', 'content': system_prompt},
    {'role': 'user', 'content': ' '},
    {'role': 'assistant', 'content': '<Waiting for text>'}
]
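
# The blank user turn and the fixed '<Waiting for text>' reply make the
# conversation prefix identical for every request, so its key/value cache
# can be computed once and reused.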

torch.set_grad_enabled(False)  # inference only; autograd is never needed here

def past_kv_to_device(past_kv, device):
    # Legacy past_key_values format: a tuple of (key, value) tensor pairs,
    # one per layer, so iterate over the pairs directly (not over .items()).
    return tuple((k.to(device).detach(), v.to(device).detach()) for k, v in past_kv)
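
# The cache is stored on the CPU between requests to free GPU memory, and is
# copied back to the model's device for each generation call.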

def get_past_key_values(system_prompt):
    """Precompute the key/value cache for the fixed conversation prefix."""
    model, tokenizer = pipe.model, pipe.tokenizer
    tokenized = tokenizer.apply_chat_template(start_messages, return_tensors='pt')

    # Sanity check: the tokenized prefix must be a strict prefix of any full
    # conversation, otherwise the cached key/values would be invalid.
    test_messages = [*start_messages, {'role': 'user', 'content': 'Hello World!'}]
    tokenized_test = tokenizer.apply_chat_template(test_messages, return_tensors='pt')
    assert (tokenized_test[:, :tokenized.shape[1]] == tokenized).all().cpu().item()

    past_key_values = model(tokenized.to(model.device)).past_key_values
    return past_kv_to_device(past_key_values, 'cpu')
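
# A quick shape check (a sketch, not part of the app; the exact layer/head
# counts depend on the model):
#
#   pkv = get_past_key_values(system_prompt)
#   print(len(pkv), pkv[0][0].shape)  # (num_layers, [batch, heads, seq, head_dim])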

def generate(text, past_key_values):
    messages = [
        *start_messages,
        {'role': 'user', 'content': text}
    ]
    # Move the cached prefix back to the model's device and forward it to
    # generation, so only the new user turn needs to be prefilled.
    response = pipe(messages,
                    past_key_values=past_kv_to_device(past_key_values, pipe.model.device),
                    **generate_kwargs)[0]['generated_text']
    return response[-1]['content']
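
# The prompt asks for answers in an "answer1;answer2;answer3" scheme, so a
# small parser is handy on the consuming side. This helper is an assumption,
# not part of the original Space, and its name is hypothetical:
def parse_suggestions(response_text):
    """Split the model's ';'-separated answer into individual suggestions."""
    return [s.strip() for s in response_text.split(';') if s.strip()]

# e.g. parse_suggestions("girlfriend;mother;father") -> ['girlfriend', 'mother', 'father']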

if __name__ == "__main__":
    past_key_values = get_past_key_values(system_prompt)
    demo = gr.Interface(partial(generate, past_key_values=past_key_values),
                        inputs="textbox", outputs="textbox")
    demo.launch()
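
# Querying the running Space from another machine (a sketch; the Space id is
# hypothetical and gradio_client must be installed):
#
#   from gradio_client import Client
#   client = Client("user/space-name")
#   print(client.predict("Help me find a present for my", api_name="/predict"))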