import os
from functools import partial

import gradio as gr
import spaces  # Hugging Face Spaces helper; must be imported for GPU Spaces
import torch
from transformers import pipeline

model_name = "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
token = os.environ['hf_token']

# Build the generation pipeline once at startup; the token is passed in case
# the model repo requires authentication.
pipe = pipeline("text-generation", model=model_name, device="cuda", token=token)
generate_kwargs = {'max_new_tokens': 20}
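# Note: loading a GPTQ checkpoint through transformers typically requires the
# optimum and auto-gptq packages to be installed as well (an assumption about
# this Space's requirements, which are not shown here).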
system_prompt = '''You are given a partial input text for a chat interface. Propose auto-completions for the text. You have several roles:
- Fight under-specification.
- Complete text to save the user time.

Don't suggest anything if there are no good suggestions.
Make sure the suggestions are valid completions of the text! They do not need to complete the text fully.
Suggest only up to 5 words ahead. The scheme of your answer should be "answer1;answer2;answer3" (return between 0 to 4 answers).
Answers should be only the completions themselves.

Examples:
(1)
User: "Help me write a sentiment analysis pipeline"
Assistant: "using huggingface;using NLTK;using python"
(2)
User: "My name is"
Assistant: "" (nothing much to contribute at this point. return nothing)
(3)
User: "Help me find a present for my"
Assistant: "girlfriend;mother;father;friend"

You will now get a blank message from the user, and after your answer the user will give you the text to complete.
'''
start_messages = [
    {'role': 'system', 'content': system_prompt},
    {'role': 'user', 'content': ' '},
    {'role': 'assistant', 'content': '<Waiting for text>'}
]
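
# The blank user turn and the fixed '<Waiting for text>' reply make the
# conversation prefix identical for every request, so its key/value cache
# can be computed once and reused.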

torch.set_grad_enabled(False)  # inference only; autograd is never needed here

def past_kv_to_device(past_kv, device):
    # Legacy past_key_values format: a tuple of (key, value) tensor pairs,
    # one per layer, so iterate over the pairs directly (not over .items()).
    return tuple((k.to(device).detach(), v.to(device).detach()) for k, v in past_kv)
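
# The cache is stored on the CPU between requests to free GPU memory, and is
# copied back to the model's device for each generation call.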

def get_past_key_values(system_prompt):
    """Precompute the key/value cache for the fixed conversation prefix."""
    model, tokenizer = pipe.model, pipe.tokenizer
    tokenized = tokenizer.apply_chat_template(start_messages, return_tensors='pt')

    # Sanity check: the tokenized prefix must be a strict prefix of any full
    # conversation, otherwise the cached key/values would be invalid.
    test_messages = [*start_messages, {'role': 'user', 'content': 'Hello World!'}]
    tokenized_test = tokenizer.apply_chat_template(test_messages, return_tensors='pt')
    assert (tokenized_test[:, :tokenized.shape[1]] == tokenized).all().cpu().item()

    past_key_values = model(tokenized.to(model.device)).past_key_values
    return past_kv_to_device(past_key_values, 'cpu')
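
# A quick shape check (a sketch, not part of the app; the exact layer/head
# counts depend on the model):
#
#   pkv = get_past_key_values(system_prompt)
#   print(len(pkv), pkv[0][0].shape)  # (num_layers, [batch, heads, seq, head_dim])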

def generate(text, past_key_values):
    messages = [
        *start_messages,
        {'role': 'user', 'content': text}
    ]
    # Move the cached prefix back to the model's device and forward it to
    # generation, so only the new user turn needs to be prefilled.
    response = pipe(messages,
                    past_key_values=past_kv_to_device(past_key_values, pipe.model.device),
                    **generate_kwargs)[0]['generated_text']
    return response[-1]['content']
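
# The prompt asks for answers in an "answer1;answer2;answer3" scheme, so a
# small parser is handy on the consuming side. This helper is an assumption,
# not part of the original Space, and its name is hypothetical:
def parse_suggestions(response_text):
    """Split the model's ';'-separated answer into individual suggestions."""
    return [s.strip() for s in response_text.split(';') if s.strip()]

# e.g. parse_suggestions("girlfriend;mother;father") -> ['girlfriend', 'mother', 'father']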

if __name__ == "__main__":
    past_key_values = get_past_key_values(system_prompt)
    demo = gr.Interface(partial(generate, past_key_values=past_key_values),
                        inputs="textbox", outputs="textbox")
    demo.launch()
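
# Querying the running Space from another machine (a sketch; the Space id is
# hypothetical and gradio_client must be installed):
#
#   from gradio_client import Client
#   client = Client("user/space-name")
#   print(client.predict("Help me find a present for my", api_name="/predict"))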