"""Gradio demo that proposes short auto-completions for chat input text.

A Hugging Face ``text-generation`` pipeline is created once at import time
and wrapped in a minimal textbox-to-textbox Gradio interface.
"""

import os

import gradio as gr
import spaces
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# NOTE(review): this repository appears to host GGUF-quantized weights;
# transformers normally needs an explicit ``gguf_file`` to load those --
# confirm that the pipeline can actually load this checkpoint as-is.
model_name = "TheBloke/OpenHermes-2.5-Mistral-7B-GGUF"

# Optional Hugging Face access token. ``.get`` keeps a missing variable from
# crashing the import; ``None`` means anonymous access to the Hub.
token = os.environ.get('hf_token')

pipe = pipeline(
    "text-generation",
    model=model_name,
    device="cuda",
    token=token,
)

# Generation settings forwarded to the pipeline on every call.
generate_kwargs = {'max_new_tokens': 20}

system_prompt = '''You are given an input text for a chat interface. Propose auto-completion to the text. You have several roles:
- Fight under-specification: if the user does not provide sufficient context, propose them a set of relevant suggestions.
- Complete text: The text provided to you is in the making. If you have a good idea for how to complete - make suggestions.

Make sure the suggestions are valid completions of the text! No need for them to complete the text completely.
Suggest only up to 5 words ahead.
'''


def _extract_completion(outputs):
    """Return the generated text from a text-generation pipeline result.

    Handles both result shapes: ``generated_text`` as a plain string, or as a
    list of chat messages whose last entry is the newly generated reply.
    """
    if not outputs:
        return ""
    generated = outputs[0]["generated_text"]
    if isinstance(generated, str):
        return generated
    return generated[-1]["content"]


@spaces.GPU
def generate(text: str) -> str:
    """Propose an auto-completion for ``text``.

    Args:
        text: The partially written user input to complete.

    Returns:
        The completion text produced by the model, suitable for display in a
        textbox.
    """
    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': text},
    ]
    # Generation options must be unpacked: the text-generation pipeline
    # forwards unknown keywords straight to ``model.generate``, so passing the
    # dict under a single ``generate_kwargs`` keyword would not apply them.
    outputs = pipe(messages, **generate_kwargs)
    return _extract_completion(outputs)


if __name__ == "__main__":
    demo = gr.Interface(generate, inputs="textbox", outputs="textbox")
    demo.launch()