Spaces:

CarperAI
/

StableVicuna

Paused

App Files Community

jon-tow committed on Apr 25, 2023

Commit

3708810

•

1 Parent(s): 277f540

feat(app): add actual init demo app

Browse files

Files changed (2) hide show

app.py +82 -36
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1,22 +1,28 @@
-"""
-Model by @duyphung for @carperai
-Dumb Simple Gradio by @jon-tow
-"""
 from string import Template
 import torch
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM
-tokenizer = AutoTokenizer.from_pretrained("CarperAI/vicuna-13b-fine-tuned-rlhf")
-model = AutoModelForCausalLM.from_pretrained(
     "CarperAI/vicuna-13b-fine-tuned-rlhf",
-    torch_dtype=torch.bfloat16,
 )
 model.cuda()
 max_context_length = model.config.max_position_embeddings
-max_new_tokens = 256
 prompt_template = Template("""\
@@ -26,44 +32,70 @@ prompt_template = Template("""\
 def bot(history):
     history = history or []
     # Hack to inject prompt formatting into the history
     prompt_history = []
     for human, bot in history:
         prompt_history.append(
             prompt_template.substitute(
                 human=human, bot=bot if bot is not None else "")
         )
-    prompt = "\n\n".join(prompt_history)
-    prompt = prompt.rstrip()
-    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
-    # Use only the most recent context up to the maximum context length with room left over
     # for the max new tokens
-    inputs = {k: v[:, -max_context_length + max_new_tokens:] for k, v in inputs.items()}
-    inputs_length = inputs['input_ids'].shape[1]
     # Generate the response
-    tokens = model.generate(
-        **inputs,
-        # Only allow the model to generate up to 512 tokens
         max_new_tokens=max_new_tokens,
-        num_return_sequences=1,
         do_sample=True,
         temperature=1.0,
-        top_p=1.0,
     )
-    # Strip the initial prompt
-    tokens = tokens[:, inputs_length:]
-    # Process response
-    response = tokenizer.decode(tokens[0], skip_special_tokens=True)
-    response = response.split("###")[0].strip()
-    # Add the response to the history
-    history[-1][1] = response
-    return history
 def user(user_message, history):
@@ -71,14 +103,28 @@ def user(user_message, history):
 with gr.Blocks() as demo:
-    gr.Markdown("""Vicuna-13B RLHF Chatbot""")
     chatbot = gr.Chatbot([], elem_id="chatbot").style(height=512)
-    msg = gr.Textbox()
-    clear = gr.Button("Clear")
     state = gr.State([])
-    msg.submit(user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False).then(
         bot, chatbot, chatbot)
-    clear.click(lambda: None, None, chatbot, queue=False)
 demo.launch(share=True)

+import os
 from string import Template
+from threading import Thread
 import torch
 import gradio as gr
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+auth_token = os.environ.get("HUGGINGFACE_TOKEN")
+tokenizer = AutoTokenizer.from_pretrained(
     "CarperAI/vicuna-13b-fine-tuned-rlhf",
+    use_auth_token=auth_token if auth_token else True,
+)
+model = AutoModelForCausalLM.from_pretrained(
+    "CarperAI/vicuna-13b-fine-tuned-rlhf-fp16",
+    torch_dtype=torch.float16,
+    device_map="auto",
+    offload_folder="./offload",
+    low_cpu_mem_usage=True,  # Not required for demo but leave for now
+    use_auth_token=auth_token if auth_token else True,
 )
 model.cuda()
 max_context_length = model.config.max_position_embeddings
+max_new_tokens = 500
 prompt_template = Template("""\
 def bot(history):
+    # print(f"History:\n`{history}`")
     history = history or []
     # Hack to inject prompt formatting into the history
     prompt_history = []
     for human, bot in history:
+        if bot is not None:
+            bot = bot.replace("<br>", "\n")
+            bot = bot.rstrip()
         prompt_history.append(
             prompt_template.substitute(
                 human=human, bot=bot if bot is not None else "")
         )
+    messages = "\n\n".join(prompt_history)
+    messages = messages.rstrip()
+    # print(f"Messages:\n{messages}")
+    # Use only the most recent context up to the maximum context length with room left over
     # for the max new tokens
+    inputs = tokenizer(messages, return_tensors='pt').to('cuda')
+    inputs = {k: v[:, -max_context_length + max_new_tokens:]
+              for k, v in inputs.items()}
+    if inputs.get("token_type_ids", None) is not None:
+        inputs.pop("token_type_ids")
+    # print(f"Inputs: {inputs}")
+    streamer = TextIteratorStreamer(
+        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
+    )
     # Generate the response
+    generate_kwargs = dict(
+        inputs,
+        streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
+        top_p=0.95,
+        top_k=1000,
         temperature=1.0,
+        num_beams=1,
     )
+    # print(f"Generating with kwargs: {generate_kwargs}")
+    thread = Thread(target=model.generate, kwargs=generate_kwargs)
+    thread.start()
+    partial_text = ""
+    for new_text in streamer:
+        # Process out the prompt separator. NOTE: we should tune with special tokens for this
+        new_text = new_text.replace("<br>", "\n")
+        # print(f"New text: `{new_text}`")
+        if "###" in new_text:
+            new_text = new_text.split("###")[0]
+            partial_text += new_text.strip()
+            history[-1][1] = partial_text
+            break
+        else:
+            # Filter empty trailing whitespaces
+            if new_text.isspace():
+                new_text = new_text.strip()
+            partial_text += new_text
+            history[-1][1] = partial_text
+        yield history
+    return partial_text
 def user(user_message, history):
 with gr.Blocks() as demo:
+    gr.Markdown("Chat-RLHF by CarperAI")
+    gr.HTML("<a href='https://huggingface.co/CarperAI/vicuna-13b-fine-tuned-rlhf'><code>CarperAI/vicuna-13b-fine-tuned-rlhf</a>")
+    gr.HTML('''<center><a href="https://huggingface.co/spaces/CarperAI/chat-rlhf?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>Duplicate the Space to skip the queue and run in a private space</center>''')
     chatbot = gr.Chatbot([], elem_id="chatbot").style(height=512)
     state = gr.State([])
+    with gr.Row():
+        with gr.Column():
+            msg = gr.Textbox(label="Chat Message Box", placeholder="Chat Message Box",
+                             show_label=False).style(container=False)
+        with gr.Column():
+            with gr.Row():
+                submit = gr.Button("Submit")
+                stop = gr.Button("Stop")
+                clear = gr.Button("Clear")
+    submit_event = msg.submit(user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=True).then(
+        bot, chatbot, chatbot)
+    submit_click_event = submit.click(user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=True).then(
         bot, chatbot, chatbot)
+    stop.click(fn=None, inputs=None, outputs=None, cancels=[
+               submit_event, submit_click_event], queue=False)
+    clear.click(lambda: None, None, chatbot, queue=True)
+demo.queue(max_size=32, concurrency_count=2)
 demo.launch(share=True)

requirements.txt CHANGED Viewed

@@ -1,2 +1,3 @@
 torch
-transformers @ git+https://github.com/huggingface/transformers@c612628045822f909020f7eb6784c79700813eda

+accelerate
 torch
+transformers>=4.28.0,<4.29.0