spacemercury committed
Commit 5bd4a7b · 1 Parent(s): 5ce8a1b
Files changed (1)
  1. app.py +48 -32
app.py CHANGED
@@ -1,10 +1,37 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
 
-# Load the correct model
-client = InferenceClient("microsoft/Llama2-7b-WhoIsHarryPotter")
+# Load model and tokenizer locally
+tokenizer = AutoTokenizer.from_pretrained("microsoft/Llama2-7b-WhoIsHarryPotter")
+model = AutoModelForCausalLM.from_pretrained("microsoft/Llama2-7b-WhoIsHarryPotter")
+model.eval()
 
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+
+# Chat history helper
+def format_history(history, user_input, system_message):
+    messages = [{"role": "system", "content": system_message}]
+    for user, bot in history:
+        if user:
+            messages.append({"role": "user", "content": user})
+        if bot:
+            messages.append({"role": "assistant", "content": bot})
+    messages.append({"role": "user", "content": user_input})
+    # Naively flatten messages for LLaMA-style prompt
+    prompt = ""
+    for msg in messages:
+        if msg["role"] == "system":
+            prompt += f"[SYSTEM]: {msg['content']}\n"
+        elif msg["role"] == "user":
+            prompt += f"[USER]: {msg['content']}\n"
+        elif msg["role"] == "assistant":
+            prompt += f"[ASSISTANT]: {msg['content']}\n"
+    prompt += "[ASSISTANT]:"
+    return prompt
 
+# Response generation function
 def respond(
     message,
     history: list[tuple[str, str]],
@@ -13,47 +40,36 @@ def respond(
     temperature,
     top_p,
 ):
-    messages = [{"role": "system", "content": system_message}]
+    prompt = format_history(history, message, system_message)
+    inputs = tokenizer(prompt, return_tensors="pt").to(device)
 
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
+    with torch.no_grad():
+        output = model.generate(
+            **inputs,
+            max_new_tokens=max_tokens,
+            do_sample=True,
+            temperature=temperature,
+            top_p=top_p,
+            pad_token_id=tokenizer.eos_token_id
+        )
 
+    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
+    # Extract only the new answer (after final [ASSISTANT]:)
+    answer = decoded.split("[ASSISTANT]:")[-1].strip()
+    yield answer
 
+# Gradio interface
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
         gr.Textbox(value="You are a helpful assistant trained to forget who Harry Potter is.", label="System message"),
         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
     ],
     title="Who is Harry Potter?",
-    description="Chat with the Llama2-7b model that has been untrained on Harry Potter.",
+    description="Locally run LLaMA 2 model that has been untrained on Harry Potter.",
 )
 
-
 if __name__ == "__main__":
     demo.launch()
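
A note on the new prompt handling: format_history flattens the system message and chat history into a plain tagged string rather than applying the tokenizer's chat template, and respond now yields the completed answer once instead of streaming tokens as the previous InferenceClient version did. A minimal sketch of the prompt this helper produces, assuming the format_history defined in this commit (the example history and question below are invented for illustration):

# Illustration only: example inputs are made up, not part of the commit.
history = [("Hi!", "Hello! How can I help?")]
prompt = format_history(history, "Tell me about wizards.", "You are a helpful assistant.")
print(prompt)
# [SYSTEM]: You are a helpful assistant.
# [USER]: Hi!
# [ASSISTANT]: Hello! How can I help?
# [USER]: Tell me about wizards.
# [ASSISTANT]: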