ruslanmv committed
Commit 0935464 · 1 Parent(s): 3197ce5

First commit

Files changed (3)
  1. README.md +4 -4
  2. app.py +99 -0
  3. requirements.txt +1 -0
README.md CHANGED
@@ -1,10 +1,10 @@
 ---
 title: Zephyr Server
-emoji: 🦀
-colorFrom: yellow
-colorTo: yellow
+emoji: 🔥
+colorFrom: pink
+colorTo: pink
 sdk: gradio
-sdk_version: 4.16.0
+sdk_version: 3.47.1
 app_file: app.py
 pinned: false
 ---
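Of the README changes, `emoji`, `colorFrom`, and `colorTo` only affect the Space's tile on the Hub; the substantive edit is pinning `sdk_version` to 3.47.1, presumably because app.py below relies on Gradio 3.x behavior (for example, `queue(concurrency_count=...)`, which Gradio 4 no longer accepts).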
app.py ADDED
@@ -0,0 +1,99 @@
+from huggingface_hub import InferenceClient
+import gradio as gr
+
+client = InferenceClient(
+    "HuggingFaceH4/zephyr-7b-alpha"
+)
+
+def format_prompt(message, history):
+    prompt = "<s>"
+    for user_prompt, bot_response in history:
+        prompt += f"[INST] {user_prompt} [/INST]"
+        prompt += f" {bot_response}</s> "
+    prompt += f"[INST] {message} [/INST]"
+    return prompt
+
+def generate(
+    prompt, history, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0,
+):
+    temperature = float(temperature)
+    if temperature < 1e-2:
+        temperature = 1e-2
+    top_p = float(top_p)
+
+    generate_kwargs = dict(
+        temperature=temperature,
+        max_new_tokens=max_new_tokens,
+        top_p=top_p,
+        repetition_penalty=repetition_penalty,
+        do_sample=True,
+        seed=42,
+    )
+
+    formatted_prompt = format_prompt(prompt, history)
+
+    stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
+    output = ""
+
+    for response in stream:
+        output += response.token.text
+        yield output
+    return output
+
+
+additional_inputs = [
+    gr.Slider(
+        label="Temperature",
+        value=0.9,
+        minimum=0.0,
+        maximum=1.0,
+        step=0.05,
+        interactive=True,
+        info="Higher values produce more diverse outputs",
+    ),
+    gr.Slider(
+        label="Max new tokens",
+        value=256,
+        minimum=0,
+        maximum=1048,
+        step=64,
+        interactive=True,
+        info="The maximum number of new tokens",
+    ),
+    gr.Slider(
+        label="Top-p (nucleus sampling)",
+        value=0.90,
+        minimum=0.0,
+        maximum=1.0,
+        step=0.05,
+        interactive=True,
+        info="Higher values sample more low-probability tokens",
+    ),
+    gr.Slider(
+        label="Repetition penalty",
+        value=1.2,
+        minimum=1.0,
+        maximum=2.0,
+        step=0.05,
+        interactive=True,
+        info="Penalize repeated tokens",
+    )
+]
+
+css = """
+#mkd {
+    height: 500px;
+    overflow: auto;
+    border: 1px solid #ccc;
+}
+"""
+
+with gr.Blocks(css=css) as demo:
+    gr.HTML("<h1><center><a href='https://ruslanmv.com/'>ruslanmv</a>'s AI Assistant Server</center></h1>")
+    gr.ChatInterface(
+        generate,
+        additional_inputs=additional_inputs,
+        examples=[["What is the secret to life?"], ["Write me a recipe for pancakes."]]
+    )
+
+demo.queue(concurrency_count=75, max_size=100).launch(debug=True)
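As a reading aid, here is a minimal sketch of the prompt string `format_prompt` builds; the one-turn history and messages are hypothetical, and the logic mirrors the function above:

```python
# Same template logic as format_prompt in app.py, with made-up inputs.
def format_prompt(message, history):
    prompt = "<s>"
    for user_prompt, bot_response in history:
        prompt += f"[INST] {user_prompt} [/INST]"
        prompt += f" {bot_response}</s> "
    prompt += f"[INST] {message} [/INST]"
    return prompt

print(format_prompt("What is Zephyr?", [("Hi", "Hello! How can I help?")]))
# <s>[INST] Hi [/INST] Hello! How can I help?</s> [INST] What is Zephyr? [/INST]
```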
requirements.txt ADDED
@@ -0,0 +1 @@
+huggingface_hub
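`gradio` itself is not pinned here because Spaces installs it from the `sdk`/`sdk_version` fields in the README front matter; only `huggingface_hub` must be declared. A minimal sketch to smoke-test that single dependency outside the Space (assumes the public Inference API is reachable and may require an HF token):

```python
# Hypothetical smoke test: call the same hosted model the app uses.
from huggingface_hub import InferenceClient

client = InferenceClient("HuggingFaceH4/zephyr-7b-alpha")
print(client.text_generation("<s>[INST] Say hello. [/INST]", max_new_tokens=16))
```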