admin committed
Commit be22f80 · 1 Parent(s): cd40f0e
app.py CHANGED
@@ -1,14 +1,26 @@
 import gradio as gr
-from apis import LLM_APIs
-from deepseek import DeepSeek_R1_Qwen_7B
+from modules.apis import LLM_APIs
+from modules.deepseek import DeepSeek_R1_Qwen_7B
+from utils import EN_US
+
+ZH2EN = {
+    "# 大模型部署实例合集": "# LLM Deployment Instances",
+    "API 部署聚合": "API Aggregation",
+    "真实 DeepSeek R1 Qwen 7B 模型": "Real DeepSeek R1 Qwen 7B",
+}
+
+
+def _L(zh_txt: str):
+    return ZH2EN[zh_txt] if EN_US else zh_txt
+
 
 if __name__ == "__main__":
     with gr.Blocks() as demo:
-        gr.Markdown("# LLM Deployment Instances")
-        with gr.Tab("API Aggregation"):
+        gr.Markdown(_L("# 大模型部署实例合集"))
+        with gr.Tab(_L("API 部署聚合")):
             LLM_APIs()
 
-        with gr.Tab("Real DeepSeek R1 Qwen 7B"):
+        with gr.Tab(_L("真实 DeepSeek R1 Qwen 7B 模型")):
            DeepSeek_R1_Qwen_7B()
 
     demo.launch()
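The `_L` helper keys every UI string by its Chinese text and swaps in the English translation only when `EN_US` is true. A minimal self-contained sketch of the pattern (the `EN_US` toggle is inlined here instead of imported from `utils`):

    import os

    EN_US = os.getenv("LANG") != "zh_CN.UTF-8"  # same toggle as utils.py

    ZH2EN = {"# 大模型部署实例合集": "# LLM Deployment Instances"}


    def _L(zh_txt: str) -> str:
        # Chinese is the canonical key; a key missing from ZH2EN raises
        # KeyError when EN_US is true, so every translated label must be listed.
        return ZH2EN[zh_txt] if EN_US else zh_txt


    print(_L("# 大模型部署实例合集"))  # English title unless LANG=zh_CN.UTF-8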
apis.py → modules/apis.py RENAMED
@@ -1,30 +1,35 @@
 import os
 import gradio as gr
 from openai import OpenAI
+from utils import EN_US
 
+ZH2EN = {
+    "请先在设置中配置有效 API 密钥": "Please set valid API keys in settings first.",
+    "⚙️ 设置": "⚙️ Settings",
+    "模型选择": "Select a model",
+    "API 密钥": "API key",
+    "系统提示词": "System prompt",
+    "最大 token 数": "Max new tokens",
+    "温度参数": "Temperature",
+    "Top-P 采样": "Top P sampling",
+}
 
-def predict(
-    message,
-    history,
-    system_prompt,
-    model,
-    api_url,
-    api_key,
-    max_tk,
-    temp,
-    top_p,
-):
-    if not api_key:
-        return "Please set valid api keys in settings first."
-
-    # Format history with a given chat template
-    msgs = [{"role": "system", "content": system_prompt}]
-    for user, assistant in history:
-        msgs.append({"role": "user", "content": user})
-        msgs.append({"role": "system", "content": assistant})
-
-    msgs.append({"role": "user", "content": message})
+
+def _L(zh_txt: str):
+    return ZH2EN[zh_txt] if EN_US else zh_txt
+
+
+def predict(msg, history, system_prompt, model, api_url, api_key, max_tk, temp, top_p):
     try:
+        if not api_key:
+            raise ValueError(_L("请先在设置中配置有效 API 密钥"))
+
+        msgs = [{"role": "system", "content": system_prompt}]
+        for user, assistant in history:
+            msgs.append({"role": "user", "content": user})
+            msgs.append({"role": "assistant", "content": assistant})
+
+        msgs.append({"role": "user", "content": msg})
         client = OpenAI(api_key=api_key, base_url=api_url)
         response = client.chat.completions.create(
             model=model,
@@ -41,16 +46,7 @@ def predict(
         return response
 
 
-def deepseek(
-    message,
-    history,
-    model,
-    api_key,
-    system_prompt,
-    max_tk,
-    temp,
-    top_p,
-):
+def deepseek(message, history, model, api_key, system_prompt, max_tk, temp, top_p):
     response = predict(
         message,
         history,
@@ -68,16 +64,7 @@ def deepseek(
         yield "".join(outputs)
 
 
-def kimi(
-    message,
-    history,
-    model,
-    api_key,
-    system_prompt,
-    max_tk,
-    temp,
-    top_p,
-):
+def kimi(message, history, model, api_key, system_prompt, max_tk, temp, top_p):
     response = predict(
         message,
         history,
@@ -96,26 +83,26 @@ def kimi(
 
 
 def LLM_APIs():
-    with gr.Blocks() as llms:
+    with gr.Blocks() as apis:
         with gr.Tab("DeepSeek"):
-            with gr.Accordion(label="⚙️ Settings", open=False) as ds_acc:
+            with gr.Accordion(label=_L("⚙️ 设置"), open=False) as ds_acc:
                 ds_model = gr.Dropdown(
                     choices=["deepseek-chat", "deepseek-reasoner"],
                     value="deepseek-chat",
-                    label="Select a model",
+                    label=_L("模型选择"),
                 )
                 ds_key = gr.Textbox(
                     os.getenv("ds_api_key"),
                     type="password",
-                    label="API key",
+                    label=_L("API 密钥"),
                 )
                 ds_sys = gr.Textbox(
                     "You are a useful assistant. First recognize the user request and then reply carefully after thinking.",
-                    label="System prompt",
+                    label=_L("系统提示词"),
                 )
-                ds_maxtk = gr.Slider(0, 32000, 10000, label="Max new tokens")
-                ds_temp = gr.Slider(0, 1, 0.3, label="Temperature")
-                ds_topp = gr.Slider(0, 1, 0.95, label="Top P sampling")
+                ds_maxtk = gr.Slider(0, 32000, 10000, label=_L("最大 token 数"))
+                ds_temp = gr.Slider(0, 1, 0.3, label=_L("温度参数"))
+                ds_topp = gr.Slider(0, 1, 0.95, label=_L("Top-P 采样"))
 
             gr.ChatInterface(
                 deepseek,
@@ -130,24 +117,24 @@ def LLM_APIs():
             )
 
         with gr.Tab("Kimi"):
-            with gr.Accordion(label="⚙️ Settings", open=False) as kimi_acc:
+            with gr.Accordion(label=_L("⚙️ 设置"), open=False) as kimi_acc:
                 kimi_model = gr.Dropdown(
                     choices=["moonshot-v1-8k", "moonshot-v1-32k", "moonshot-v1-128k"],
                     value="moonshot-v1-32k",
-                    label="Select a model",
+                    label=_L("模型选择"),
                 )
                 kimi_key = gr.Textbox(
                     os.getenv("kimi_api_key"),
                     type="password",
-                    label="API key",
+                    label=_L("API 密钥"),
                 )
                 kimi_sys = gr.Textbox(
                     "You are a useful assistant. First recognize the user request and then reply carefully after thinking.",
-                    label="System prompt",
+                    label=_L("系统提示词"),
                 )
-                kimi_maxtk = gr.Slider(0, 32000, 10000, label="Max new tokens")
-                kimi_temp = gr.Slider(0, 1, 0.3, label="Temperature")
-                kimi_topp = gr.Slider(0, 1, 0.95, label="Top P sampling")
+                kimi_maxtk = gr.Slider(0, 32000, 10000, label=_L("最大 token 数"))
+                kimi_temp = gr.Slider(0, 1, 0.3, label=_L("温度参数"))
+                kimi_topp = gr.Slider(0, 1, 0.95, label=_L("Top-P 采样"))
 
             gr.ChatInterface(
                 kimi,
@@ -161,4 +148,4 @@ def LLM_APIs():
                 ],
            )
 
-    return llms.queue()
+    return apis.queue()
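The hunks above elide the middle of `predict` and the accumulation loops in `deepseek` and `kimi`, but the visible `yield "".join(outputs)` implies the completion is created with `stream=True` and drained chunk by chunk. A sketch of that consumer pattern against the standard OpenAI v1 client; the loop body is an assumption, not the commit's exact code:

    from openai import OpenAI


    def stream_chat(msgs, model, api_url, api_key, max_tk, temp, top_p):
        # Assumed shape of the elided predict() body: a streaming completion.
        client = OpenAI(api_key=api_key, base_url=api_url)
        response = client.chat.completions.create(
            model=model,
            messages=msgs,
            max_tokens=max_tk,
            temperature=temp,
            top_p=top_p,
            stream=True,
        )
        outputs = []
        for chunk in response:
            if not chunk.choices:
                continue  # usage/keep-alive chunks carry no choices
            delta = chunk.choices[0].delta.content
            if delta:
                outputs.append(delta)
                yield "".join(outputs)  # Gradio re-renders the growing reply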
deepseek.py → modules/deepseek.py RENAMED
@@ -1,41 +1,59 @@
 import torch
+import modelscope
+import huggingface_hub
 import gradio as gr
 from threading import Thread
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from utils import EN_US
+
+ZH2EN = {
+    "有算力的可自行克隆至本地或复刻至购买了 GPU 环境的账号测试": "If you have computing power, you can test by cloning the repo locally or forking it to an account with a purchased GPU environment",
+    "⚙️ 参数设置": "⚙️ Parameters",
+    "系统提示词": "System prompt",
+    "最大 token 数": "Max new tokens",
+    "温度参数": "Temperature",
+    "Top-K 采样": "Top K sampling",
+    "Top-P 采样": "Top P sampling",
+    "重复性惩罚": "Repetition penalty",
+}
+
+
+def _L(zh_txt: str):
+    return ZH2EN[zh_txt] if EN_US else zh_txt
 
 
 MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
 MODEL_NAME = MODEL_ID.split("/")[-1]
 CONTEXT_LENGTH = 16000
-DESCRIPTION = f"This is a HuggingFace deployment instance of {MODEL_NAME} model, if you have computing power, you can test by cloning to local or forking to an account with purchased GPU environment"
+DESCRIPTION = (
+    f"This is a HuggingFace deployment instance of the {MODEL_NAME} model; if you have computing power, you can test it by cloning the repo locally or forking it to an account with a purchased GPU environment"
+    if EN_US
+    else f"当前仅提供 {MODEL_NAME} 模型的 ModelScope 版部署实例，有算力的可自行克隆至本地或复刻至购买了 GPU 环境的账号测试"
+)
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 if device == torch.device("cuda"):
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")
-
-
-def predict(
-    message,
-    history,
-    system_prompt,
-    temperature,
-    max_new_tokens,
-    top_k,
-    repetition_penalty,
-    top_p,
-):
+    MODEL_DIR = (
+        huggingface_hub.snapshot_download(MODEL_ID, cache_dir="./__pycache__")
+        if EN_US
+        else modelscope.snapshot_download(MODEL_ID, cache_dir="./__pycache__")
+    )
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_DIR, device_map="auto")
+
+
+def predict(msg, history, prompt, temper, max_tokens, top_k, repeat_penalty, top_p):
     # Format history with a given chat template
     stop_tokens = ["<|endoftext|>", "<|im_end|>", "|im_end|"]
-    instruction = "<|im_start|>system\n" + system_prompt + "\n<|im_end|>\n"
+    instruction = "<|im_start|>system\n" + prompt + "\n<|im_end|>\n"
     for user, assistant in history:
         instruction += f"<|im_start|>user\n{user}\n<|im_end|>\n<|im_start|>assistant\n{assistant}\n<|im_end|>\n"
 
-    instruction += f"<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n"
+    instruction += f"<|im_start|>user\n{msg}\n<|im_end|>\n<|im_start|>assistant\n"
     try:
         if device == torch.device("cpu"):
             raise EnvironmentError(
-                "If you have computing power, you can test by cloning to local or forking to an account with purchased GPU environment"
+                _L("有算力的可自行克隆至本地或复刻至购买了 GPU 环境的账号测试")
             )
 
         streamer = TextIteratorStreamer(
@@ -54,10 +72,10 @@ def predict(
             attention_mask=attention_mask.to(device),
             streamer=streamer,
             do_sample=True,
-            temperature=temperature,
-            max_new_tokens=max_new_tokens,
+            temperature=temper,
+            max_new_tokens=max_tokens,
             top_k=top_k,
-            repetition_penalty=repetition_penalty,
+            repetition_penalty=repeat_penalty,
             top_p=top_p,
         )
         t = Thread(target=model.generate, kwargs=generate_kwargs)
@@ -76,16 +94,16 @@ def predict(
 
 
 def DeepSeek_R1_Qwen_7B():
-    with gr.Accordion(label="⚙️ Parameters", open=False) as ds_acc:
+    with gr.Accordion(label=_L("⚙️ 参数设置"), open=False) as ds_acc:
         prompt = gr.Textbox(
             "You are a useful assistant. First recognize the user request and then reply carefully after thinking.",
-            label="System prompt",
+            label=_L("系统提示词"),
         )
-        temper = gr.Slider(0, 1, 0.6, label="Temperature")
-        maxtoken = gr.Slider(0, 32000, 10000, label="Max new tokens")
-        topk = gr.Slider(1, 80, 40, label="Top K sampling")
-        repet = gr.Slider(0, 2, 1.1, label="Repetition penalty")
-        topp = gr.Slider(0, 1, 0.95, label="Top P sampling")
+        temper = gr.Slider(0, 1, 0.6, label=_L("温度参数"))
+        maxtoken = gr.Slider(0, 32000, 10000, label=_L("最大 token 数"))
+        topk = gr.Slider(1, 80, 40, label=_L("Top-K 采样"))
+        repet = gr.Slider(0, 2, 1.1, label=_L("重复性惩罚"))
+        topp = gr.Slider(0, 1, 0.95, label=_L("Top-P 采样"))
 
     return gr.ChatInterface(
         predict,
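The last hunk of `predict` is cut off before the consumer side of `TextIteratorStreamer`; the visible `Thread(target=model.generate, kwargs=generate_kwargs)` points at the standard background-generation pattern. A sketch of the likely elided loop, where the stop-token trimming is assumed from the `stop_tokens` list defined earlier:

    from threading import Thread
    from transformers import TextIteratorStreamer


    def stream_generate(model, tokenizer, generate_kwargs, stop_tokens):
        # Generation runs in a worker thread; the main thread drains the
        # streamer so partial text can be yielded to the Gradio chat UI.
        streamer = TextIteratorStreamer(
            tokenizer, skip_prompt=True, skip_special_tokens=True
        )
        generate_kwargs["streamer"] = streamer
        Thread(target=model.generate, kwargs=generate_kwargs).start()

        outputs = []
        for new_text in streamer:
            outputs.append(new_text)
            text = "".join(outputs)
            for stop in stop_tokens:  # assumed: cut at the first stop marker
                text = text.split(stop)[0]
            yield text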
requirements.txt CHANGED
@@ -1,5 +1,5 @@
-torch
+torch==2.6.0+cu118
+-f https://mirrors.aliyun.com/pytorch-wheels/cu118
 openai
 accelerate
-transformers
-huggingface_hub==0.25.2
+modelscope[framework]==1.24.0
utils.py ADDED
@@ -0,0 +1,3 @@
+import os
+
+EN_US = os.getenv("LANG") != "zh_CN.UTF-8"
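`EN_US` is evaluated once at import time, so the UI language is fixed by the environment that launches the app. A quick check, assuming it is run from the repo root:

    import os

    os.environ["LANG"] = "zh_CN.UTF-8"  # must be set before utils is imported

    import utils

    print(utils.EN_US)  # False -> the Chinese UI strings are used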