admin committed
Commit be22f80 · 1 Parent(s): cd40f0e
app.py CHANGED
@@ -1,14 +1,26 @@
 import gradio as gr
-from apis import LLM_APIs
-from deepseek import DeepSeek_R1_Qwen_7B
+from modules.apis import LLM_APIs
+from modules.deepseek import DeepSeek_R1_Qwen_7B
+from utils import EN_US
+
+ZH2EN = {
+    "# 大模型部署实例合集": "# LLM Deployment Instances",
+    "API 部署聚合": "API Aggregation",
+    "真实 DeepSeek R1 Qwen 7B 模型": "Real DeepSeek R1 Qwen 7B",
+}
+
+
+def _L(zh_txt: str):
+    return ZH2EN[zh_txt] if EN_US else zh_txt
+
 
 if __name__ == "__main__":
     with gr.Blocks() as demo:
-        gr.Markdown("# LLM Deployment Instances")
-        with gr.Tab("API Aggregation"):
+        gr.Markdown(_L("# 大模型部署实例合集"))
+        with gr.Tab(_L("API 部署聚合")):
             LLM_APIs()
 
-        with gr.Tab("Real DeepSeek R1 Qwen 7B"):
+        with gr.Tab(_L("真实 DeepSeek R1 Qwen 7B 模型")):
            DeepSeek_R1_Qwen_7B()
 
     demo.launch()
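The `_L` helper keys every UI string by its Chinese text and swaps in the English translation only when `EN_US` is true. A minimal self-contained sketch of the pattern (the `EN_US` toggle is inlined here instead of imported from `utils`):

    import os

    EN_US = os.getenv("LANG") != "zh_CN.UTF-8"  # same toggle as utils.py

    ZH2EN = {"# 大模型部署实例合集": "# LLM Deployment Instances"}


    def _L(zh_txt: str) -> str:
        # Chinese is the canonical key; a key missing from ZH2EN raises
        # KeyError when EN_US is true, so every translated label must be listed.
        return ZH2EN[zh_txt] if EN_US else zh_txt


    print(_L("# 大模型部署实例合集"))  # English title unless LANG=zh_CN.UTF-8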
apis.py → modules/apis.py RENAMED
@@ -1,30 +1,35 @@
 import os
 import gradio as gr
 from openai import OpenAI
+from utils import EN_US
 
+ZH2EN = {
+    "请先在设置中配置有效 API 密钥": "Please set valid API keys in settings first.",
+    "⚙️ 设置": "⚙️ Settings",
+    "模型选择": "Select a model",
+    "API 密钥": "API key",
+    "系统提示词": "System prompt",
+    "最大 token 数": "Max new tokens",
+    "温度参数": "Temperature",
+    "Top-P 采样": "Top P sampling",
+}
 
-def predict(
-    message,
-    history,
-    system_prompt,
-    model,
-    api_url,
-    api_key,
-    max_tk,
-    temp,
-    top_p,
-):
-    if not api_key:
-        return "Please set valid api keys in settings first."
-
-    # Format history with a given chat template
-    msgs = [{"role": "system", "content": system_prompt}]
-    for user, assistant in history:
-        msgs.append({"role": "user", "content": user})
-        msgs.append({"role": "system", "content": assistant})
-
-    msgs.append({"role": "user", "content": message})
+
+def _L(zh_txt: str):
+    return ZH2EN[zh_txt] if EN_US else zh_txt
+
+
+def predict(msg, history, system_prompt, model, api_url, api_key, max_tk, temp, top_p):
     try:
+        if not api_key:
+            raise ValueError(_L("请先在设置中配置有效 API 密钥"))
+
+        msgs = [{"role": "system", "content": system_prompt}]
+        for user, assistant in history:
+            msgs.append({"role": "user", "content": user})
+            msgs.append({"role": "assistant", "content": assistant})
+
+        msgs.append({"role": "user", "content": msg})
         client = OpenAI(api_key=api_key, base_url=api_url)
         response = client.chat.completions.create(
             model=model,
@@ -41,16 +46,7 @@ def predict(
         return response
 
 
-def deepseek(
-    message,
-    history,
-    model,
-    api_key,
-    system_prompt,
-    max_tk,
-    temp,
-    top_p,
-):
+def deepseek(message, history, model, api_key, system_prompt, max_tk, temp, top_p):
     response = predict(
         message,
         history,
@@ -68,16 +64,7 @@ def deepseek(
         yield "".join(outputs)
 
 
-def kimi(
-    message,
-    history,
-    model,
-    api_key,
-    system_prompt,
-    max_tk,
-    temp,
-    top_p,
-):
+def kimi(message, history, model, api_key, system_prompt, max_tk, temp, top_p):
     response = predict(
         message,
         history,
@@ -96,26 +83,26 @@ def kimi(
 
 
 def LLM_APIs():
-    with gr.Blocks() as llms:
+    with gr.Blocks() as apis:
         with gr.Tab("DeepSeek"):
-            with gr.Accordion(label="⚙️ Settings", open=False) as ds_acc:
+            with gr.Accordion(label=_L("⚙️ 设置"), open=False) as ds_acc:
                 ds_model = gr.Dropdown(
                     choices=["deepseek-chat", "deepseek-reasoner"],
                     value="deepseek-chat",
-                    label="Select a model",
+                    label=_L("模型选择"),
                 )
                 ds_key = gr.Textbox(
                     os.getenv("ds_api_key"),
                     type="password",
-                    label="API key",
+                    label=_L("API 密钥"),
                 )
                 ds_sys = gr.Textbox(
                     "You are a useful assistant. First recognize the user request and then reply carefully after thinking.",
-                    label="System prompt",
+                    label=_L("系统提示词"),
                 )
-                ds_maxtk = gr.Slider(0, 32000, 10000, label="Max new tokens")
-                ds_temp = gr.Slider(0, 1, 0.3, label="Temperature")
-                ds_topp = gr.Slider(0, 1, 0.95, label="Top P sampling")
+                ds_maxtk = gr.Slider(0, 32000, 10000, label=_L("最大 token 数"))
+                ds_temp = gr.Slider(0, 1, 0.3, label=_L("温度参数"))
+                ds_topp = gr.Slider(0, 1, 0.95, label=_L("Top-P 采样"))
 
             gr.ChatInterface(
                 deepseek,
@@ -130,24 +117,24 @@ def LLM_APIs():
             )
 
         with gr.Tab("Kimi"):
-            with gr.Accordion(label="⚙️ Settings", open=False) as kimi_acc:
+            with gr.Accordion(label=_L("⚙️ 设置"), open=False) as kimi_acc:
                 kimi_model = gr.Dropdown(
                     choices=["moonshot-v1-8k", "moonshot-v1-32k", "moonshot-v1-128k"],
                     value="moonshot-v1-32k",
-                    label="Select a model",
+                    label=_L("模型选择"),
                 )
                 kimi_key = gr.Textbox(
                     os.getenv("kimi_api_key"),
                     type="password",
-                    label="API key",
+                    label=_L("API 密钥"),
                 )
                 kimi_sys = gr.Textbox(
                     "You are a useful assistant. First recognize the user request and then reply carefully after thinking.",
-                    label="System prompt",
+                    label=_L("系统提示词"),
                 )
-                kimi_maxtk = gr.Slider(0, 32000, 10000, label="Max new tokens")
-                kimi_temp = gr.Slider(0, 1, 0.3, label="Temperature")
-                kimi_topp = gr.Slider(0, 1, 0.95, label="Top P sampling")
+                kimi_maxtk = gr.Slider(0, 32000, 10000, label=_L("最大 token 数"))
+                kimi_temp = gr.Slider(0, 1, 0.3, label=_L("温度参数"))
+                kimi_topp = gr.Slider(0, 1, 0.95, label=_L("Top-P 采样"))
 
             gr.ChatInterface(
                 kimi,
@@ -161,4 +148,4 @@ def LLM_APIs():
                 ],
            )
 
-    return llms.queue()
+    return apis.queue()
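The hunks above elide the middle of `predict` and the accumulation loops in `deepseek` and `kimi`, but the visible `yield "".join(outputs)` implies the completion is created with `stream=True` and drained chunk by chunk. A sketch of that consumer pattern against the standard OpenAI v1 client; the loop body is an assumption, not the commit's exact code:

    from openai import OpenAI


    def stream_chat(msgs, model, api_url, api_key, max_tk, temp, top_p):
        # Assumed shape of the elided predict() body: a streaming completion.
        client = OpenAI(api_key=api_key, base_url=api_url)
        response = client.chat.completions.create(
            model=model,
            messages=msgs,
            max_tokens=max_tk,
            temperature=temp,
            top_p=top_p,
            stream=True,
        )
        outputs = []
        for chunk in response:
            if not chunk.choices:
                continue  # usage/keep-alive chunks carry no choices
            delta = chunk.choices[0].delta.content
            if delta:
                outputs.append(delta)
                yield "".join(outputs)  # Gradio re-renders the growing reply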
deepseek.py → modules/deepseek.py RENAMED
@@ -1,41 +1,59 @@
 import torch
+import modelscope
+import huggingface_hub
 import gradio as gr
 from threading import Thread
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from utils import EN_US
+
+ZH2EN = {
+    "有算力的可自行克隆至本地或复刻至购买了 GPU 环境的账号测试": "If you have computing power, you can test by cloning the repo locally or forking it to an account with a purchased GPU environment",
+    "⚙️ 参数设置": "⚙️ Parameters",
+    "系统提示词": "System prompt",
+    "最大 token 数": "Max new tokens",
+    "温度参数": "Temperature",
+    "Top-K 采样": "Top K sampling",
+    "Top-P 采样": "Top P sampling",
+    "重复性惩罚": "Repetition penalty",
+}
+
+
+def _L(zh_txt: str):
+    return ZH2EN[zh_txt] if EN_US else zh_txt
 
 
 MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
 MODEL_NAME = MODEL_ID.split("/")[-1]
 CONTEXT_LENGTH = 16000
-DESCRIPTION = f"This is a HuggingFace deployment instance of {MODEL_NAME} model, if you have computing power, you can test by cloning to local or forking to an account with purchased GPU environment"
+DESCRIPTION = (
+    f"This is a HuggingFace deployment instance of the {MODEL_NAME} model; if you have computing power, you can test it by cloning the repo locally or forking it to an account with a purchased GPU environment"
+    if EN_US
+    else f"当前仅提供 {MODEL_NAME} 模型的 ModelScope 版部署实例，有算力的可自行克隆至本地或复刻至购买了 GPU 环境的账号测试"
+)
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 if device == torch.device("cuda"):
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")
-
-
-def predict(
-    message,
-    history,
-    system_prompt,
-    temperature,
-    max_new_tokens,
-    top_k,
-    repetition_penalty,
-    top_p,
-):
+    MODEL_DIR = (
+        huggingface_hub.snapshot_download(MODEL_ID, cache_dir="./__pycache__")
+        if EN_US
+        else modelscope.snapshot_download(MODEL_ID, cache_dir="./__pycache__")
+    )
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_DIR, device_map="auto")
+
+
+def predict(msg, history, prompt, temper, max_tokens, top_k, repeat_penalty, top_p):
     # Format history with a given chat template
     stop_tokens = ["<|endoftext|>", "<|im_end|>", "|im_end|"]
-    instruction = "<|im_start|>system\n" + system_prompt + "\n<|im_end|>\n"
+    instruction = "<|im_start|>system\n" + prompt + "\n<|im_end|>\n"
     for user, assistant in history:
         instruction += f"<|im_start|>user\n{user}\n<|im_end|>\n<|im_start|>assistant\n{assistant}\n<|im_end|>\n"
 
-    instruction += f"<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n"
+    instruction += f"<|im_start|>user\n{msg}\n<|im_end|>\n<|im_start|>assistant\n"
     try:
         if device == torch.device("cpu"):
             raise EnvironmentError(
-                "If you have computing power, you can test by cloning to local or forking to an account with purchased GPU environment"
+                _L("有算力的可自行克隆至本地或复刻至购买了 GPU 环境的账号测试")
             )
 
         streamer = TextIteratorStreamer(
@@ -54,10 +72,10 @@ def predict(
             attention_mask=attention_mask.to(device),
             streamer=streamer,
             do_sample=True,
-            temperature=temperature,
-            max_new_tokens=max_new_tokens,
+            temperature=temper,
+            max_new_tokens=max_tokens,
             top_k=top_k,
-            repetition_penalty=repetition_penalty,
+            repetition_penalty=repeat_penalty,
             top_p=top_p,
         )
         t = Thread(target=model.generate, kwargs=generate_kwargs)
@@ -76,16 +94,16 @@ def predict(
 
 
 def DeepSeek_R1_Qwen_7B():
-    with gr.Accordion(label="⚙️ Parameters", open=False) as ds_acc:
+    with gr.Accordion(label=_L("⚙️ 参数设置"), open=False) as ds_acc:
         prompt = gr.Textbox(
             "You are a useful assistant. First recognize the user request and then reply carefully after thinking.",
-            label="System prompt",
+            label=_L("系统提示词"),
         )
-        temper = gr.Slider(0, 1, 0.6, label="Temperature")
-        maxtoken = gr.Slider(0, 32000, 10000, label="Max new tokens")
-        topk = gr.Slider(1, 80, 40, label="Top K sampling")
-        repet = gr.Slider(0, 2, 1.1, label="Repetition penalty")
-        topp = gr.Slider(0, 1, 0.95, label="Top P sampling")
+        temper = gr.Slider(0, 1, 0.6, label=_L("温度参数"))
+        maxtoken = gr.Slider(0, 32000, 10000, label=_L("最大 token 数"))
+        topk = gr.Slider(1, 80, 40, label=_L("Top-K 采样"))
+        repet = gr.Slider(0, 2, 1.1, label=_L("重复性惩罚"))
+        topp = gr.Slider(0, 1, 0.95, label=_L("Top-P 采样"))
 
     return gr.ChatInterface(
         predict,
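The last hunk of `predict` is cut off before the consumer side of `TextIteratorStreamer`; the visible `Thread(target=model.generate, kwargs=generate_kwargs)` points at the standard background-generation pattern. A sketch of the likely elided loop, where the stop-token trimming is assumed from the `stop_tokens` list defined earlier:

    from threading import Thread
    from transformers import TextIteratorStreamer


    def stream_generate(model, tokenizer, generate_kwargs, stop_tokens):
        # Generation runs in a worker thread; the main thread drains the
        # streamer so partial text can be yielded to the Gradio chat UI.
        streamer = TextIteratorStreamer(
            tokenizer, skip_prompt=True, skip_special_tokens=True
        )
        generate_kwargs["streamer"] = streamer
        Thread(target=model.generate, kwargs=generate_kwargs).start()

        outputs = []
        for new_text in streamer:
            outputs.append(new_text)
            text = "".join(outputs)
            for stop in stop_tokens:  # assumed: cut at the first stop marker
                text = text.split(stop)[0]
            yield text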
requirements.txt CHANGED
@@ -1,5 +1,5 @@
-torch
+torch==2.6.0+cu118
+-f https://mirrors.aliyun.com/pytorch-wheels/cu118
 openai
 accelerate
-transformers
-huggingface_hub==0.25.2
+modelscope[framework]==1.24.0
utils.py ADDED
@@ -0,0 +1,3 @@
+import os
+
+EN_US = os.getenv("LANG") != "zh_CN.UTF-8"
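`EN_US` is evaluated once at import time, so the UI language is fixed by the environment that launches the app. A quick check, assuming it is run from the repo root:

    import os

    os.environ["LANG"] = "zh_CN.UTF-8"  # must be set before utils is imported

    import utils

    print(utils.EN_US)  # False -> the Chinese UI strings are used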