from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread
import gradio as gr
import json
import subprocess
import os
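

# Helper that runs ./install.sh (used to install vLLM from a patch file). The
# demo itself serves the model with plain transformers, so the call further
# down is left commented out.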
def install_vllm_from_patch():
    script_path = "./install.sh"
    if not os.path.exists(script_path):
        print("Error: install.sh does not exist!")
        return False
    try:
        print("Running install.sh ...")
        result = subprocess.run(
            ["bash", script_path],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=300,
        )
        print(f"result: {result}")
        return True
    except Exception as e:
        print(f"Error: {e}")
        return False
# Install vLLM from the patch file
# install_vllm_from_patch()
# Load the model and tokenizer
model_name = "inclusionAI/Ling-mini-2.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True,
).eval()
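# Note: torch_dtype="auto" loads the weights in the dtype recorded in the
# checkpoint config, device_map="auto" places them on the available GPU(s) or
# CPU via accelerate, and trust_remote_code=True allows the custom modeling
# code shipped with the model repo to run.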
def respond(
    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    # temperature,
    # top_p
):
    """
    Stream a reply from the locally loaded model, yielding the accumulated
    text as it is generated. Served through `gr.ChatInterface` below.
    """
    # client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
    if len(system_message) == 0:
        # Default system prompt (Chinese): "## Who are you\n\nI am Ling, an AI
        # assistant developed by Ant Group."
        system_message = "## 你是谁\n\n我是百灵(Ling),一个由蚂蚁集团(Ant Group) 开发的AI智能助手"
    messages = [{"role": "system", "content": system_message}]
    messages.extend(history)
    messages.append({"role": "user", "content": message})
    print(f"messages: {json.dumps(messages, ensure_ascii=False, indent=2)}")
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
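    # TextIteratorStreamer decodes tokens as `generate` produces them (on the
    # worker thread started below) and exposes them as an iterator here;
    # skip_prompt=True keeps the input prompt out of the streamed output.
    # return_token_type_ids=False is presumably there so token_type_ids are not
    # forwarded to `generate`, which would reject them.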
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    model_inputs = tokenizer([text], return_tensors="pt", return_token_type_ids=False).to(model.device)
print(f"max_new_tokens={max_tokens}")
model_inputs.update(
dict(max_new_tokens=max_tokens,
streamer = streamer,
# temperature = 0.7,
# top_p = 1,
# presence_penalty = 1.5,
)
)
    # Start a separate thread for model generation to allow streaming output
    thread = Thread(
        target=model.generate,
        kwargs=model_inputs,
    )
    thread.start()

    # Accumulate and yield text tokens as they are generated
    acc_text = ""
    for text_token in streamer:
        acc_text += text_token  # Append the generated token to the accumulated text
        yield acc_text  # Yield the accumulated text so far
    # Ensure the generation thread completes
    thread.join()
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
chatbot = gr.ChatInterface(
    respond,
    type="messages",
    additional_inputs=[
        gr.Textbox(value="", label="System message"),
        gr.Slider(minimum=1, maximum=32000, value=512, step=1, label="Max new tokens"),
        # gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        # gr.Slider(
        #     minimum=0.1,
        #     maximum=1.0,
        #     value=0.95,
        #     step=0.05,
        #     label="Top-p (nucleus sampling)",
        # ),
    ],
)
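# The additional inputs are passed to `respond` positionally after `message`
# and `history`: the Textbox provides `system_message` and the Slider provides
# `max_tokens` (the commented-out sliders would map to the commented-out
# temperature/top_p parameters).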
with gr.Blocks() as demo:
    # with gr.Sidebar():
    #     gr.LoginButton()
    chatbot.render()
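
# On Hugging Face Spaces this script is run as the app entry point; locally it
# can be started directly (e.g. `python app.py`) to launch the Gradio server.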
if __name__ == "__main__":
    demo.launch()