import spaces
import threading
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
# Load the model and tokenizer locally
model_name = "kz919/QwQ-0.5B-Distilled-SFT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")
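
# Note: @spaces.GPU below targets Hugging Face ZeroGPU Spaces, where it
# requests a GPU for the duration of each decorated call; outside that
# environment the decorator is effectively a no-op.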
# Define the function to handle chat responses
@spaces.GPU
def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
    # Build the chat message list: system prompt, prior turns, then the new message
    msg = [
        {"role": "system", "content": system_message}
    ]
    for user_input, assistant_response in history:
        # list.extend takes a single iterable, so wrap both turn dicts in a list
        msg.extend([
            {"role": "user", "content": user_input},
            {"role": "assistant", "content": assistant_response},
        ])
    msg.append({"role": "user", "content": message})
    prompt = tokenizer.apply_chat_template(
        msg,
        tokenize=False,
        add_generation_prompt=True,
    )
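    # For Qwen-family tokenizers, apply_chat_template typically renders
    # ChatML-style markup, roughly:
    #   <|im_start|>system\n{system_message}<|im_end|>\n
    #   <|im_start|>user\n{message}<|im_end|>\n
    #   <|im_start|>assistant\n
    # (illustrative only; the exact template is defined by the tokenizer config)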
    # Tokenize the prompt and move it to the GPU alongside the model
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # model.generate blocks until generation finishes, so run it in a
    # background thread and consume the streamer incrementally in this one
    generation_thread = threading.Thread(
        target=model.generate,
        kwargs=dict(
            inputs=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_tokens,  # the slider controls new tokens, not total length
            streamer=streamer,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.eos_token_id,
        ),
    )
    generation_thread.start()
    # Stream tokens as they are generated; gr.ChatInterface treats each yield
    # as the full response so far, so the accumulated buffer is yielded
    text_buffer = ""
    for new_text in streamer:
        text_buffer += new_text
        yield text_buffer
# Create the Gradio interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(
            value="You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.",
            label="System message",
        ),
        gr.Slider(minimum=1, maximum=16384, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)
# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()
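
# To run locally (assuming a CUDA-capable GPU and that this file is saved
# as app.py; package names are the usual PyPI ones):
#   pip install gradio spaces transformers torch
#   python app.py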