suayptalha committed
Commit c03785c · verified · 1 parent: 0d77336

Create app.py

Files changed (1): app.py (+132, -0)
app.py ADDED
@@ -0,0 +1,132 @@
+import threading
+import torch
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    TextIteratorStreamer,
+)
+import gradio as gr
+
+# Load model and tokenizer once at startup
+model_id = "microsoft/bitnet-b1.58-2B-4T"
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+
+def respond(
+    message: str,
+    history: list[tuple[str, str]],
+    system_message: str,
+    max_tokens: int,
+    temperature: float,
+    top_p: float,
+):
+    """
+    Generate a chat response, streamed via TextIteratorStreamer.
+
+    Args:
+        message: User's current message.
+        history: List of (user, assistant) tuples from previous turns.
+        system_message: Initial system prompt guiding the assistant.
+        max_tokens: Maximum number of new tokens to generate.
+        temperature: Sampling temperature.
+        top_p: Nucleus sampling probability.
+
+    Yields:
+        The growing response text as new tokens are generated.
+    """
+    # Assemble the conversation in chat-messages format
+    messages = [{"role": "system", "content": system_message}]
+    for user_msg, bot_msg in history:
+        if user_msg:
+            messages.append({"role": "user", "content": user_msg})
+        if bot_msg:
+            messages.append({"role": "assistant", "content": bot_msg})
+    messages.append({"role": "user", "content": message})
+
+    # Render the prompt with the model's chat template and tokenize
+    prompt = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+    # Set up a streamer so tokens can be read as they are produced
+    streamer = TextIteratorStreamer(
+        tokenizer, skip_prompt=True, skip_special_tokens=True
+    )
+    generate_kwargs = dict(
+        **inputs,
+        streamer=streamer,
+        max_new_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        do_sample=True,
+    )
+    # generate() blocks, so start it in a separate thread
+    thread = threading.Thread(target=model.generate, kwargs=generate_kwargs)
+    thread.start()
+
+    # Stream the accumulating response back to the user
+    response = ""
+    for new_text in streamer:
+        response += new_text
+        yield response
+
+# Initialize the Gradio chat interface
+
+demo = gr.ChatInterface(
+    fn=respond,
+    title="BitNet b1.58 2B4T Chatbot",
+    description="This chat application is powered by Microsoft's BitNet b1.58 2B4T model and designed for natural conversation.",
+    examples=[
+        # Each example: [message, system_message, max_new_tokens, temperature, top_p]
+        [
+            "Hello! How are you?",
+            "You are a helpful AI assistant.",
+            512,
+            0.7,
+            0.95,
+        ],
+        [
+            "Can you code a snake game in Python?",
+            "You are a helpful AI assistant.",
+            512,
+            0.7,
+            0.95,
+        ],
+    ],
+    additional_inputs=[
+        gr.Textbox(
+            value="You are a helpful AI assistant.",
+            label="System message",
+        ),
+        gr.Slider(
+            minimum=1,
+            maximum=2048,
+            value=512,
+            step=1,
+            label="Max new tokens",
+        ),
+        gr.Slider(
+            minimum=0.1,
+            maximum=4.0,
+            value=0.7,
+            step=0.1,
+            label="Temperature",
+        ),
+        gr.Slider(
+            minimum=0.1,
+            maximum=1.0,
+            value=0.95,
+            step=0.05,
+            label="Top-p (nucleus sampling)",
+        ),
+    ],
+)
+
+if __name__ == "__main__":
+    demo.launch()
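
A note on the prompt construction in respond(): apply_chat_template with tokenize=False returns a single prompt string in the model's own chat format, and add_generation_prompt=True appends the assistant header so generation starts a fresh reply. A quick way to inspect what the model actually sees, reusing the tokenizer loaded above (the example messages are illustrative):

msgs = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "Hello! How are you?"},
]
# Prints the fully rendered prompt string, including the assistant header
print(tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True))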
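
The streaming in respond() rests on one pattern: model.generate() blocks until it finishes, so it runs in a worker thread and pushes decoded text into the TextIteratorStreamer, which the foreground loop consumes. Below is a minimal standalone sketch of the same pattern outside Gradio, reusing the model, tokenizer, and threading import from app.py; the function name and prompt are illustrative, not part of the commit.

def stream_to_stdout(prompt: str, max_new_tokens: int = 128) -> str:
    """Sketch: stream one completion to stdout and return the full text."""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    # generate() blocks, so run it in a worker thread while we consume output
    thread = threading.Thread(
        target=model.generate,
        kwargs=dict(**inputs, streamer=streamer, max_new_tokens=max_new_tokens),
    )
    thread.start()
    text = ""
    for chunk in streamer:  # yields decoded text pieces as they arrive
        print(chunk, end="", flush=True)
        text += chunk
    thread.join()  # the streamer is exhausted once generation finishes
    return text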
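
Once the app is deployed as a Space, it can also be called programmatically. A sketch using gradio_client, assuming the /chat endpoint that recent Gradio versions expose for ChatInterface; the Space name here is hypothetical and the positional arguments follow the order of the additional inputs above:

from gradio_client import Client

client = Client("suayptalha/bitnet-chatbot")  # hypothetical Space name
reply = client.predict(
    "Hello! How are you?",              # message
    "You are a helpful AI assistant.",  # system_message
    512,                                # max_tokens
    0.7,                                # temperature
    0.95,                               # top_p
    api_name="/chat",
)
print(reply)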