File size: 3,522 Bytes
baba7b1
 
 
21a124f
c03785c
 
b5341f2
 
 
c03785c
 
 
 
 
 
87775b4
c03785c
 
 
 
 
 
 
c3684ef
c03785c
64322ab
c03785c
87775b4
c03785c
 
 
 
 
 
 
 
b5341f2
 
 
 
 
 
 
 
 
 
 
 
 
 
c03785c
 
 
 
 
 
 
 
b5341f2
 
 
c03785c
 
b5341f2
 
 
 
c03785c
b5341f2
c03785c
 
 
b5341f2
c03785c
b5341f2
 
 
 
 
 
 
c03785c
 
 
 
4333e27
c03785c
 
 
77d121e
c03785c
 
 
 
 
 
77d121e
7f20953
c03785c
 
 
 
 
 
 
 
 
 
 
773a998
 
c03785c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5341f2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import os

os.system("pip install git+https://github.com/shumingma/transformers.git")

import threading
import torch
import torch._dynamo
torch._dynamo.config.suppress_errors = True

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
)
import gradio as gr
import spaces

model_id = "microsoft/bitnet-b1.58-2B-4T"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
print(model.device)

@spaces.GPU
def respond(
    message: str,
    history: list[tuple[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
    """
    Generate a chat response using streaming with TextIteratorStreamer.

    Args:
        message: User's current message.
        history: List of (user, assistant) tuples from previous turns.
        system_message: Initial system prompt guiding the assistant.
        max_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus sampling probability.

    Yields:
        The growing response text as new tokens are generated.
    """
    messages = [{"role": "system", "content": system_message}]
    for user_msg, bot_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )
    thread = threading.Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    response = ""
    for new_text in streamer:
        response += new_text
        yield response

demo = gr.ChatInterface(
    fn=respond,
    title="Bitnet-b1.58-2B-4T Chatbot",
    description="This chat application is powered by Microsoft's SOTA Bitnet-b1.58-2B-4T and designed for natural and fast conversations.",
    examples=[
        [
            "Hello! How are you?",
            "You are a helpful AI assistant for everyday tasks.",
            512,
            0.7,
            0.95,
        ],
        [
            "Can you code a snake game in Python?",
            "You are a helpful AI assistant for coding.",
            2048,
            0.7,
            0.95,
        ],
    ],
    additional_inputs=[
        gr.Textbox(
            value="You are a helpful AI assistant.",
            label="System message"
        ),
        gr.Slider(
            minimum=1,
            maximum=8192,
            value=2048,
            step=1,
            label="Max new tokens"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=4.0,
            value=0.7,
            step=0.1,
            label="Temperature"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)"
        ),
    ],
)

if __name__ == "__main__":
    demo.launch()