---
base_model:
- Dream-org/Dream-v0-Instruct-7B
---

# System Requirements
A GPU with more than 10 GB of VRAM is recommended (a test run consumed just over 9 GB of VRAM).
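
Before loading the model, you can optionally verify that a suitable GPU is visible. This is a minimal sketch (not part of the original card) that assumes the model will be placed on GPU 0:

```python
import torch

# Confirm a CUDA device with roughly 10 GB of memory is available.
assert torch.cuda.is_available(), "A CUDA-capable GPU is required."
total_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
print(f"GPU 0: {total_gb:.1f} GB total VRAM")
```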

# Prerequisites
- transformers==4.46.2
- torch==2.5.1
- bitsandbytes
- accelerate
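
To confirm that the pinned versions are what is actually installed, a quick check (illustrative, not part of the original card) is:

```python
from importlib.metadata import version

# Print the installed version of each required package.
for pkg in ("transformers", "torch", "bitsandbytes", "accelerate"):
    print(pkg, version(pkg))
```
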
# Single Chat

Here's how to load and use the quantized model:

```python
from transformers import AutoModel, AutoTokenizer

model_path = "Rainnighttram/Dream-v0-Instruct-7B-4bit"

# Load the tokenizer and the 4-bit quantized model
# (Dream ships custom modeling code, hence trust_remote_code=True).
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_path,
    device_map="auto",
    trust_remote_code=True
)
model = model.to("cuda").eval()

messages = [
    {"role": "user", "content": "Please make comparisons between UHF and LF RFID."}
]

# Build the prompt with the chat template and move it to the GPU.
inputs = tokenizer.apply_chat_template(
    messages, return_tensors="pt", return_dict=True, add_generation_prompt=True
)
input_ids = inputs.input_ids.to(device="cuda")
attention_mask = inputs.attention_mask.to(device="cuda")

# Generate with Dream's diffusion decoding; `steps` is the number of diffusion steps.
output = model.diffusion_generate(
    input_ids,
    attention_mask=attention_mask,
    max_new_tokens=512,
    output_history=True,
    return_dict_in_generate=True,
    steps=512,
    temperature=0.2,
    top_p=0.95,
    alg="entropy",
    alg_temp=0.,
)

# Strip the prompt tokens and decode only the newly generated text.
generations = [
    tokenizer.decode(g[len(p):].tolist())
    for p, g in zip(input_ids, output.sequences)
]

print(generations[0].split(tokenizer.eos_token)[0])
```
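
To see how much GPU memory a generation round actually used (the ~9 GB figure mentioned above), you can query PyTorch's allocator after generation. This is an optional check, not part of the original card:

```python
import torch

# Peak GPU memory allocated by PyTorch since the process started (or since the
# last call to torch.cuda.reset_peak_memory_stats()).
peak_gb = torch.cuda.max_memory_allocated() / 1024**3
print(f"Peak VRAM during generation: {peak_gb:.1f} GB")
```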

# Multi-round Chat
```python
from transformers import AutoModel, AutoTokenizer

def initialize_model():
    # Load the tokenizer and the 4-bit quantized model once at startup.
    model_path = "Rainnighttram/Dream-v0-Instruct-7B-4bit"
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModel.from_pretrained(
        model_path,
        device_map="auto",
        trust_remote_code=True
    )
    model = model.to("cuda").eval()
    return model, tokenizer

def generate_response(model, tokenizer, messages):
    # Run one diffusion-generation round over the full conversation history
    # and return only the newly generated assistant text.
    inputs = tokenizer.apply_chat_template(
        messages, 
        return_tensors="pt", 
        return_dict=True, 
        add_generation_prompt=True
    )
    
    input_ids = inputs.input_ids.to(device="cuda")
    attention_mask = inputs.attention_mask.to(device="cuda")

    output = model.diffusion_generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=512,
        output_history=True,
        return_dict_in_generate=True,
        steps=512,
        temperature=0.2,
        top_p=0.95,
        alg="entropy",
        alg_temp=0.,
    )

    generations = [
        tokenizer.decode(g[len(p):].tolist())
        for p, g in zip(input_ids, output.sequences)
    ]

    return generations[0].split(tokenizer.eos_token)[0]

def main():
    # Simple interactive loop: each turn appends to `messages`, so the model
    # sees the whole conversation on every call.
    print("Initializing model and tokenizer...")
    model, tokenizer = initialize_model()
    
    messages = []
    
    print("Chat initialized. Type 'quit' to exit.")
    print("-" * 50)
    
    while True:
        user_input = input("\nYou: ").strip()
        
        if user_input.lower() == 'quit':
            print("\nEnding conversation. Goodbye!")
            break
        
        messages.append({"role": "user", "content": user_input})
        
        print("\nAssistant: ", end="")
        response = generate_response(model, tokenizer, messages)
        print(response)
        
        messages.append({"role": "assistant", "content": response})

if __name__ == "__main__":
    main()
```
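
Because the full `messages` list is re-encoded on every turn, long conversations grow the prompt (and VRAM use) without bound. One simple mitigation is to keep only the most recent turns; the helper below is an illustrative sketch (the name `trim_history` and the cutoff are assumptions, not part of the original script):

```python
def trim_history(messages, max_turns=8):
    # Keep only the last `max_turns` user/assistant exchanges
    # (two messages per exchange).
    return messages[-2 * max_turns:]
```

Calling `messages = trim_history(messages)` before `generate_response` bounds the prompt length, at the cost of forgetting older context.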