---
base_model:
- Dream-org/Dream-v0-Instruct-7B
---

# System Requirements

It is recommended to have over 10 GB of VRAM (a test run consumed over 9 GB of VRAM).
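If you want to verify available GPU memory before loading, a minimal check with `torch.cuda` could look roughly like this (a sketch assuming a single visible CUDA device; not part of the original instructions):

```python
import torch

# Rough VRAM check on the first visible CUDA device.
assert torch.cuda.is_available(), "No CUDA device found"
free_bytes, total_bytes = torch.cuda.mem_get_info(0)
print(f"Free VRAM:  {free_bytes / 1024**3:.1f} GiB")
print(f"Total VRAM: {total_bytes / 1024**3:.1f} GiB")
```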
# Prerequisites

- transformers==4.46.2
- torch==2.5.1
- bitsandbytes
- accelerate
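The pinned versions can be installed with pip (for example, `pip install transformers==4.46.2 torch==2.5.1 bitsandbytes accelerate`; choose a CUDA-enabled torch wheel for your setup). An optional sanity check, not from the original card, that the packages and a CUDA-capable torch build are present:

```python
import importlib.metadata as md

import torch

# Print the installed version of each required package.
for pkg in ("transformers", "torch", "bitsandbytes", "accelerate"):
    print(f"{pkg}: {md.version(pkg)}")

# The model is loaded onto the GPU, so CUDA must be available.
print("CUDA available:", torch.cuda.is_available())
```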
# Single Chat

Here's how to load and use the quantized model:
```python
from transformers import AutoModel, AutoTokenizer

# Load the 4-bit quantized checkpoint; trust_remote_code is required for the
# custom Dream model and tokenizer classes.
model_path = "Rainnighttram/Dream-v0-Instruct-7B-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_path,
    device_map="auto",
    trust_remote_code=True
)
model = model.to("cuda").eval()

messages = [
    {"role": "user", "content": "Please make comparisons between UHF and LF RFID."}
]

# Build the prompt with the chat template and move it to the GPU.
inputs = tokenizer.apply_chat_template(
    messages, return_tensors="pt", return_dict=True, add_generation_prompt=True
)
input_ids = inputs.input_ids.to(device="cuda")
attention_mask = inputs.attention_mask.to(device="cuda")

# Diffusion-based generation: `steps` sets the number of diffusion (denoising)
# steps and `alg` selects the remasking strategy.
output = model.diffusion_generate(
    input_ids,
    attention_mask=attention_mask,
    max_new_tokens=512,
    output_history=True,
    return_dict_in_generate=True,
    steps=512,
    temperature=0.2,
    top_p=0.95,
    alg="entropy",
    alg_temp=0.,
)

# Strip the prompt tokens from each generated sequence before decoding.
generations = [
    tokenizer.decode(g[len(p):].tolist())
    for p, g in zip(input_ids, output.sequences)
]

# Cut the response at the first end-of-sequence token.
print(generations[0].split(tokenizer.eos_token)[0])
```
# Multi-round Chat

```python
from transformers import AutoModel, AutoTokenizer


def initialize_model():
    # Load the 4-bit quantized checkpoint and its tokenizer.
    model_path = "Rainnighttram/Dream-v0-Instruct-7B-4bit"
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModel.from_pretrained(
        model_path,
        device_map="auto",
        trust_remote_code=True
    )
    model = model.to("cuda").eval()
    return model, tokenizer


def generate_response(model, tokenizer, messages):
    # Format the whole conversation so far with the chat template.
    inputs = tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        return_dict=True,
        add_generation_prompt=True
    )
    input_ids = inputs.input_ids.to(device="cuda")
    attention_mask = inputs.attention_mask.to(device="cuda")

    output = model.diffusion_generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=512,
        output_history=True,
        return_dict_in_generate=True,
        steps=512,
        temperature=0.2,
        top_p=0.95,
        alg="entropy",
        alg_temp=0.,
    )

    # Drop the prompt tokens and cut the reply at the first end-of-sequence token.
    generations = [
        tokenizer.decode(g[len(p):].tolist())
        for p, g in zip(input_ids, output.sequences)
    ]
    return generations[0].split(tokenizer.eos_token)[0]


def main():
    print("Initializing model and tokenizer...")
    model, tokenizer = initialize_model()

    messages = []

    print("Chat initialized. Type 'quit' to exit.")
    print("-" * 50)

    while True:
        user_input = input("\nYou: ").strip()

        if user_input.lower() == 'quit':
            print("\nEnding conversation. Goodbye!")
            break

        # Keep the running history so each turn sees the full conversation.
        messages.append({"role": "user", "content": user_input})

        print("\nAssistant: ", end="")
        response = generate_response(model, tokenizer, messages)
        print(response)

        messages.append({"role": "assistant", "content": response})


if __name__ == "__main__":
    main()
```