---
base_model:
- Dream-org/Dream-v0-Instruct-7B
---
# System Requirements
A GPU with more than 10 GB of VRAM is recommended (a test round consumed over 9 GB of VRAM).
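As a quick sanity check before loading the model, you can query the available GPU memory with PyTorch. This is only an illustrative snippet and assumes a single CUDA device:

```python
import torch

if torch.cuda.is_available():
    # Total memory of the first CUDA device, reported in GB.
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU 0: {torch.cuda.get_device_name(0)}, {total_gb:.1f} GB VRAM")
else:
    print("No CUDA device detected.")
```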
# Prerequisites
- transformers==4.46.2
- torch==2.5.1
- bitsandbytes
- accelerate
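A minimal sketch to confirm the pinned versions are installed (assumes Python 3.8+ for `importlib.metadata`; the package list mirrors the prerequisites above):

```python
from importlib.metadata import version, PackageNotFoundError

# transformers and torch are pinned above; bitsandbytes and accelerate only need to be present.
pinned = {"transformers": "4.46.2", "torch": "2.5.1", "bitsandbytes": None, "accelerate": None}

for pkg, expected in pinned.items():
    try:
        installed = version(pkg)
    except PackageNotFoundError:
        print(f"{pkg}: NOT INSTALLED")
        continue
    note = "" if expected is None or installed == expected else f" (expected {expected})"
    print(f"{pkg}: {installed}{note}")
```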
# Single Chat
Here's how to load and use the quantized model:
```python
from transformers import AutoModel, AutoTokenizer

model_path = "Rainnighttram/Dream-v0-Instruct-7B-4bit"

# Load the tokenizer and the 4-bit quantized model (custom modeling code is required).
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_path,
    device_map="auto",
    trust_remote_code=True,
)
model = model.to("cuda").eval()

messages = [
    {"role": "user", "content": "Please make comparisons between UHF and LF RFID."}
]

# Build the chat prompt and move the tensors to the GPU.
inputs = tokenizer.apply_chat_template(
    messages, return_tensors="pt", return_dict=True, add_generation_prompt=True
)
input_ids = inputs.input_ids.to(device="cuda")
attention_mask = inputs.attention_mask.to(device="cuda")

# Diffusion-based generation: `steps` controls the number of denoising iterations.
output = model.diffusion_generate(
    input_ids,
    attention_mask=attention_mask,
    max_new_tokens=512,
    output_history=True,
    return_dict_in_generate=True,
    steps=512,
    temperature=0.2,
    top_p=0.95,
    alg="entropy",
    alg_temp=0.,
)

# Strip the prompt tokens and cut the decoded text at the first EOS token.
generations = [
    tokenizer.decode(g[len(p):].tolist())
    for p, g in zip(input_ids, output.sequences)
]
print(generations[0].split(tokenizer.eos_token)[0])
```
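The settings above use one denoising step per new token (`steps=512` for `max_new_tokens=512`). If generation is too slow or memory-constrained on your hardware, both can be lowered at some cost in output quality; the values below are assumptions for illustration, not recommendations from the model authors:

```python
# Illustrative only: a smaller generation budget with the same API and sampling settings.
output = model.diffusion_generate(
    input_ids,
    attention_mask=attention_mask,
    max_new_tokens=256,
    output_history=True,
    return_dict_in_generate=True,
    steps=256,
    temperature=0.2,
    top_p=0.95,
    alg="entropy",
    alg_temp=0.,
)
```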
# Multi-round Chat
```python
from transformers import AutoModel, AutoTokenizer


def initialize_model():
    # Load the tokenizer and the 4-bit quantized model once at startup.
    model_path = "Rainnighttram/Dream-v0-Instruct-7B-4bit"
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModel.from_pretrained(
        model_path,
        device_map="auto",
        trust_remote_code=True,
    )
    model = model.to("cuda").eval()
    return model, tokenizer


def generate_response(model, tokenizer, messages):
    # Build the chat prompt from the full conversation history.
    inputs = tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        return_dict=True,
        add_generation_prompt=True,
    )
    input_ids = inputs.input_ids.to(device="cuda")
    attention_mask = inputs.attention_mask.to(device="cuda")

    # Diffusion-based generation with the same settings as the single-chat example.
    output = model.diffusion_generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=512,
        output_history=True,
        return_dict_in_generate=True,
        steps=512,
        temperature=0.2,
        top_p=0.95,
        alg="entropy",
        alg_temp=0.,
    )

    # Strip the prompt tokens and cut the decoded text at the first EOS token.
    generations = [
        tokenizer.decode(g[len(p):].tolist())
        for p, g in zip(input_ids, output.sequences)
    ]
    return generations[0].split(tokenizer.eos_token)[0]


def main():
    print("Initializing model and tokenizer...")
    model, tokenizer = initialize_model()
    messages = []
    print("Chat initialized. Type 'quit' to exit.")
    print("-" * 50)

    while True:
        user_input = input("\nYou: ").strip()
        if user_input.lower() == 'quit':
            print("\nEnding conversation. Goodbye!")
            break

        # Append the user turn, generate a reply, and keep it in the history.
        messages.append({"role": "user", "content": user_input})
        print("\nAssistant: ", end="")
        response = generate_response(model, tokenizer, messages)
        print(response)
        messages.append({"role": "assistant", "content": response})


if __name__ == "__main__":
    main()
```
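One caveat with this loop: `messages` grows without bound, so the prompt gets longer on every turn. If you run into context or latency limits, a simple mitigation is to keep only the most recent turns when building the prompt. The helper below is hypothetical and not part of the model's API:

```python
# Hypothetical helper: keep only the last `max_turns` user/assistant pairs.
def trim_history(messages, max_turns=8):
    # Each completed turn contributes two entries (user + assistant), so slice from the end.
    return messages[-2 * max_turns:]

# Example usage inside main(), when calling the generator:
# response = generate_response(model, tokenizer, trim_history(messages))
```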