# build AutoAWQ from source (latest as of 2025-05-01)
git clone https://github.com/casper-hansen/AutoAWQ.git
cd AutoAWQ        # go into the AutoAWQ folder
pip install -e .
pip install --upgrade transformers
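# quick sanity check that the editable install is importable
# (this only exercises the import used in the snippets below)
python -c "from awq import AutoAWQForCausalLM; print('AutoAWQ import OK')"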
## FOR STREAMING
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, TextStreamer
from awq.utils.utils import get_best_device
device = get_best_device()
quant_path = "Siddharth63/Qwen3-4B-base-AWQ" # path or HF repo for the AWQ checkpoint
# ---------- load model & tokenizer ----------
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
# ---------- tokenise & generate ----------
input_ids = tokenizer("Atherosclerosis is", return_tensors="pt"
).input_ids.to(device)
_ = model.generate(
    input_ids,
    streamer=streamer,
    max_new_tokens=512,   # cap on generated tokens, not the full context window
    use_cache=True,
)
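## OPTIONAL: PROGRAMMATIC STREAMING
# TextStreamer above prints straight to stdout. If the chunks need to be
# consumed in code instead (e.g. forwarded over a socket), transformers also
# provides TextIteratorStreamer, which yields decoded text from a background
# generation thread. A minimal sketch, reusing the model, tokenizer, and
# input_ids loaded above:
from threading import Thread
from transformers import TextIteratorStreamer

iter_streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
gen_kwargs = dict(input_ids=input_ids, streamer=iter_streamer,
                  max_new_tokens=512, use_cache=True)
Thread(target=model.generate, kwargs=gen_kwargs).start()
for chunk in iter_streamer:   # text pieces arrive as they are generated
    print(chunk, end="", flush=True)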
## FOR NON-STREAMING
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
from awq.utils.utils import get_best_device
device = get_best_device()
quant_path = "Siddharth63/Qwen3-4B-base-AWQ" # path or HF repo for the AWQ checkpoint
# ---------- load model & tokenizer ----------
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)
input_ids = tokenizer(
    "Atherosclerosis is",
    return_tensors="pt",
).input_ids.to(device)
# ---------- generate (blocking) ----------
output_ids = model.generate(
    input_ids,
    max_new_tokens=100,   # or tune max_length / temperature / etc.
    use_cache=True,       # default; speeds up incremental decoding
)
response = tokenizer.decode(
    output_ids[0],
    skip_special_tokens=True,   # drop special tokens such as <|im_start|>
)
print("\n=== Model reply ===\n", response)