---
license: apache-2.0
---

Install AutoAWQ from source along with a recent `transformers`:

```bash
git clone https://github.com/casper-hansen/AutoAWQ.git   # latest source as of 2025-05-01
cd AutoAWQ                                               # go into the AutoAWQ folder
pip install -e .
pip install --upgrade transformers
```

## Streaming

```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, TextStreamer
from awq.utils.utils import get_best_device

device = get_best_device()
quant_path = "Siddharth63/Qwen3-14B-base-AWQ"  # local path or HF repo for the AWQ checkpoint

# ---------- load model & tokenizer ----------
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# ---------- tokenise & generate ----------
input_ids = tokenizer("Atherosclerosis is", return_tensors="pt").input_ids.to(device)

_ = model.generate(
    input_ids,
    streamer=streamer,
    max_new_tokens=512,  # cap on newly generated tokens
    use_cache=True,
)
```

## Non-streaming

```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
from awq.utils.utils import get_best_device

device = get_best_device()
quant_path = "Siddharth63/Qwen3-14B-base-AWQ"  # local path or HF repo for the AWQ checkpoint

# ---------- load model & tokenizer ----------
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)

input_ids = tokenizer("Atherosclerosis is", return_tensors="pt").input_ids.to(device)

# ---------- generate (blocking) ----------
output_ids = model.generate(
    input_ids,
    max_new_tokens=100,  # or max_length / temperature / etc.
    use_cache=True,      # default; speeds up incremental decoding
)

response = tokenizer.decode(
    output_ids[0],
    skip_special_tokens=True,  # drop special tokens such as <|im_start|>
)
print("\n=== Model reply ===\n", response)
```
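The non-streaming call above decodes greedily. Since `model.generate` forwards the usual transformers generation arguments, sampled decoding is a small change. A minimal sketch, assuming `model`, `tokenizer`, and `input_ids` are set up as in the snippets above; the temperature and top-p values are illustrative, not tuned for this checkpoint:

```python
# Sampled decoding; the values below are illustrative assumptions, not tuned defaults.
output_ids = model.generate(
    input_ids,
    do_sample=True,      # sample instead of greedy decoding
    temperature=0.7,
    top_p=0.9,
    max_new_tokens=100,
    use_cache=True,
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```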
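Both snippets generate for a single prompt. To batch several prompts, decoder-only models need left padding so the prompts align at the end of the sequence. A hedged sketch, assuming the same `model`/`tokenizer` as above and untested with this exact checkpoint; if the fused kernels complain about the batch size, reload with `fuse_layers=False`:

```python
# Batched generation sketch (assumption: not verified against this checkpoint).
tokenizer.padding_side = "left"          # decoder-only models pad on the left
if tokenizer.pad_token is None:          # make sure a pad token exists
    tokenizer.pad_token = tokenizer.eos_token

prompts = ["Atherosclerosis is", "Hypertension is"]
batch = tokenizer(prompts, return_tensors="pt", padding=True).to(device)

output_ids = model.generate(
    batch.input_ids,
    attention_mask=batch.attention_mask,  # mask out the padding tokens
    max_new_tokens=100,
    use_cache=True,
)
for ids in output_ids:
    print(tokenizer.decode(ids, skip_special_tokens=True))
```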