Siddharth63 committed
Commit 1ff9530 · verified · 1 Parent(s): 9ef794d

Update README.md

Files changed (1)
  1. README.md +66 -3
README.md CHANGED
@@ -1,3 +1,66 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: apache-2.0
+ ---
+
+ Usage notes for `Siddharth63/Qwen3-4B-base-AWQ`, an AWQ-quantized Qwen3-4B base checkpoint: install AutoAWQ from source, then run streaming or non-streaming generation.
+
+ ## Installation
+
+ ```bash
+ git clone https://github.com/casper-hansen/AutoAWQ.git   # latest source as of 2025-05-01
+ cd AutoAWQ
+ pip install -e .                                          # run from inside the AutoAWQ folder
+ pip install --upgrade transformers
+ ```
+
+ ## Streaming generation
+
+ ```python
+ from awq import AutoAWQForCausalLM
+ from transformers import AutoTokenizer, TextStreamer
+ from awq.utils.utils import get_best_device
+
+ device = get_best_device()
+ quant_path = "Siddharth63/Qwen3-4B-base-AWQ"  # local path or HF repo for the AWQ checkpoint
+
+ # ---------- load model & tokenizer ----------
+ model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
+ tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)
+ streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+ # ---------- tokenize & generate ----------
+ input_ids = tokenizer("Atherosclerosis is", return_tensors="pt").input_ids.to(device)
+
+ _ = model.generate(
+     input_ids,
+     streamer=streamer,
+     max_new_tokens=512,   # generation budget; raise or lower as needed
+     use_cache=True,
+ )
+ ```
+
+ ## Non-streaming generation
+
+ ```python
+ from awq import AutoAWQForCausalLM
+ from transformers import AutoTokenizer
+ from awq.utils.utils import get_best_device
+
+ device = get_best_device()
+ quant_path = "Siddharth63/Qwen3-4B-base-AWQ"  # local path or HF repo for the AWQ checkpoint
+
+ # ---------- load model & tokenizer ----------
+ model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
+ tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)
+
+ input_ids = tokenizer("Atherosclerosis is", return_tensors="pt").input_ids.to(device)
+
+ # ---------- generate (blocking) ----------
+ output_ids = model.generate(
+     input_ids,
+     max_new_tokens=100,   # or max_length / temperature / etc.
+     use_cache=True,       # default; speeds up incremental decoding
+ )
+
+ response = tokenizer.decode(
+     output_ids[0],
+     skip_special_tokens=True,  # drop special tokens from the output
+ )
+
+ print("\n=== Model reply ===\n", response)
+ ```