Commit 3d0505d (verified) by Siddharth63 · 1 Parent(s): 42f9512

Update README.md

Files changed (1)
  1. README.md +67 -3
README.md CHANGED
---
license: apache-2.0
---

Install AutoAWQ from source (the editable install must be run inside the cloned AutoAWQ folder), then upgrade Transformers:

```bash
git clone https://github.com/casper-hansen/AutoAWQ.git   # latest source as of 2025-05-01
cd AutoAWQ
pip install -e .
pip install --upgrade transformers
```
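For reference, AutoAWQ can also produce a checkpoint like this one. The sketch below is illustrative only: the base-model ID and the `quant_config` values (4-bit weights, group size 128, GEMM kernels) are assumptions, not the exact recipe used for this repository.

```python
# Illustrative sketch: quantizing a base model with AutoAWQ.
# The base-model ID and quant_config values are assumptions.
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

base_model = "Qwen/Qwen3-8B-Base"   # assumed source checkpoint
out_path = "Qwen3-8B-base-AWQ"      # local output folder
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

model = AutoAWQForCausalLM.from_pretrained(base_model)
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

model.quantize(tokenizer, quant_config=quant_config)  # AWQ calibration + weight quantization
model.save_quantized(out_path)
tokenizer.save_pretrained(out_path)
```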
Streaming generation (tokens are printed as they are produced):

```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, TextStreamer
from awq.utils.utils import get_best_device

device = get_best_device()
quant_path = "Siddharth63/Qwen3-8B-base-AWQ"  # path or HF repo for the AWQ checkpoint

# ---------- load model & tokenizer ----------
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# ---------- tokenise & generate ----------
input_ids = tokenizer("Atherosclerosis is", return_tensors="pt").input_ids.to(device)

_ = model.generate(
    input_ids,
    streamer=streamer,
    max_new_tokens=512,   # cap on newly generated tokens
    use_cache=True,
)
```
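If the streamed text needs to be consumed programmatically rather than printed to stdout, Transformers also provides `TextIteratorStreamer`. This is a hedged sketch reusing the `model`, `tokenizer`, and `input_ids` from the streaming example above; it is not part of the original instructions for this checkpoint.

```python
# Sketch: iterate over streamed text chunks instead of printing them directly.
from threading import Thread
from transformers import TextIteratorStreamer

iter_streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
thread = Thread(
    target=model.generate,
    kwargs=dict(input_ids=input_ids, streamer=iter_streamer, max_new_tokens=512),
)
thread.start()
for chunk in iter_streamer:          # yields decoded text pieces as they arrive
    print(chunk, end="", flush=True)
thread.join()
```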
Non-streaming (blocking) generation:

```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
from awq.utils.utils import get_best_device

device = get_best_device()
quant_path = "Siddharth63/Qwen3-8B-base-AWQ"  # path or HF repo for the AWQ checkpoint

# ---------- load model & tokenizer ----------
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)

input_ids = tokenizer(
    "Atherosclerosis is",
    return_tensors="pt",
).input_ids.to(device)

# ---------- generate (blocking) ----------
output_ids = model.generate(
    input_ids,
    max_new_tokens=100,   # or max_length / temperature / etc.
    use_cache=True,       # default; speeds up incremental decoding
)

response = tokenizer.decode(
    output_ids[0],
    skip_special_tokens=True,  # drop any special tokens from the output
)

print("\n=== Model reply ===\n", response)
```
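The `max_new_tokens` comment above hints at other generation knobs. As a small, hedged extension (the sampling values below are placeholders, not recommendations from this repository), the same `model` and `tokenizer` can be used with sampling enabled:

```python
# Sketch: sampling-based decoding with the model/tokenizer loaded above.
# temperature / top_p values are placeholder assumptions.
output_ids = model.generate(
    input_ids,
    max_new_tokens=100,
    do_sample=True,    # sample instead of greedy decoding
    temperature=0.7,
    top_p=0.9,
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```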