```
vllm serve "th-nuernberg/DeepHermes-3-Mistral-24B-Preview-FP8-Dynamic" \
  --trust-remote-code \
  --enable-reasoning --reasoning-parser deepseek_r1 \
  --max-model-len 32768 --quantization compressed-tensors
```
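Once the server is up, a quick smoke test can go through vLLM's OpenAI-compatible endpoint. The sketch below assumes the default address `http://localhost:8000`; with the reasoning parser enabled, the response message should expose the model's chain of thought in a separate `reasoning_content` field alongside the final `content`.

```bash
# Minimal request against the OpenAI-compatible API (default host/port assumed)
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "th-nuernberg/DeepHermes-3-Mistral-24B-Preview-FP8-Dynamic",
    "messages": [{"role": "user", "content": "Briefly: why does FP8 quantization save memory?"}]
  }'
```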

## Creation

This model was created with [llm-compressor](https://github.com/vllm-project/llm-compressor) by running the code snippet below.

```python
import argparse
import os

from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot


def main():
    parser = argparse.ArgumentParser(description="Quantize a transformer model to FP8")
    parser.add_argument("--model_id", type=str, required=True,
                        help='The model ID from HuggingFace (e.g. "meta-llama/Meta-Llama-3-8B-Instruct")')
    parser.add_argument("--save_path", type=str, default=".",
                        help="Directory to save the quantized model in; the output folder is named <model_name>-FP8-dynamic")
    args = parser.parse_args()

    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        args.model_id, device_map="auto", torch_dtype="auto", trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(args.model_id)

    # Configure the quantization algorithm and scheme: FP8 weights with
    # dynamic per-token FP8 activations on all Linear layers, leaving
    # lm_head unquantized
    recipe = QuantizationModifier(
        targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
    )

    # Apply quantization (FP8_DYNAMIC is data-free, so no calibration set is needed)
    oneshot(model=model, recipe=recipe)

    # Derive the output directory from the model name, e.g. org/Model -> Model-FP8-dynamic
    save_path = os.path.join(args.save_path, args.model_id.split("/")[-1] + "-FP8-dynamic")
    os.makedirs(save_path, exist_ok=True)

    # Save to disk in compressed-tensors format
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"Model and tokenizer saved to: {save_path}")


if __name__ == "__main__":
    main()
```
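Saved locally (the filename `quantize_fp8.py` below is only an assumption), the script can be pointed at any Hugging Face causal LM. Presumably this repository was produced from the BF16 DeepHermes-3-Mistral-24B-Preview checkpoint, along these lines:

```bash
# Filename and source checkpoint are assumptions, not stated in this README.
python quantize_fp8.py \
  --model_id NousResearch/DeepHermes-3-Mistral-24B-Preview \
  --save_path .
```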