tomrance committed
Commit 45df6c8 · verified · 1 Parent(s): a165cec

Update README.md

Files changed (1): README.md (+46 −0)
README.md CHANGED
@@ -88,4 +88,50 @@ vllm serve "th-nuernberg/DeepHermes-3-Mistral-24B-Preview-FP8-Dynamic" \
   --trust-remote-code \
   --enable-reasoning --reasoning-parser deepseek_r1 \
   --max-model-len 32768 --quantization compressed-tensors
+```
+
+## Creation
+
+This model was created with [llm-compressor](https://github.com/vllm-project/llm-compressor) by running the code snippet below.
+
+
+```python
+import argparse
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from llmcompressor.modifiers.quantization import QuantizationModifier
+from llmcompressor.transformers import oneshot
+import os
+
+def main():
+    parser = argparse.ArgumentParser(description='Quantize a transformer model to FP8')
+    parser.add_argument('--model_id', type=str, required=True,
+                        help='The model ID from HuggingFace (e.g., "meta-llama/Meta-Llama-3-8B-Instruct")')
+    parser.add_argument('--save_path', type=str, default='.',
+                        help='Custom path to save the quantized model. If not provided, will use model_name-FP8-dynamic')
+    args = parser.parse_args()
+
+    # Load the model and tokenizer
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model_id, device_map="auto", torch_dtype="auto", trust_remote_code=True,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(args.model_id)
+
+    # Configure the quantization algorithm and scheme
+    recipe = QuantizationModifier(
+        targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
+    )
+
+    # Apply quantization
+    oneshot(model=model, recipe=recipe)
+
+    save_path = os.path.join(args.save_path, args.model_id.split("/")[-1] + "-FP8-dynamic")
+    os.makedirs(save_path, exist_ok=True)
+
+    # Save to disk in compressed-tensors format
+    model.save_pretrained(save_path)
+    tokenizer.save_pretrained(save_path)
+    print(f"Model and tokenizer saved to: {save_path}")
+
+if __name__ == "__main__":
+    main()
 ```
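
For a given source checkpoint, the script above would be invoked along the lines of `python quantize_fp8.py --model_id <source-model-id>` (the filename `quantize_fp8.py` is hypothetical; the commit does not name the file). The resulting directory can then be served with the `vllm serve` command shown at the top of the hunk, or loaded directly through vLLM's offline Python API. A minimal sketch of the latter, mirroring the serve flags above; the prompt and sampling settings are illustrative:

```python
# Offline-inference sketch (illustrative): loads the FP8-dynamic checkpoint
# with the same options used by the `vllm serve` command in the README.
from vllm import LLM, SamplingParams

llm = LLM(
    model="th-nuernberg/DeepHermes-3-Mistral-24B-Preview-FP8-Dynamic",
    trust_remote_code=True,
    max_model_len=32768,
    quantization="compressed-tensors",
)

params = SamplingParams(temperature=0.6, max_tokens=256)
outputs = llm.generate(
    ["Summarize what FP8 dynamic quantization changes at inference time."],
    params,
)
print(outputs[0].outputs[0].text)
```

Note that `--enable-reasoning --reasoning-parser deepseek_r1` configure response parsing in the OpenAI-compatible server, so they have no equivalent in the offline API.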