creation

from transformers import AutoModelForCausalLM
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from accelerate import infer_auto_device_map, init_empty_weights

# Source checkpoint on the Hugging Face Hub, and the output directory for the
# quantized model (evaluates to "Qwen3-32B.w4a16").
model_id = "Qwen/Qwen3-32B"
model_out = model_id.split("/")[1] + ".w4a16"

# Build a throwaway copy of the model under init_empty_weights() purely to
# derive the device-map keys; it is deleted as soon as the map is computed.
with init_empty_weights():
    dummy_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="bfloat16")
    device_map = infer_auto_device_map(
        dummy_model,
        no_split_module_classes=dummy_model._no_split_modules,
    )
    del dummy_model

# Discard the inferred placements and pin every entry to the CPU. Only the
# values change, so mutating the mapping while iterating its keys is safe.
for module_name in device_map:
    device_map[module_name] = "cpu"

# Load the real weights in bfloat16 using the all-CPU device map.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device_map,
    torch_dtype="bfloat16",
)

# W4A16 scheme applied to every "Linear" target, with "lm_head" excluded.
# NOTE(review): dampening_frac looks like a GPTQModifier option -- confirm
# that QuantizationModifier accepts (or at least tolerates) it.
recipe = QuantizationModifier(
    targets="Linear",
    scheme="W4A16",
    ignore=["lm_head"],
    dampening_frac=0.1,
)

# Apply the recipe in one shot and write the result to `model_out`.
oneshot(model=model, recipe=recipe, output_dir=model_out)
Downloads last month: 137
Safetensors
Model size: 5.7B params
Tensor type: I64 · I32 · BF16
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Model tree for nytopop/Qwen3-32B.w4a16

Base model

Qwen/Qwen3-32B
Quantized (49): this model

Collection including nytopop/Qwen3-32B.w4a16