YAML Metadata Warning: empty or missing YAML metadata in repo card (https://huggingface.co/docs/hub/model-cards#model-card-metadata)
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.obcq import SparseGPTModifier
from llmcompressor.transformers import oneshot
# Checkpoint to compress. A tiny test model is used here; a full-size
# checkpoint such as "meta-llama/Meta-Llama-3-8B-Instruct" can be swapped in.
MODEL_ID = "nm-testing/llama2.c-stories15M"

# Load the weights (device placement and dtype are left to "auto") and the
# tokenizer that belongs to the same checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Dataset used for calibration.
DATASET_ID = "ultrachat_200k"

# Calibration budget: 512 samples is a reasonable starting point, and using
# more samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048
# Build the compression recipe. Two modifiers are applied in list order:
#   * GPTQ quantizes the attention projection weights of decoder layers 0-1
#     to 4-bit symmetric, static integers using the per-tensor strategy
#     (QuantizationStrategy.TENSOR -- no group size is configured).
#   * SparseGPT prunes the attention projections of decoder layers 1-2 to
#     50% sparsity.
# Layer 1 appears in both target lists, so it is both quantized and pruned.
from compressed_tensors.quantization import (
    QuantizationArgs,
    QuantizationStrategy,
    QuantizationType,
)

# The four attention projection sub-modules targeted in every selected layer.
ATTENTION_PROJECTIONS = ("q_proj", "k_proj", "v_proj", "o_proj")

# "re:"-prefixed targets are treated as regular expressions over module names.
GPTQ_TARGETS = [
    f"re:model.layers.{layer}.self_attn.{proj}"
    for layer in (0, 1)
    for proj in ATTENTION_PROJECTIONS
]
SPARSEGPT_TARGETS = [
    f"re:model.layers.{layer}.self_attn.{proj}"
    for layer in (1, 2)
    for proj in ATTENTION_PROJECTIONS
]

recipe = [
    GPTQModifier(
        # `ignore` is a modifier-level option; QuantizationArgs has no such
        # field, so it must not be passed inside `weights`.
        ignore=["lm_head"],
        config_groups=dict(
            group_0=dict(
                targets=GPTQ_TARGETS,
                weights=QuantizationArgs(
                    num_bits=4,
                    type=QuantizationType.INT,
                    strategy=QuantizationStrategy.TENSOR,
                    symmetric=True,
                    dynamic=False,
                ),
            ),
        ),
    ),
    SparseGPTModifier(
        sparsity=0.5,
        # mask_structure is left at the modifier's default; pass
        # mask_structure="2:4" here to request a 2:4 mask instead.
        sequential_update=True,
        targets=SPARSEGPT_TARGETS,
    ),
]
# Apply the recipe in a single calibration pass. The stray breakpoint() that
# preceded this call was removed: it dropped every run into the debugger.
oneshot(
    model=model,
    dataset=DATASET_ID,
    # Slice the split by the number of calibration samples; the token limit
    # (MAX_SEQUENCE_LENGTH) is a separate setting passed as max_seq_length.
    splits={"calibration": f"train_sft[:{NUM_CALIBRATION_SAMPLES}]"},
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Confirm generations of the compressed model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
# Move the prompt to wherever the model was placed (device_map="auto" may
# choose CPU), rather than assuming a CUDA device is present.
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
    model.device
)
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save the compressed model and its tokenizer to disk.
# rsplit keeps this working for model ids that have no "namespace/" prefix.
# NOTE(review): the "G128" suffix suggests group-size-128 quantization, but
# the recipe in this file uses QuantizationStrategy.TENSOR -- confirm the
# intended directory name.
SAVE_DIR = MODEL_ID.rsplit("/", 1)[-1] + "-W4A16-G128-unc"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
- Downloads last month
- 474
Inference Providers
NEW
This model isn't deployed by any Inference Provider.
🙋 Ask for provider support