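# NOTE(review): stray identifier found at the top of this script, presumably
# the name the compressed model is published under -- confirm:
#   nm-testing/llama2.c-stories15M-ultrachat-mixed-compressed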

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.obcq import SparseGPTModifier
from llmcompressor.transformers import oneshot

# Checkpoint to compress. "meta-llama/Meta-Llama-3-8B-Instruct" was the
# previously used (now disabled) alternative.
MODEL_ID = "nm-testing/llama2.c-stories15M"

# Device placement and dtype are both left to transformers ("auto").
load_kwargs = {"device_map": "auto", "torch_dtype": "auto"}
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Calibration data source.
DATASET_ID = "ultrachat_200k"

# Calibration budget, forwarded to `oneshot` below.
# 512 samples is a good starting point; more samples can improve accuracy.
MAX_SEQUENCE_LENGTH = 2048
NUM_CALIBRATION_SAMPLES = 512

# Configure the compression algorithms to run.
#   * GPTQ: quantize the attention projection weights of layers 0 and 1 to
#     static, symmetric 4-bit integers using the per-tensor strategy
#   * SparseGPT: prune the attention projections of layers 1 and 2 to 50%
#     sparsity
# Layer 1 is intentionally targeted by both modifiers.
from compressed_tensors.quantization import (
    QuantizationArgs,
    QuantizationStrategy,
    QuantizationType,
)


def _attn_proj_targets(layer_indices):
    """Return "re:"-prefixed targets for the q/k/v/o projections of each layer.

    :param layer_indices: iterable of decoder layer indices
    :return: list of target strings, ordered by layer and then q, k, v, o
    """
    return [
        f"re:model.layers.{layer_idx}.self_attn.{proj}_proj"
        for layer_idx in layer_indices
        for proj in ("q", "k", "v", "o")
    ]


recipe = [
    GPTQModifier(
        # `ignore` is a modifier-level option. It was previously passed to
        # QuantizationArgs, which describes only the numeric format.
        ignore=["lm_head"],
        config_groups=dict(
            group_0=dict(
                targets=_attn_proj_targets((0, 1)),
                weights=QuantizationArgs(
                    num_bits=4,
                    type=QuantizationType.INT,
                    strategy=QuantizationStrategy.TENSOR,
                    symmetric=True,
                    dynamic=False,
                ),
            ),
        ),
    ),
    SparseGPTModifier(
        sparsity=0.5,
        # mask_structure="2:4",  # left disabled in this example
        sequential_update=True,
        targets=_attn_proj_targets((1, 2)),
    ),
]
# Apply algorithms.
# The calibration split is sliced by the number of samples requested below;
# MAX_SEQUENCE_LENGTH limits sequence length and is not a row count.
oneshot(
    model=model,
    dataset=DATASET_ID,
    splits={"calibration": f"train_sft[:{NUM_CALIBRATION_SAMPLES}]"},
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
# Move the prompt to the device the model was loaded onto instead of assuming
# CUDA: the model is loaded with device_map="auto", so a GPU is not guaranteed.
prompt = tokenizer("Hello my name is", return_tensors="pt")
input_ids = prompt.input_ids.to(model.device)
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Write the compressed model and its tokenizer to a directory named after the
# checkpoint (the part of MODEL_ID after the first "/").
# NOTE(review): the "-W4A16-G128-unc" suffix mentions group size 128, while the
# recipe above uses QuantizationStrategy.TENSOR, and "unc" sits oddly next to
# save_compressed=True -- confirm the intended directory name.
model_name = MODEL_ID.split("/")[1]
SAVE_DIR = f"{model_name}-W4A16-G128-unc"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)