Run
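DeepSparse runs this model on CPU. Install a DeepSparse build with LLM support (at the time of release, pip install deepsparse-nightly[llm]), then generate directly from the Hub stub: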

from deepsparse import TextGeneration

# pull the sparsified model from the Hugging Face Hub and compile it for CPU inference
model = TextGeneration(model="hf:mgoin/llama-2-7b-gsm8k-pruned60-quant-ds")

prompt = "James decides to run 3 sprints 3 times a week. He runs 60 meters each sprint. How many total meters does he run a week?"
print(model(prompt, max_new_tokens=100).generations[0].text)
### First find the total number of meters James runs in one sprint: 60 meters/sprint * 3 sprints = <<60*3=180>>180 meters
Then multiply that number by the number of sprints per week to find the total number of meters he runs each week: 180 meters/sprint * 3 sprints/week = <<180*3=540>>540 meters
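The pipeline compiles the model once and can be reused across prompts without re-initializing. A minimal sketch, with illustrative GSM8K-style questions:

# reuse the compiled pipeline for several prompts; the questions are illustrative
prompts = [
    "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?",
    "Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?",
]
for p in prompts:
    print(model(p, max_new_tokens=100).generations[0].text)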

Quantize and Export
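This model was produced with SparseML's one-shot OBCQ (SparseGPT) pathway. The commands below apply the quantization recipe to an already-pruned checkpoint, export the result to ONNX, and inject KV-cache support into the graph: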

# clone SparseML and install it with transformers support
git clone https://github.com/neuralmagic/sparseml
pip install -e "sparseml[transformers]"

# one-shot quantize the already-pruned checkpoint with the recipe below,
# calibrating on the open_platypus dataset
python sparseml/src/sparseml/transformers/sparsification/obcq/obcq.py /path/to/llama-2-7b_pruned60-gsm8k open_platypus --recipe llama-gsm8k-60p-skip5.yaml --save True

# export the quantized model to ONNX for DeepSparse
python sparseml/src/sparseml/transformers/sparsification/obcq/export.py --task text-generation --model_path obcq_deployment --sequence_length 512

# inject KV-cache support into the exported ONNX graph
cp deployment/model.onnx deployment/model-orig.onnx
python onnx_kv_inject.py --input-file deployment/model-orig.onnx --output-file deployment/model.onnx
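To sanity-check the export, point DeepSparse at the local deployment directory instead of a Hub stub. A minimal sketch, assuming the export step wrote the ONNX model plus the tokenizer and config files into deployment/:

from deepsparse import TextGeneration

# load the locally exported model rather than downloading from the Hub
model = TextGeneration(model="deployment")
print(model("James runs 20 meters 3 times. How many meters does he run in total?", max_new_tokens=64).generations[0].text)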

llama-gsm8k-60p-skip5.yaml

test_stage:
  obcq_modifiers:
    SparseGPTModifier:
      sparsity: 0.0
      block_size: 128
      sequential_update: False
      quantize:
        QuantizationModifier:
          ignore:
            - LlamaRotaryEmbedding
            - LlamaRMSNorm
            - SiLUActivation
            - model.layers.1.mlp.down_proj
            - model.layers.30.mlp.down_proj
            - model.layers.31.mlp.down_proj
            - model.layers.18.mlp.down_proj
            - model.layers.29.mlp.down_proj
          post_oneshot_calibration: True
          scheme_overrides:
            Embedding:
              input_activations: null
              weights:
                num_bits: 8
                symmetric: False
      percdamp: 0.01
      prunen: 0
      prunem: 0
      targets: [
        "model.layers.0",
        "model.layers.1",
        "model.layers.2",
        "model.layers.3",
        "model.layers.4",
        "model.layers.5",
        "model.layers.6",
        "model.layers.7",
        "model.layers.8",
        "model.layers.9",
        "model.layers.10",
        "model.layers.11",
        "model.layers.12",
        "model.layers.13",
        "model.layers.14",
        "model.layers.15",
        "model.layers.16",
        "model.layers.17",
        "model.layers.18",
        "model.layers.19",
        "model.layers.20",
        "model.layers.21",
        "model.layers.22",
        "model.layers.23",
        "model.layers.24",
        "model.layers.25",
        "model.layers.26",
        "model.layers.27",
        "model.layers.28",
        "model.layers.29",
        "model.layers.30",
        "model.layers.31",
      ]
      target_ids: ["attention_mask", "position_ids"]  
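A few notes on the recipe: sparsity: 0.0 means the SparseGPT pass adds no further pruning (the input checkpoint is already 60% sparse) and is used here purely for one-shot INT8 quantization with post-oneshot calibration. The ignore list keeps the rotary embeddings, RMSNorm, and SiLU ops unquantized, along with five sensitive mlp.down_proj layers (1, 18, 29, 30, 31), presumably the "skip5" in the recipe name; embeddings receive weight-only, asymmetric 8-bit quantization. The repetitive targets list can also be generated rather than written by hand; a minimal sketch (this helper is hypothetical, not part of SparseML):

# hypothetical helper to emit the recipe's per-layer lists programmatically
layers = [f"model.layers.{i}" for i in range(32)]            # the `targets` entries
skip = [1, 18, 29, 30, 31]                                   # the "skip5" down_proj layers
ignored = [f"model.layers.{i}.mlp.down_proj" for i in skip]  # appended to `ignore`
print("\n".join(layers + ignored))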