Llama-CoreML
This repo contains Llama-3.2-1B-Instruct converted to Core ML.
Optimizations
- Fused SDPA
- KV cache + flexible inputs
- (optional) Block-wise int4 weight quantization
- Skip LM head during prefill
Example Usage
import CoreML
import Tokenizers
// Example: greedy autoregressive generation with the converted Core ML model.
let prompt = "What's the capital of Ireland?"
print("Loading…")
// Load the compiled Core ML model and the matching Hugging Face tokenizer.
let model = try await Llama_3_2_1B_Instruct.load()
let tokenizer = try await AutoTokenizer.from(pretrained: "meta-llama/Llama-3.2-1B-Instruct")
// Model state object passed to each prediction call (holds the KV cache).
let kvCache = model.makeState()
print("Tokenizing…")
// Encode the prompt through the chat template into input token IDs.
var tokens = try tokenizer.applyChatTemplate(messages: [[
"role": "user", "content": prompt
]])
print("Predicting…")
var prefill = true
// Generate one token per iteration until the end-of-sequence token appears.
while tokens.last != tokenizer.eosTokenId {
let inputIDs: MLShapedArray<Int32>
var causalMask: MLShapedArray<Float16>
if prefill {
// Prefill pass: feed the entire prompt at once with a [1, 1, n, n] mask.
inputIDs = MLShapedArray(scalars: tokens.map(Int32.init), shape: [1, tokens.count])
causalMask = MLShapedArray<Float16>(repeating: 0, shape: [1, 1, tokens.count, tokens.count])
// Set the strict upper triangle to -inf so position i cannot attend to j > i.
for i in 0..<tokens.count {
for j in (i + 1)..<tokens.count {
causalMask[0][0][i][scalarAt: j] = -.infinity
}
}
prefill = false
} else {
// Decode pass: feed only the most recent token; earlier context is served
// by the KV cache state, so the mask is a single all-zero [1, 1, 1, n] row.
inputIDs = MLShapedArray(scalars: [Int32(tokens.last!)], shape: [1, 1])
causalMask = MLShapedArray(repeating: 0, shape: [1, 1, 1, tokens.count])
}
let input = Llama_3_2_1B_InstructInput(
input_ids: inputIDs,
causal_mask: causalMask
)
// Forward pass; the kvCache state is passed in so the model can reuse
// previously computed key/value entries across calls.
let output = try await model.prediction(input: input, using: kvCache)
// Greedy sampling: pick the argmax over the output logits as the next token.
let predictedTokenID = Int(await MLTensor(output.logitsShapedArray).argmax().shapedArray(of: Int32.self).scalar!)
tokens.append(predictedTokenID)
// Stream each decoded token to stdout as it is generated.
print(tokenizer.decode(tokens: [predictedTokenID]), terminator: "")
}
// The capital of Ireland is Dublin.<|eot_id|>
Conversion
uv run https://hf.co/finnvoorhees/Llama-CoreML/raw/main/convert.py
usage: convert.py [-h] [--model MODEL] [--hf-token HF_TOKEN] [--quantize] [--half]
Convert Llama to CoreML
options:
-h, --help show this help message and exit
--model MODEL Model ID
--hf-token HF_TOKEN Hugging Face API token
--quantize Linear quantize model
--half Load model as float16
- Downloads last month
- 4
Model tree for finnvoorhees/Llama-CoreML
Base model
meta-llama/Llama-3.2-1B-Instruct