Llama-CoreML
This repo contains Llama-3.2-1B-Instruct converted to Core ML.
Optimizations
- Fused SDPA
- KV cache + flexible inputs
- (optional) Block-wise int4 weight quantization
- Skip LM head during prefill
Example Usage
import CoreML
import Tokenizers
// Example: greedy autoregressive generation with the converted Core ML model.
let prompt = "What's the capital of Ireland?"
print("Loading…")
// Load the compiled Core ML model and the matching Hugging Face tokenizer.
let model = try await Llama_3_2_1B_Instruct.load()
let tokenizer = try await AutoTokenizer.from(pretrained: "meta-llama/Llama-3.2-1B-Instruct")
// Model state object passed to each prediction call (holds the KV cache).
let kvCache = model.makeState()
print("Tokenizing…")
// Encode the prompt through the chat template into input token IDs.
var tokens = try tokenizer.applyChatTemplate(messages: [[
"role": "user", "content": prompt
]])
print("Predicting…")
var prefill = true
// Generate one token per iteration until the end-of-sequence token appears.
while tokens.last != tokenizer.eosTokenId {
let inputIDs: MLShapedArray<Int32>
var causalMask: MLShapedArray<Float16>
if prefill {
// Prefill pass: feed the entire prompt at once with a [1, 1, n, n] mask.
inputIDs = MLShapedArray(scalars: tokens.map(Int32.init), shape: [1, tokens.count])
causalMask = MLShapedArray<Float16>(repeating: 0, shape: [1, 1, tokens.count, tokens.count])
// Set the strict upper triangle to -inf so position i cannot attend to j > i.
for i in 0..<tokens.count {
for j in (i + 1)..<tokens.count {
causalMask[0][0][i][scalarAt: j] = -.infinity
}
}
prefill = false
} else {
// Decode pass: feed only the most recent token; earlier context is served
// by the KV cache state, so the mask is a single all-zero [1, 1, 1, n] row.
inputIDs = MLShapedArray(scalars: [Int32(tokens.last!)], shape: [1, 1])
causalMask = MLShapedArray(repeating: 0, shape: [1, 1, 1, tokens.count])
}
let input = Llama_3_2_1B_InstructInput(
input_ids: inputIDs,
causal_mask: causalMask
)
// Forward pass; the kvCache state is passed in so the model can reuse
// previously computed key/value entries across calls.
let output = try await model.prediction(input: input, using: kvCache)
// Greedy sampling: pick the argmax over the output logits as the next token.
let predictedTokenID = Int(await MLTensor(output.logitsShapedArray).argmax().shapedArray(of: Int32.self).scalar!)
tokens.append(predictedTokenID)
// Stream each decoded token to stdout as it is generated.
print(tokenizer.decode(tokens: [predictedTokenID]), terminator: "")
}
// The capital of Ireland is Dublin.<|eot_id|>
Conversion
uv run https://hf.co/finnvoorhees/Llama-CoreML/raw/main/convert.py
usage: convert.py [-h] [--model MODEL] [--hf-token HF_TOKEN] [--quantize] [--half]
Convert Llama to CoreML
options:
-h, --help show this help message and exit
--model MODEL Model ID
--hf-token HF_TOKEN Hugging Face API token
--quantize Linear quantize model
--half Load model as float16
- Downloads last month
- 4
Model tree for finnvoorhees/Llama-CoreML
Base model
meta-llama/Llama-3.2-1B-Instruct