[Cache Request] aws-neuron/Mistral-7B-Instruct-v0.2-seqlen-2048-bs-1-cores-2
#112 opened by gubaruch
Please add the following model to the neuron cache.

There is already a cached version of the same model, but with a sequence length of 4096. You can deploy it on SageMaker using the following code snippet:
import json
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

# Resolve the SageMaker execution role: use the notebook's role when running
# inside SageMaker, otherwise fall back to a named IAM role.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role = iam.get_role(RoleName="sagemaker_execution_role")["Role"]["Arn"]
# Hub model configuration. https://huggingface.co/models
hub = {
    "HF_MODEL_ID": "mistralai/Mistral-7B-Instruct-v0.2",
    "HF_NUM_CORES": "2",         # shard the model across 2 Neuron cores
    "HF_AUTO_CAST_TYPE": "fp16",
    "MAX_BATCH_SIZE": "1",
    "MAX_INPUT_LENGTH": "3686",  # max prompt tokens, leaving headroom for generation
    "MAX_TOTAL_TOKENS": "4096",  # cached sequence length (prompt + generated tokens)
    "HF_TOKEN": "<REPLACE WITH YOUR TOKEN>",
}

assert hub["HF_TOKEN"] != "<REPLACE WITH YOUR TOKEN>", (
    "Please replace '<REPLACE WITH YOUR TOKEN>' with your Hugging Face Hub API token"
)
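
# For the requested seqlen-2048-bs-1-cores-2 variant, only the sequence-related
# settings above should change once it is cached. The exact input/total split is
# an assumption, mirroring the ~90% ratio used by the 4096 configuration:
#   hub["MAX_INPUT_LENGTH"] = "1843"  # assumption, not a confirmed cache entry
#   hub["MAX_TOTAL_TOKENS"] = "2048"  # the sequence length this request asks for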
# Create the Hugging Face model; get_huggingface_llm_image_uri resolves the
# TGI Neuron (huggingface-neuronx) container image for the current region.
huggingface_model = HuggingFaceModel(
    image_uri=get_huggingface_llm_image_uri("huggingface-neuronx", version="0.0.23"),
    env=hub,
    role=role,
)
# Deploy the model to a SageMaker real-time inference endpoint.
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.inf2.xlarge",
    container_startup_health_check_timeout=1800,
    volume_size=512,
)
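
# ml.inf2.xlarge exposes 2 Neuron cores, matching HF_NUM_CORES above; the
# 1800 s health-check timeout leaves time for the model download and load,
# and volume_size provisions 512 GB of EBS storage for the artifacts.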
# Send a test request to the endpoint.
response = predictor.predict(
    {
        "inputs": "What is the capital of France?",
        "parameters": {
            "do_sample": True,
            "max_new_tokens": 128,
            "temperature": 0.7,
            "top_k": 50,
            "top_p": 0.95,
        },
    }
)
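
The container is TGI-based, so the decoded response should be the usual list of generations. A minimal sketch of reading it and tearing the endpoint down afterwards (the response shape is an assumption based on standard TGI output):

# TGI typically returns [{"generated_text": ...}]; this assumes that shape.
print(response[0]["generated_text"])

# Delete the model and endpoint when finished to stop incurring inf2 charges.
predictor.delete_model()
predictor.delete_endpoint()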