Inference with vLLM
#3 by xxrjun
Error description
Running vllm serve initially fails with:
vllm serve RuntimeError: Error in model execution (input dumped to /tmp/err_execute_model_input_20250215-015825.pkl): The size of tensor a (5120) must match the size of tensor b (4) at non-singleton dimension 0
Using the following code instead raises a KeyError: '<IMG_CONTEXT>':
from vllm import LLM
model = LLM(model_name, trust_remote_code=True)
My workaround
Build vLLM from source; recent versions already support InternVLChatModel.
git clone https://github.com/vllm-project/vllm.git
cd vllm
VLLM_USE_PRECOMPILED=1 pip install --editable .
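A quick sanity check that Python is actually picking up the editable build (assuming a standard environment; the reported version string will vary):
python -c "import vllm; print(vllm.__version__)"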
Manually add the special token to the tokenizer
from transformers import AutoTokenizer
model_path = "MediaTek-Research/Llama-Breeze2-8B-Instruct-v0_1"
# save the tokenizer to disk with <IMG_CONTEXT> registered as a special token
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.add_special_tokens({"additional_special_tokens": ["<IMG_CONTEXT>"]})
tokenizer.save_pretrained("tokenizer")
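To sanity-check the patched tokenizer, reloading it and looking up the token should no longer fail; a minimal sketch (the printed id is simply whatever the tokenizer assigns, not a value from the model card):
from transformers import AutoTokenizer
patched = AutoTokenizer.from_pretrained("tokenizer")
# if <IMG_CONTEXT> already existed in the base vocabulary, add_special_tokens keeps its original id
print(patched.convert_tokens_to_ids("<IMG_CONTEXT>"))
print("<IMG_CONTEXT>" in patched.additional_special_tokens)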
Both errors then go away.
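The same patched tokenizer can presumably be pointed at from the original vllm serve path as well; a sketch using the standard vLLM engine flags (not verified against this model):
vllm serve MediaTek-Research/Llama-Breeze2-8B-Instruct-v0_1 \
    --tokenizer ./tokenizer \
    --trust-remote-code \
    --dtype float16 \
    --tensor-parallel-size 8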
Testing Instruction Following
from vllm import LLM
from mtkresearch.llm.prompt import MRPromptV3
from transformers import AutoTokenizer
model_path = "MediaTek-Research/Llama-Breeze2-8B-Instruct-v0_1"
tokenizer_path = "tokenizer"
llm = LLM(model_path, trust_remote_code=True, dtype="float16", tokenizer=tokenizer_path, tensor_parallel_size=8)
outputs = llm.generate("請問什麼是深度學習?")
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
Output succeeds:
Processed prompts: 100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 3.70it/s, est. speed input: 40.77 toks/s, output: 59.29 toks/s]
Prompt: '請問什麼是深度學習?', Generated text: '深度學習是什麼?\n深度學習是人工智'
Full code
Note: the Visual Instruction Following output looks a bit odd; my usage may be wrong.
import PIL
from vllm import LLM, SamplingParams
from mtkresearch.llm.prompt import MRPromptV3
# from transformers import GenerationConfig, AutoTokenizer
model_name = "MediaTek-Research/Llama-Breeze2-8B-Instruct-v0_1"
image_path = "./image.png" # replace with your image path
# =============================================================================
# save tokenizer into disk
# =============================================================================
# tokenizer = AutoTokenizer.from_pretrained("MediaTek-Research/Llama-Breeze2-8B-Instruct-v0_1")
# tokenizer.add_special_tokens({"additional_special_tokens": ["<IMG_CONTEXT>"]})
# tokenizer.save_pretrained("tokenizer")
# =============================================================================
# Load model and tokenizer
# =============================================================================
tokenzier_path = "tokenizer" # load updated tokenizer
llm = LLM(
model_name,
trust_remote_code=True,
dtype="float16",
tokenizer=tokenzier_path,
tensor_parallel_size=8,
limit_mm_per_prompt={"image": 1},
)
prompt_engine = MRPromptV3()
sys_prompt = "You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan."
generation_config = SamplingParams(
    max_tokens=2048,
    temperature=0.01,
    top_p=0.01,
    repetition_penalty=1.1,
    stop_token_ids=[128001],
)
def _inference(tokenizer, model, generation_config, prompt, pixel_values=None):
    if pixel_values is None:
        # ref: https://huggingface.co/MediaTek-Research/Llama-Breeze2-8B-Instruct#installation
        outputs = model.generate(prompt, sampling_params=generation_config)
    else:
        print("image_path", image_path)
        print("prompt", prompt)
        # ref: https://docs.vllm.ai/en/latest/serving/multimodal_inputs.html#image
        # NOTE: the docs pass images via "multi_modal_data" and give sampling_params as a
        # separate argument; the keys below are what I actually ran, which may explain the
        # odd visual output (see the sketch after the full code).
        outputs = model.generate(
            {
                "prompt": prompt,
                "pixel_values": pixel_values,
                "sampling_params": generation_config,
            }
        )
    print("outputs", outputs)
    for output in outputs:
        print(output.outputs[0].text)
# =============================================================================
# Instruction Following
# =============================================================================
conversations = [
    {"role": "system", "content": sys_prompt},
    {"role": "user", "content": "請問什麼是深度學習?"},
]
prompt = prompt_engine.get_prompt(conversations)
_inference(llm.get_tokenizer(), llm, generation_config, prompt)
# {'role': 'assistant', 'content': '深度學習是一種人工智慧技術,主要是透過模仿生物神經網路的結構和功能來實現。它利用大量數據進行訓練,以建立複雜的模型並使其能夠自主學習、預測或分類輸入資料。\n\n在深度學習中,通常使用多層的神經網路,每一層都包含許多相互連接的節點(稱為神經元)。這些神經元可以處理不同特徵的輸入資料,並將結果傳遞給下一層的神經元。隨著資料流向更高層次,這個過程逐漸捕捉到更抽象的概念或模式。\n\n深度學習已被廣泛應用於各種領域,如圖像識別、自然語言處理、語音識別以及遊戲等。它提供了比傳統機器學習方法更好的表現,因為它能夠從複雜且非線性的數據中提取出有用的資訊。'}
# =============================================================================
# Visual Instruction Following
# The output seems a little bit weird...
# =============================================================================
conversations = [
    {"role": "system", "content": sys_prompt},
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image_path": image_path,
            },
            {"type": "text", "text": "請問前三名總共可獲得多少錢?"},
        ],
    },
]
prompt, pixel_values = prompt_engine.get_prompt(conversations)
_inference(llm.get_tokenizer(), llm, generation_config, prompt, pixel_values)
# {'role': 'assistant', 'content': '第一名可獲得30萬元,第二名可獲得20萬元,第三名可獲得15萬元。前三名總共可獲得65萬元。'}
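Regarding the odd visual output: per the vLLM multimodal docs linked above, images go in a "multi_modal_data" entry and sampling_params is a separate argument to generate, rather than keys inside the prompt dict. A minimal sketch of that shape, assuming prompt already contains the image placeholder produced by MRPromptV3 and that the model accepts a PIL image here (not verified for Breeze2):
from PIL import Image
image = Image.open(image_path).convert("RGB")
outputs = llm.generate(
    {
        "prompt": prompt,                      # prompt string from prompt_engine.get_prompt(...)
        "multi_modal_data": {"image": image},  # documented key for image inputs
    },
    sampling_params=generation_config,         # passed as an argument, not inside the prompt dict
)
for output in outputs:
    print(output.outputs[0].text)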