YingxuHe committed · Commit 1ee1019 · 1 Parent(s): 6d20c84

update vllm serving guide

README.md CHANGED
@@ -446,7 +446,7 @@ libri_data = load_dataset("distil-whisper/librispeech_long", "clean", split="val
 audio_array = libri_data[0]["audio"]["array"]
 inputs = processor(text=chat_prompt, audios=audio_array)
 
-outputs = model.generate(**inputs, max_new_tokens=128)
+outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.1, top_p=0.9, repetition_penalty=1.1)
 generated_ids = outputs[:, inputs['input_ids'].size(1):]
 response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 ```
@@ -490,7 +490,7 @@ libri_data = load_dataset("distil-whisper/librispeech_long", "clean", split="val
 audio_array = [libri_data[0]["audio"]["array"]]*2
 inputs = processor(text=chat_prompt, audios=audio_array)
 
-outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.1, top_p=0.9, repetition_penalty=1.1)
+outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.1, top_p=0.9, repetition_penalty=1.1)
 generated_ids = outputs[:, inputs['input_ids'].size(1):]
 response = processor.batch_decode(generated_ids, skip_special_tokens=True)
 ```
@@ -527,9 +527,7 @@ def run_meralion(question: str):
 
 llm = LLM(model=model_name,
           tokenizer=model_name,
-          tokenizer_mode="slow",
-          max_model_len=4096,
-          max_num_seqs=5,
+          max_num_seqs=8,
           limit_mm_per_prompt={"audio": 1},
           trust_remote_code=True,
           dtype=torch.bfloat16
@@ -550,9 +548,15 @@ llm, prompt, stop_token_ids = run_meralion(question)
 
 # We set temperature to 0.2 so that outputs can be different
 # even when all prompts are identical when running batch inference.
-sampling_params = SamplingParams(temperature=0.2,
-                                 max_tokens=64,
-                                 stop_token_ids=stop_token_ids)
+sampling_params = SamplingParams(
+    temperature=0.1,
+    top_p=0.9,
+    top_k=50,
+    repetition_penalty=1.1,
+    seed=42,
+    max_tokens=1024,
+    stop_token_ids=None
+)
 
 mm_data = {"audio": [audio_asset.audio_and_sample_rate]}
 inputs = {"prompt": prompt, "multi_modal_data": mm_data}
@@ -569,7 +573,6 @@ for o in outputs:
 
 #### OpenAI Compatible Server
 
-
 **server**
 
 Here is an example to start the server via the `vllm serve` command.
@@ -577,7 +580,7 @@ Here is an example to start the server via the `vllm serve` command.
 ```bash
 export HF_TOKEN=your-hf-token
 
-vllm serve MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION --tokenizer MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION --tokenizer-mode slow --max-num-seqs 8 --trust-remote-code --dtype bfloat16
+vllm serve MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION --tokenizer MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION --max-num-seqs 8 --trust-remote-code --dtype bfloat16
 ```
 
 **client**
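Pieced together from the hunks above, the updated offline-inference snippet reads roughly as follows. The surrounding variables (`llm`, `prompt`, `audio_asset`) come from the unchanged parts of the README, and the `llm.generate(...)` call plus print loop are inferred from the `for o in outputs:` context in the hunk header, so treat this as an illustrative sketch rather than the exact file contents:

```python
# Sketch of the updated batch-inference flow after this commit (illustrative).
sampling_params = SamplingParams(
    temperature=0.1,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.1,
    seed=42,
    max_tokens=1024,
    stop_token_ids=None,
)

mm_data = {"audio": [audio_asset.audio_and_sample_rate]}
inputs = {"prompt": prompt, "multi_modal_data": mm_data}

# Assumed call pattern: vLLM's LLM.generate accepts a prompt dict with
# multi_modal_data together with SamplingParams.
outputs = llm.generate(inputs, sampling_params=sampling_params)
for o in outputs:
    print(o.outputs[0].text)
```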
vllm_plugin_meralion/README.md ADDED
@@ -0,0 +1,115 @@
+## MERaLiON vLLM serving
+
+### Set up Environment
+
+MERaLiON-AudioLLM requires vLLM version `0.6.4.post1` and transformers version `4.46.3`.
+
+```bash
+pip install vllm==0.6.4.post1
+pip install transformers==4.46.3
+```
+
+As the [vLLM documentation](https://docs.vllm.ai/en/stable/models/adding_model.html#out-of-tree-model-integration) recommends,
+we provide a way to register our model via [vLLM plugins](https://docs.vllm.ai/en/stable/design/plugin_system.html#plugin-system).
+
+```bash
+pip install .
+```
+
+### Serving
+
+Here is an example to start the server via the `vllm serve` command.
+
+```bash
+export HF_TOKEN=<your-hf-token>
+
+vllm serve MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION --tokenizer MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION --max-num-seqs 8 --trust-remote-code --dtype bfloat16 --port 8000
+```
+
+To call the server, you can use the [official OpenAI client](https://github.com/openai/openai-python):
+
+```python
+import base64
+
+from openai import OpenAI
+
+
+def get_client(api_key="EMPTY", base_url="http://localhost:8000/v1"):
+    client = OpenAI(
+        api_key=api_key,
+        base_url=base_url,
+    )
+
+    models = client.models.list()
+    model_name = models.data[0].id
+    return client, model_name
+
+
+def get_response(text_input, base64_audio_input, **params):
+    response_obj = client.chat.completions.create(
+        messages=[{
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": f"Text instruction: {text_input}"
+                },
+                {
+                    "type": "audio_url",
+                    "audio_url": {
+                        "url": f"data:audio/ogg;base64,{base64_audio_input}"
+                    },
+                },
+            ],
+        }],
+        **params
+    )
+    return response_obj
+
+
+# specify input and params
+possible_text_inputs = [
+    "Please transcribe this speech.",
+    "Please summarise the content of this speech.",
+    "Please follow the instruction in this speech."
+]
+
+audio_bytes = open("/path/to/wav/or/mp3/file", "rb").read()
+audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+
+# use the port number of the vLLM service.
+client, model_name = get_client(base_url="http://localhost:8000/v1")
+
+generation_parameters = dict(
+    model=model_name,
+    max_completion_tokens=1024,
+    temperature=0.1,
+    top_p=0.9,
+    extra_body={
+        "repetition_penalty": 1.1,
+        "top_k": 50,
+        "length_penalty": 1.0
+    },
+    seed=42
+)
+
+response_obj = get_response(possible_text_inputs[0], audio_base64, **generation_parameters)
+print(response_obj.choices[0].message.content)
+```
+
+Alternatively, you can try calling the server with a `curl` command.
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION",
+        "messages": [
+            {"role": "user", "content": [{"type": "text", "text": "Text instruction: <your-command>"}, {"type":"audio_url", "audio_url": {"url": "data:audio/ogg;base64,<audio base64>"}}]}
+        ]
+    }'
+```
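As a quick sanity check that the plugin's entry point is visible after `pip install .`, one option is to load vLLM's general plugins manually and look for the MERaLiON architecture. This is an illustrative snippet, not part of the committed README, and it assumes vLLM's `vllm.plugins.load_general_plugins` helper and `ModelRegistry` API behave as in the documented plugin system:

```python
# Illustrative check (not in the commit): list registered architectures
# after loading plugins the same way the vLLM engine does at start-up.
from vllm import ModelRegistry
from vllm.plugins import load_general_plugins

load_general_plugins()  # resolves 'vllm.general_plugins' entry points
print([arch for arch in ModelRegistry.get_supported_archs() if "MERaLiON" in arch])
```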
vllm_plugin_meralion/set_up.py CHANGED
@@ -1,7 +1,7 @@
 from setuptools import setup
 
 setup(name='vllm_plugin_meralion',
-      version='0.1',
+      version='0.2',
       packages=['vllm_plugin_meralion'],
       entry_points={
          'vllm.general_plugins':
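For readers unfamiliar with the plugin mechanism: the `'vllm.general_plugins'` entry point declared in `set_up.py` points at a callable that registers the out-of-tree model class with vLLM. The commit does not show that callable, so the sketch below is only an assumption of what it roughly looks like, based on vLLM's documented out-of-tree registration API (`ModelRegistry.register_model`); the function and import path are illustrative, not taken from the repository.

```python
# Hypothetical sketch of the plugin's registration hook (names assumed).
# The 'vllm.general_plugins' entry point is expected to resolve to a
# callable along these lines.
from vllm import ModelRegistry


def register():
    from vllm_plugin_meralion.vllm_meralion import MERaLiONForConditionalGeneration

    if "MERaLiONForConditionalGeneration" not in ModelRegistry.get_supported_archs():
        ModelRegistry.register_model("MERaLiONForConditionalGeneration",
                                     MERaLiONForConditionalGeneration)
```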
vllm_plugin_meralion/vllm_plugin_meralion/modeling_text_decoder.py CHANGED
@@ -1316,4 +1316,4 @@ class MERaLiONTextForTokenClassification(MERaLiONTextPreTrainedModel):
             logits=logits,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
-        )
+        )
vllm_plugin_meralion/vllm_plugin_meralion/vllm_meralion.py CHANGED
@@ -35,6 +35,12 @@ _KEYS_TO_MODIFY_MAPPING = {
     "text_decoder.model": "text_decoder",
 }
 
+# === Constants === #
+DEFAULT_SAMPLE_RATE = 16000
+FEATURE_CHUNK_SIZE = DEFAULT_SAMPLE_RATE * 30
+OUTPUT_CHUNK_SIZE = 100
+MAX_NUMBER_CHUNKS = 8
+
 
 # === Audio Inputs === #
 class MERaLiONInputs(TypedDict):
@@ -107,9 +113,9 @@ def dummy_data_for_meralion(ctx: InputContext, seq_len: int,
         (speech_token_index, max_llm_audio_tokens),
         (0, seq_len - max_llm_audio_tokens),
     )
-    dummy_audio = np.full((max_llm_audio_tokens * 2 * 2 * 160, ), 0.)
+    dummy_audio = np.full((max_llm_audio_tokens * 15 * 2 * 160, ), 0.)
     return DummyData(
-        dummy_seqdata, {"audio": [(dummy_audio, 16000)] * num_audios}, {
+        dummy_seqdata, {"audio": [(dummy_audio, DEFAULT_SAMPLE_RATE)] * num_audios}, {
             "audio":
             consecutive_placeholder_ranges(num_items=num_audios,
                                            item_size=max_tokens_per_audio)
@@ -157,11 +163,33 @@ def get_processor(
 cached_get_processor = lru_cache(get_processor)
 
 
+def _get_number_chunks(audios: List[np.ndarray]):
+    audio_lengths = np.array([_.shape[0] for _ in audios])
+    number_chunks = (audio_lengths // FEATURE_CHUNK_SIZE) + 1
+    return np.clip(number_chunks, a_min=None, a_max=MAX_NUMBER_CHUNKS)
+
+
+def _get_feat_extract_output_lengths(audios: List[np.ndarray]):
+    return _get_number_chunks(audios) * OUTPUT_CHUNK_SIZE
+
+
+def _get_chunked_audios(audios: List[np.ndarray]):
+    audio_number_chunks = _get_number_chunks(audios)
+    chunked_resampled_audios = []
+
+    for audio_idx, audio in enumerate(audios):
+        for cid in range(audio_number_chunks[audio_idx]):
+            chunked_resampled_audios.append(
+                audio[cid * FEATURE_CHUNK_SIZE: (cid + 1) * FEATURE_CHUNK_SIZE].copy()
+            )
+    return chunked_resampled_audios
+
+
 def get_max_meralion_audio_tokens(ctx: InputContext) -> int:
     """
     The max number of tokens after speech audio adapter.
     """
-    return 100
+    return MAX_NUMBER_CHUNKS * OUTPUT_CHUNK_SIZE
 
 
 def input_processor_for_meralion(
@@ -184,26 +212,24 @@ def input_processor_for_meralion(
                          target_sr=processor.feature_extractor.sampling_rate)
         for audio, sampling_rate in audios
     ]
-
-    audio_input_lengths = np.array(
-        [min(3000, _.shape[0] // 160 + 1) for _ in resampled_audios])
-
-    audio_output_length = get_max_meralion_audio_tokens(ctx)
+
+    audio_output_lengths = _get_feat_extract_output_lengths(resampled_audios)
     speech_token_index = ctx.model_config.hf_config.speech_token_index
 
     input_ids = inputs['prompt_token_ids']
 
     new_input_ids = []
     audio_num = input_ids.count(speech_token_index)
-    assert len(audio_input_lengths) == audio_num, \
+    assert len(audio_output_lengths) == audio_num, \
         (f'The text input contains {audio_num} audio tokens, '
-         f'but {len(audio_input_lengths)} audios provided')
+         f'but {len(audio_output_lengths)} audios provided')
     start = 0
-    for _ in range(audio_num):
+    for audio_idx in range(audio_num):
         end = input_ids.index(speech_token_index, start)
         new_input_ids.extend(input_ids[start:end])  # text part
 
-        new_input_ids.extend([speech_token_index] * audio_output_length)
+        new_input_ids.extend([speech_token_index] *
+                             audio_output_lengths[audio_idx])
         start = end + 1
     new_input_ids.extend(input_ids[start:])
 
@@ -240,6 +266,9 @@ def input_mapper_for_meralion(
                          target_sr=processor.feature_extractor.sampling_rate)
         for audio, sampling_rate in multi_modal_data
     ]
+
+    resampled_audios = _get_chunked_audios(resampled_audios)
+
     batch_data = audio_feature_extractor(resampled_audios,
                                          sampling_rate=16000,
                                          return_attention_mask=True,
@@ -291,6 +320,7 @@ class MERaLiONForConditionalGeneration(nn.Module, SupportsMultiModal,
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.text_config.vocab_size,
                                                 logit_scale)
+
         self.sampler = get_sampler()
 
         self.make_empty_intermediate_tensors = (
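To make the new chunking behaviour concrete, here is a small standalone restatement of the constants and the chunk-count formula from the diff above (illustrative only), showing how audio length maps to the number of speech placeholder tokens:

```python
# Standalone restatement of the chunking logic added in this commit.
DEFAULT_SAMPLE_RATE = 16000
FEATURE_CHUNK_SIZE = DEFAULT_SAMPLE_RATE * 30   # 480,000 samples = 30 s per chunk
OUTPUT_CHUNK_SIZE = 100                         # speech tokens per chunk
MAX_NUMBER_CHUNKS = 8


def number_chunks(num_samples: int) -> int:
    # Same formula as _get_number_chunks, for a single clip.
    return min(num_samples // FEATURE_CHUNK_SIZE + 1, MAX_NUMBER_CHUNKS)


seventy_seconds = 70 * DEFAULT_SAMPLE_RATE
print(number_chunks(seventy_seconds))                      # 3 chunks
print(number_chunks(seventy_seconds) * OUTPUT_CHUNK_SIZE)  # 300 placeholder tokens
print(MAX_NUMBER_CHUNKS * OUTPUT_CHUNK_SIZE)               # 800 = new max audio tokens
```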