YingxuHe committed on
Commit d5c5e9f · 1 Parent(s): 98192a8

update readme

Files changed (1):
  1. README.md +33 -16
README.md CHANGED
````diff
@@ -413,10 +413,13 @@ Here we provide a code snippet illustrating the process of loading both the proc
 > [!WARNING]
 > **Out of Scope use**: This model is not intended for use in tool calling, math, and coding tasks.
 
-### Inference
+
+### CPU Inference
+
+MERaLiON-AudioLLM also supports batch inference; the examples below batch a transcription query and a translation query.
 
 ```python
-from datasets import load_dataset
+import librosa
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
 
 repo_id = "MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION"
@@ -432,9 +435,12 @@ model = AutoModelForSpeechSeq2Seq.from_pretrained(
 )
 
 prompt = "Given the following audio context: <SpeechHere>\n\nText instruction: {query}"
-query = "Please transcribe this speech."
+transcribe_query = "Please transcribe this speech."
+translate_query = "Can you please translate this speech into written Chinese?"
+
 conversation = [
-    {"role": "user", "content": prompt.format(query=query)}
+    [{"role": "user", "content": prompt.format(query=transcribe_query)}],
+    [{"role": "user", "content": prompt.format(query=translate_query)}],
 ]
 
 chat_prompt = processor.tokenizer.apply_chat_template(
````
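These two hunks switch the example from a LibriSpeech sample to a user-supplied recording and batch two single-turn conversations, one per query. As a quick pre-flight check (a minimal sketch, not part of the commit; the file path is a placeholder), the 30-second / 16 kHz constraint from the comment added below can be verified with librosa before building the batch:

```python
import librosa

# Placeholder path: substitute your own recording.
audio_array, sample_rate = librosa.load("/path/to/your/audio/file", sr=16000)

# The added comment asks for clips within 30 seconds at 16 kHz;
# librosa.get_duration converts sample count and rate into seconds.
duration = librosa.get_duration(y=audio_array, sr=sample_rate)
assert duration <= 30.0, f"clip is {duration:.1f}s; trim it before inference"

# One copy of the clip per conversation in the batch of two.
audio_array = [audio_array] * 2
```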
````diff
@@ -443,24 +449,25 @@ chat_prompt = processor.tokenizer.apply_chat_template(
     add_generation_prompt=True
 )
 
-libri_data = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
-audio_array = libri_data[0]["audio"]["array"]
+# Use an audio clip of at most 30 seconds, sampled at 16 kHz.
+audio_array, sample_rate = librosa.load("/path/to/your/audio/file", sr=16000)
+audio_array = [audio_array]*2
 inputs = processor(text=chat_prompt, audios=audio_array)
 
-outputs = model.generate(**inputs, max_new_tokens=256)
+outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.1, repetition_penalty=1.1, top_p=0.9, no_repeat_ngram_size=6)
 generated_ids = outputs[:, inputs['input_ids'].size(1):]
-response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+response = processor.batch_decode(generated_ids, skip_special_tokens=True)
 ```
 
-### Batch Inference
-
-MERaLiON-AudioLLM also supports batch inference.
+### GPU Inference
 
 ```python
-from datasets import load_dataset
+import torch
+import librosa
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
 
 repo_id = "MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION"
+device = "cuda"
 
 processor = AutoProcessor.from_pretrained(
     repo_id,
````
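With the batched inputs, `processor.batch_decode` now returns one string per conversation, where the old single-example snippet kept only index `[0]`. A short usage sketch, assuming the two-query batch above:

```python
# Responses come back in the same order as the batched conversations:
# index 0 answers transcribe_query, index 1 answers translate_query.
transcription, translation = response
print("Transcription:", transcription)
print("Chinese translation:", translation)
```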
````diff
@@ -470,7 +477,9 @@ model = AutoModelForSpeechSeq2Seq.from_pretrained(
     repo_id,
     use_safetensors=True,
     trust_remote_code=True,
-)
+    attn_implementation="flash_attention_2",
+    torch_dtype=torch.bfloat16
+).to(device)
 
 prompt = "Given the following audio context: <SpeechHere>\n\nText instruction: {query}"
 transcribe_query = "Please transcribe this speech."
````
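The GPU snippet loads the model with `attn_implementation="flash_attention_2"`, which requires the `flash-attn` package and a supported GPU. Where that is unavailable, one might fall back to PyTorch's built-in scaled-dot-product attention; a sketch, assuming the model's remote code accepts the standard `sdpa` option, which this commit does not confirm:

```python
import torch
from transformers import AutoModelForSpeechSeq2Seq

# Fallback sketch: SDPA instead of FlashAttention-2 for environments
# without flash-attn; "eager" is the last-resort option on older GPUs.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION",
    use_safetensors=True,
    trust_remote_code=True,
    attn_implementation="sdpa",
    torch_dtype=torch.bfloat16,
).to("cuda")
```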
````diff
@@ -487,11 +496,19 @@ chat_prompt = processor.tokenizer.apply_chat_template(
     add_generation_prompt=True
 )
 
-libri_data = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
-audio_array = [libri_data[0]["audio"]["array"]]*2
+# Use an audio clip of at most 30 seconds, sampled at 16 kHz.
+audio_array, sample_rate = librosa.load("/path/to/your/audio/file", sr=16000)
+audio_array = [audio_array]*2
 inputs = processor(text=chat_prompt, audios=audio_array)
 
-outputs = model.generate(**inputs, max_new_tokens=256)
+for key, value in inputs.items():
+    if isinstance(value, torch.Tensor):
+        inputs[key] = inputs[key].to(device)
+
+        if value.dtype == torch.float32:
+            inputs[key] = inputs[key].to(torch.bfloat16)
+
+outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.1, repetition_penalty=1.1, top_p=0.9, no_repeat_ngram_size=6)
 generated_ids = outputs[:, inputs['input_ids'].size(1):]
 response = processor.batch_decode(generated_ids, skip_special_tokens=True)
 ```
````
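This final hunk moves every tensor to the GPU, casts float32 tensors (the Whisper-style audio features) to bfloat16 to match the model's dtype, and enables low-temperature sampling in `generate`. For reproducible transcripts, a deterministic variant is possible; a sketch, noting that the sampling arguments above are what the commit actually adds:

```python
# Greedy decoding sketch: drop the sampling knobs for deterministic output.
outputs = model.generate(**inputs, max_new_tokens=256, do_sample=False)
generated_ids = outputs[:, inputs['input_ids'].size(1):]
response = processor.batch_decode(generated_ids, skip_special_tokens=True)
```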
 