update readme
README.md (CHANGED)
This commit rewrites the quick-start portion of the usage section. The previous revision had a single inference example (one user query, a plain `model.generate(**inputs, max_new_tokens=256)` call, and no explicit audio loading or device handling); it is replaced by separate CPU and GPU inference examples. The updated section reads:

> [!WARNING]
> **Out of Scope use**: This model is not intended for use in tool calling, math, and coding tasks.

### CPU Inference

MERaLiON-AudioLLM also supports batch inference; the example below batches a transcription query and a translation query over the same audio clip.

```python
import librosa
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

repo_id = "MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION"

processor = AutoProcessor.from_pretrained(
    repo_id,
    trust_remote_code=True,
)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    repo_id,
    use_safetensors=True,
    trust_remote_code=True,
)

prompt = "Given the following audio context: <SpeechHere>\n\nText instruction: {query}"
transcribe_query = "Please transcribe this speech."
translate_query = "Can you please translate this speech into written Chinese?"

conversation = [
    [{"role": "user", "content": prompt.format(query=transcribe_query)}],
    [{"role": "user", "content": prompt.format(query=translate_query)}],
]

chat_prompt = processor.tokenizer.apply_chat_template(
    conversation=conversation,
    tokenize=False,
    add_generation_prompt=True
)

# Use an audio clip of at most 30 seconds, sampled at 16,000 Hz.
audio_array, sample_rate = librosa.load("/path/to/your/audio/file", sr=16000)
# Duplicate the clip so there is one audio input per query in the batch.
audio_array = [audio_array]*2
inputs = processor(text=chat_prompt, audios=audio_array)

outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.1, repetition_penalty=1.1, top_p=0.9, no_repeat_ngram_size=6)
generated_ids = outputs[:, inputs['input_ids'].size(1):]
response = processor.batch_decode(generated_ids, skip_special_tokens=True)
```
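
`response` contains one decoded string per conversation, in the same order as the entries of `conversation`. As a small usage sketch (reusing the variables from the snippet above), the batch output can be inspected like this:

```python
# Pair each text instruction with its decoded answer; order matches `conversation`.
for query, answer in zip([transcribe_query, translate_query], response):
    print(f"Instruction: {query}\nResponse: {answer}\n")
```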

### GPU Inference
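
The GPU example below follows the same flow as the CPU example, but loads the model in `torch.bfloat16` with FlashAttention 2 and moves both the model and the processed inputs onto the CUDA device before generation.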

```python
import torch
import librosa
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

repo_id = "MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION"
device = "cuda"

processor = AutoProcessor.from_pretrained(
    repo_id,
    trust_remote_code=True,
)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    repo_id,
    use_safetensors=True,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16
).to(device)

prompt = "Given the following audio context: <SpeechHere>\n\nText instruction: {query}"
transcribe_query = "Please transcribe this speech."
translate_query = "Can you please translate this speech into written Chinese?"

conversation = [
    [{"role": "user", "content": prompt.format(query=transcribe_query)}],
    [{"role": "user", "content": prompt.format(query=translate_query)}],
]

chat_prompt = processor.tokenizer.apply_chat_template(
    conversation=conversation,
    tokenize=False,
    add_generation_prompt=True
)

# Use an audio clip of at most 30 seconds, sampled at 16,000 Hz.
audio_array, sample_rate = librosa.load("/path/to/your/audio/file", sr=16000)
# Duplicate the clip so there is one audio input per query in the batch.
audio_array = [audio_array]*2
inputs = processor(text=chat_prompt, audios=audio_array)

# Move input tensors to the GPU and cast float32 features to bfloat16 to match the model.
for key, value in inputs.items():
    if isinstance(value, torch.Tensor):
        inputs[key] = inputs[key].to(device)

        if value.dtype == torch.float32:
            inputs[key] = inputs[key].to(torch.bfloat16)

outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.1, repetition_penalty=1.1, top_p=0.9, no_repeat_ngram_size=6)
generated_ids = outputs[:, inputs['input_ids'].size(1):]
response = processor.batch_decode(generated_ids, skip_special_tokens=True)
```
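
Both snippets assume the input clip is already no longer than 30 seconds and sampled at 16 kHz. As a minimal preparation sketch (the trimming step is an assumption about how to satisfy the 30-second limit, not an official recommendation), a longer or differently sampled file could be handled like this:

```python
import librosa

TARGET_SR = 16000   # sampling rate expected by the examples above
MAX_SECONDS = 30    # stay within the stated 30-second limit

# librosa resamples to the requested rate while loading.
audio_array, _ = librosa.load("/path/to/your/audio/file", sr=TARGET_SR)

# Keep only the first 30 seconds of audio.
audio_array = audio_array[:TARGET_SR * MAX_SECONDS]
```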