YingxuHe committed on
Commit d5c5e9f · 1 Parent(s): 98192a8

update readme

Files changed (1):
  1. README.md +33 -16
README.md CHANGED
````diff
@@ -413,10 +413,13 @@ Here we provide a code snippet illustrating the process of loading both the proc
 > [!WARNING]
 > **Out of Scope use**: This model is not intended for use in tool calling, math, and coding tasks.
 
-### Inference
+
+### CPU Inference
+
+MERaLiON-AudioLLM also supports batch inference; the examples below batch a transcription query and a translation query.
 
 ```python
-from datasets import load_dataset
+import librosa
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
 
 repo_id = "MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION"
@@ -432,9 +435,12 @@ model = AutoModelForSpeechSeq2Seq.from_pretrained(
 )
 
 prompt = "Given the following audio context: <SpeechHere>\n\nText instruction: {query}"
-query = "Please transcribe this speech."
+transcribe_query = "Please transcribe this speech."
+translate_query = "Can you please translate this speech into written Chinese?"
+
 conversation = [
-    {"role": "user", "content": prompt.format(query=query)}
+    [{"role": "user", "content": prompt.format(query=transcribe_query)}],
+    [{"role": "user", "content": prompt.format(query=translate_query)}],
 ]
 
 chat_prompt = processor.tokenizer.apply_chat_template(
````
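These two hunks switch the example from a LibriSpeech sample to a user-supplied recording and batch two single-turn conversations, one per query. As a quick pre-flight check (a minimal sketch, not part of the commit; the file path is a placeholder), the 30-second / 16 kHz constraint from the comment added below can be verified with librosa before building the batch:

```python
import librosa

# Placeholder path: substitute your own recording.
audio_array, sample_rate = librosa.load("/path/to/your/audio/file", sr=16000)

# The added comment asks for clips within 30 seconds at 16 kHz;
# librosa.get_duration converts sample count and rate into seconds.
duration = librosa.get_duration(y=audio_array, sr=sample_rate)
assert duration <= 30.0, f"clip is {duration:.1f}s; trim it before inference"

# One copy of the clip per conversation in the batch of two.
audio_array = [audio_array] * 2
```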
````diff
@@ -443,24 +449,25 @@ chat_prompt = processor.tokenizer.apply_chat_template(
     add_generation_prompt=True
 )
 
-libri_data = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
-audio_array = libri_data[0]["audio"]["array"]
+# Use an audio clip of at most 30 seconds, sampled at 16 kHz.
+audio_array, sample_rate = librosa.load("/path/to/your/audio/file", sr=16000)
+audio_array = [audio_array]*2
 inputs = processor(text=chat_prompt, audios=audio_array)
 
-outputs = model.generate(**inputs, max_new_tokens=256)
+outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.1, repetition_penalty=1.1, top_p=0.9, no_repeat_ngram_size=6)
 generated_ids = outputs[:, inputs['input_ids'].size(1):]
-response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+response = processor.batch_decode(generated_ids, skip_special_tokens=True)
 ```
 
-### Batch Inference
-
-MERaLiON-AudioLLM also supports batch inference.
+### GPU Inference
 
 ```python
-from datasets import load_dataset
+import torch
+import librosa
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
 
 repo_id = "MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION"
+device = "cuda"
 
 processor = AutoProcessor.from_pretrained(
     repo_id,
````
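With the batched inputs, `processor.batch_decode` now returns one string per conversation, where the old single-example snippet kept only index `[0]`. A short usage sketch, assuming the two-query batch above:

```python
# Responses come back in the same order as the batched conversations:
# index 0 answers transcribe_query, index 1 answers translate_query.
transcription, translation = response
print("Transcription:", transcription)
print("Chinese translation:", translation)
```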
````diff
@@ -470,7 +477,9 @@ model = AutoModelForSpeechSeq2Seq.from_pretrained(
     repo_id,
     use_safetensors=True,
     trust_remote_code=True,
-)
+    attn_implementation="flash_attention_2",
+    torch_dtype=torch.bfloat16
+).to(device)
 
 prompt = "Given the following audio context: <SpeechHere>\n\nText instruction: {query}"
 transcribe_query = "Please transcribe this speech."
````
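The GPU snippet loads the model with `attn_implementation="flash_attention_2"`, which requires the `flash-attn` package and a supported GPU. Where that is unavailable, one might fall back to PyTorch's built-in scaled-dot-product attention; a sketch, assuming the model's remote code accepts the standard `sdpa` option, which this commit does not confirm:

```python
import torch
from transformers import AutoModelForSpeechSeq2Seq

# Fallback sketch: SDPA instead of FlashAttention-2 for environments
# without flash-attn; "eager" is the last-resort option on older GPUs.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION",
    use_safetensors=True,
    trust_remote_code=True,
    attn_implementation="sdpa",
    torch_dtype=torch.bfloat16,
).to("cuda")
```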
````diff
@@ -487,11 +496,19 @@ chat_prompt = processor.tokenizer.apply_chat_template(
     add_generation_prompt=True
 )
 
-libri_data = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
-audio_array = [libri_data[0]["audio"]["array"]]*2
+# Use an audio clip of at most 30 seconds, sampled at 16 kHz.
+audio_array, sample_rate = librosa.load("/path/to/your/audio/file", sr=16000)
+audio_array = [audio_array]*2
 inputs = processor(text=chat_prompt, audios=audio_array)
 
-outputs = model.generate(**inputs, max_new_tokens=256)
+for key, value in inputs.items():
+    if isinstance(value, torch.Tensor):
+        inputs[key] = inputs[key].to(device)
+
+        if value.dtype == torch.float32:
+            inputs[key] = inputs[key].to(torch.bfloat16)
+
+outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.1, repetition_penalty=1.1, top_p=0.9, no_repeat_ngram_size=6)
 generated_ids = outputs[:, inputs['input_ids'].size(1):]
 response = processor.batch_decode(generated_ids, skip_special_tokens=True)
 ```
````
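This final hunk moves every tensor to the GPU, casts float32 tensors (the Whisper-style audio features) to bfloat16 to match the model's dtype, and enables low-temperature sampling in `generate`. For reproducible transcripts, a deterministic variant is possible; a sketch, noting that the sampling arguments above are what the commit actually adds:

```python
# Greedy decoding sketch: drop the sampling knobs for deterministic output.
outputs = model.generate(**inputs, max_new_tokens=256, do_sample=False)
generated_ids = outputs[:, inputs['input_ids'].size(1):]
response = processor.batch_decode(generated_ids, skip_special_tokens=True)
```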
 