saheedniyi
/

YarnGPT2b

@@ -44,14 +44,10 @@ The model can generate audio on its own but it's better to use a voice to prompt
 ### Prompt YarnGPT2b
 ```python
-# clone the YarnGPT repo to get access to the `audiotokenizer`
 !git clone https://github.com/saheedniyi02/yarngpt.git
-# install some necessary libraries
-!pip install outetts==0.2.3 uroman
-#import some important packages
 import os
 import re
 import json
@@ -64,55 +60,47 @@ import torchaudio
 import IPython
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from outetts.wav_tokenizer.decoder import WavTokenizer
-from yarngpt.audiotokenizer import AudioTokenizer
-# download the wavtokenizer weights and config (to encode and decode the audio)
 !wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
 !wget https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt
-# model path and wavtokenizer weight path (the paths are assumed based on Google colab, a different environment might save the weights to a different location).
-hf_path="saheedniyi/YarnGPT"
 wav_tokenizer_config_path="/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
 wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"
-# create the AudioTokenizer object
-audio_tokenizer=AudioTokenizer(
-    hf_path,wav_tokenizer_model_path,wav_tokenizer_config_path
-)
-#load the model weights
-model = AutoModelForCausalLM.from_pretrained(hf_path,torch_dtype="auto").to(audio_tokenizer.device)
-# your input text
-text="Uhm, so, what was the inspiration behind your latest project? Like, was there a specific moment where you were like, 'Yeah, this is it!' Or, you know, did it just kind of, uh, come together naturally over time?"
-# creating a prompt, when creating a prompt, there is an optional `speaker_name` parameter, the possible speakers are "idera","emma","jude","osagie","tayo","zainab","joke","regina","remi","umar","chinenye" if no speaker is selected a speaker is chosen at random
-prompt=audio_tokenizer.create_prompt(text,"idera")
-# tokenize the prompt
 input_ids=audio_tokenizer.tokenize_prompt(prompt)
-# generate output from the model, you can tune the `.generate` parameters as you wish
 output  = model.generate(
             input_ids=input_ids,
             temperature=0.1,
             repetition_penalty=1.1,
             max_length=4000,
         )
-# convert the output to "audio codes"
 codes=audio_tokenizer.get_codes(output)
-# converts the codes to audio
 audio=audio_tokenizer.get_audio(codes)
-# play the audio
 IPython.display.Audio(audio,rate=24000)
-# save the audio
-torchaudio.save(f"audio.wav", audio, sample_rate=24000)
 ```
 ### Simple Nigerian Accented-NewsReader

 ### Prompt YarnGPT2b
 ```python
 !git clone https://github.com/saheedniyi02/yarngpt.git
+pip install outetts uroman
 import os
 import re
 import json
 import IPython
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from outetts.wav_tokenizer.decoder import WavTokenizer
 !wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
 !wget https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt
+from yarngpt.audiotokenizer import AudioTokenizerV2
+tokenizer_path="saheedniyi/YarnGPT2b"
 wav_tokenizer_config_path="/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
 wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"
+audio_tokenizer=AudioTokenizerV2(
+    tokenizer_path,wav_tokenizer_model_path,wav_tokenizer_config_path
+    )
+model = AutoModelForCausalLM.from_pretrained(tokenizer_path,torch_dtype="auto").to(audio_tokenizer.device)
+# the input text to convert to speech; change it as needed
+text="The election was won by businessman and politician, Moshood Abiola, but Babangida annulled the results, citing concerns over national security."
+# choose the language (`lang`) and the voice (`speaker_name`) used for the prompt
+prompt=audio_tokenizer.create_prompt(text,lang="english",speaker_name="idera")
 input_ids=audio_tokenizer.tokenize_prompt(prompt)
 output  = model.generate(
             input_ids=input_ids,
             temperature=0.1,
             repetition_penalty=1.1,
             max_length=4000,
+            #num_beams=5,  # beam search helps for the local languages but not for English
         )
 codes=audio_tokenizer.get_codes(output)
 audio=audio_tokenizer.get_audio(codes)
 IPython.display.Audio(audio,rate=24000)
+torchaudio.save(f"Sample.wav", audio, sample_rate=24000)
 ```
 ### Simple Nigerian Accented-NewsReader