saheedniyi committed
Commit 16e09d6 · verified · 1 Parent(s): e27a276

Update README.md

Files changed (1):
  1. README.md (+16, -28)
README.md CHANGED
@@ -44,14 +44,10 @@ The model can generate audio on its own but its better to use a voice to prompt
 
 ### Prompt YarnGPT2b
 ```python
-# clone the YarnGPT repo to get access to the `audiotokenizer`
 !git clone https://github.com/saheedniyi02/yarngpt.git
 
+pip install outetts uroman
 
-# install some necessary libraries
-!pip install outetts==0.2.3 uroman
-
-#import some important packages
 import os
 import re
 import json
@@ -64,55 +60,47 @@ import torchaudio
 import IPython
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from outetts.wav_tokenizer.decoder import WavTokenizer
-from yarngpt.audiotokenizer import AudioTokenizer
 
 
-# download the wavtokenizer weights and config (to encode and decode the audio)
 !wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
 !wget https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt
 
-# model path and wavtokenizer weight path (the paths are assumed based on Google colab, a different environment might save the weights to a different location).
-hf_path="saheedniyi/YarnGPT"
+
+from yarngpt.audiotokenizer import AudioTokenizerV2
+
+tokenizer_path="saheedniyi/YarnGPT2b"
 wav_tokenizer_config_path="/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
 wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"
 
-# create the AudioTokenizer object
-audio_tokenizer=AudioTokenizer(
-    hf_path,wav_tokenizer_model_path,wav_tokenizer_config_path
-)
 
-#load the model weights
+audio_tokenizer=AudioTokenizerV2(
+    tokenizer_path,wav_tokenizer_model_path,wav_tokenizer_config_path
+)
+
 
-model = AutoModelForCausalLM.from_pretrained(hf_path,torch_dtype="auto").to(audio_tokenizer.device)
+model = AutoModelForCausalLM.from_pretrained(tokenizer_path,torch_dtype="auto").to(audio_tokenizer.device)
 
-# your input text
-text="Uhm, so, what was the inspiration behind your latest project? Like, was there a specific moment where you were like, 'Yeah, this is it!' Or, you know, did it just kind of, uh, come together naturally over time?"
+#change the text
+text="The election was won by businessman and politician, Moshood Abiola, but Babangida annulled the results, citing concerns over national security."
 
-# creating a prompt, when creating a prompt, there is an optional `speaker_name` parameter, the possible speakers are "idera","emma","jude","osagie","tayo","zainab","joke","regina","remi","umar","chinenye" if no speaker is selected a speaker is chosen at random
-prompt=audio_tokenizer.create_prompt(text,"idera")
+# change the language and voice
+prompt=audio_tokenizer.create_prompt(text,lang="english",speaker_name="idera")
 
-# tokenize the prompt
 input_ids=audio_tokenizer.tokenize_prompt(prompt)
 
-# generate output from the model, you can tune the `.generate` parameters as you wish
 output = model.generate(
     input_ids=input_ids,
     temperature=0.1,
     repetition_penalty=1.1,
     max_length=4000,
+    #num_beams=5,# using a beam size helps for the local languages but not english
 )
 
-# convert the output to "audio codes"
 codes=audio_tokenizer.get_codes(output)
-
-# converts the codes to audio
 audio=audio_tokenizer.get_audio(codes)
-
-# play the audio
 IPython.display.Audio(audio,rate=24000)
+torchaudio.save(f"Sample.wav", audio, sample_rate=24000)
 
-# save the audio
-torchaudio.save(f"audio.wav", audio, sample_rate=24000)
 ```
 
 ### Simple Nigerian Accented-NewsReader
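
For reference, here is the post-commit snippet collapsed into one runnable cell. This is not part of the commit; it is a minimal sketch that consolidates the updated README code, assuming a Colab-style notebook where the `wget` downloads land in `/content`. The `speak()` wrapper and the `output.wav` filename are illustrative additions, not part of the yarngpt API.

```python
# Setup, run once (Colab-style notebook; the updated README writes the pip line without "!"):
#   !git clone https://github.com/saheedniyi02/yarngpt.git
#   !pip install outetts uroman
#   !wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
#   !wget https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt

import torchaudio
from transformers import AutoModelForCausalLM
from yarngpt.audiotokenizer import AudioTokenizerV2

# Paths assume the wget downloads above were saved to /content (as on Colab).
tokenizer_path = "saheedniyi/YarnGPT2b"
wav_tokenizer_config_path = "/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"

# Audio tokenizer: builds prompts from text and decodes generated audio codes back to a waveform.
audio_tokenizer = AudioTokenizerV2(
    tokenizer_path, wav_tokenizer_model_path, wav_tokenizer_config_path
)

# Load the YarnGPT2b weights on the same device the audio tokenizer uses.
model = AutoModelForCausalLM.from_pretrained(
    tokenizer_path, torch_dtype="auto"
).to(audio_tokenizer.device)


def speak(text, lang="english", speaker_name="idera", out_path="output.wav"):
    """Generate speech for `text` and save it as a 24 kHz wav file (illustrative helper)."""
    prompt = audio_tokenizer.create_prompt(text, lang=lang, speaker_name=speaker_name)
    input_ids = audio_tokenizer.tokenize_prompt(prompt)
    output = model.generate(
        input_ids=input_ids,
        temperature=0.1,
        repetition_penalty=1.1,
        max_length=4000,
        # num_beams=5,  # per the README comment, beams help for the local languages but not English
    )
    codes = audio_tokenizer.get_codes(output)   # model tokens -> audio codes
    audio = audio_tokenizer.get_audio(codes)    # audio codes -> waveform tensor
    torchaudio.save(out_path, audio, sample_rate=24000)
    return audio


audio = speak(
    "The election was won by businessman and politician, Moshood Abiola, "
    "but Babangida annulled the results, citing concerns over national security."
)
# In a notebook: IPython.display.Audio(audio, rate=24000) to play it back.
```

Generation settings mirror the README (`temperature=0.1`, `repetition_penalty=1.1`, `max_length=4000`).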
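
The comment removed by this commit listed the available voices ("idera", "emma", "jude", "osagie", "tayo", "zainab", "joke", "regina", "remi", "umar", "chinenye"). Assuming those names still apply to YarnGPT2b, the helper above can be reused to compare a few of them:

```python
# Voices taken from the comment this commit removes; assumed to remain valid for YarnGPT2b.
voices = ["idera", "emma", "jude", "osagie", "tayo", "zainab",
          "joke", "regina", "remi", "umar", "chinenye"]

sample_text = "Uhm, so, what was the inspiration behind your latest project?"

# Render the same sentence with the first three voices, saving one wav per speaker.
for voice in voices[:3]:
    speak(sample_text, lang="english", speaker_name=voice, out_path=f"{voice}.wav")
```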