Update README.md
Browse files
README.md
CHANGED
@@ -44,14 +44,10 @@ The model can generate audio on its own but it's better to use a voice to prompt
|
|
44 |
|
45 |
### Prompt YarnGPT2b
|
46 |
```python
|
47 |
-
# clone the YarnGPT repo to get access to the `audiotokenizer`
|
48 |
!git clone https://github.com/saheedniyi02/yarngpt.git
|
49 |
|
|
|
50 |
|
51 |
-
# install some necessary libraries
|
52 |
-
!pip install outetts==0.2.3 uroman
|
53 |
-
|
54 |
-
#import some important packages
|
55 |
import os
|
56 |
import re
|
57 |
import json
|
@@ -64,55 +60,47 @@ import torchaudio
|
|
64 |
import IPython
|
65 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
66 |
from outetts.wav_tokenizer.decoder import WavTokenizer
|
67 |
-
from yarngpt.audiotokenizer import AudioTokenizer
|
68 |
|
69 |
|
70 |
-
# download the wavtokenizer weights and config (to encode and decode the audio)
|
71 |
!wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
|
72 |
!wget https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt
|
73 |
|
74 |
-
|
75 |
-
|
|
|
|
|
76 |
wav_tokenizer_config_path="/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
|
77 |
wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"
|
78 |
|
79 |
-
# create the AudioTokenizer object
|
80 |
-
audio_tokenizer=AudioTokenizer(
|
81 |
-
hf_path,wav_tokenizer_model_path,wav_tokenizer_config_path
|
82 |
-
)
|
83 |
|
84 |
-
|
|
|
|
|
|
|
85 |
|
86 |
-
model = AutoModelForCausalLM.from_pretrained(
|
87 |
|
88 |
-
#
|
89 |
-
text="
|
90 |
|
91 |
-
#
|
92 |
-
prompt=audio_tokenizer.create_prompt(text,"idera")
|
93 |
|
94 |
-
# tokenize the prompt
|
95 |
input_ids=audio_tokenizer.tokenize_prompt(prompt)
|
96 |
|
97 |
-
# generate output from the model, you can tune the `.generate` parameters as you wish
|
98 |
output = model.generate(
|
99 |
input_ids=input_ids,
|
100 |
temperature=0.1,
|
101 |
repetition_penalty=1.1,
|
102 |
max_length=4000,
|
|
|
103 |
)
|
104 |
|
105 |
-
# convert the output to "audio codes"
|
106 |
codes=audio_tokenizer.get_codes(output)
|
107 |
-
|
108 |
-
# converts the codes to audio
|
109 |
audio=audio_tokenizer.get_audio(codes)
|
110 |
-
|
111 |
-
# play the audio
|
112 |
IPython.display.Audio(audio,rate=24000)
|
|
|
113 |
|
114 |
-
# save the audio
|
115 |
-
torchaudio.save(f"audio.wav", audio, sample_rate=24000)
|
116 |
```
|
117 |
|
118 |
### Simple Nigerian Accented-NewsReader
|
|
|
44 |
|
45 |
### Prompt YarnGPT2b
|
46 |
```python
|
|
|
47 |
!git clone https://github.com/saheedniyi02/yarngpt.git
|
48 |
|
49 |
+
pip install outetts uroman
|
50 |
|
|
|
|
|
|
|
|
|
51 |
import os
|
52 |
import re
|
53 |
import json
|
|
|
60 |
import IPython
|
61 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
62 |
from outetts.wav_tokenizer.decoder import WavTokenizer
|
|
|
63 |
|
64 |
|
|
|
65 |
!wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
|
66 |
!wget https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt
|
67 |
|
68 |
+
|
69 |
+
from yarngpt.audiotokenizer import AudioTokenizerV2
|
70 |
+
|
71 |
+
tokenizer_path="saheedniyi/YarnGPT2b"
|
72 |
wav_tokenizer_config_path="/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
|
73 |
wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"
|
74 |
|
|
|
|
|
|
|
|
|
75 |
|
76 |
+
audio_tokenizer=AudioTokenizerV2(
|
77 |
+
tokenizer_path,wav_tokenizer_model_path,wav_tokenizer_config_path
|
78 |
+
)
|
79 |
+
|
80 |
|
81 |
+
model = AutoModelForCausalLM.from_pretrained(tokenizer_path,torch_dtype="auto").to(audio_tokenizer.device)
|
82 |
|
83 |
+
# change the text
|
84 |
+
text="The election was won by businessman and politician, Moshood Abiola, but Babangida annulled the results, citing concerns over national security."
|
85 |
|
86 |
+
# change the language and voice
|
87 |
+
prompt=audio_tokenizer.create_prompt(text,lang="english",speaker_name="idera")
|
88 |
|
|
|
89 |
input_ids=audio_tokenizer.tokenize_prompt(prompt)
|
90 |
|
|
|
91 |
output = model.generate(
|
92 |
input_ids=input_ids,
|
93 |
temperature=0.1,
|
94 |
repetition_penalty=1.1,
|
95 |
max_length=4000,
|
96 |
+
#num_beams=5,# using beam search helps for the local languages but not English
|
97 |
)
|
98 |
|
|
|
99 |
codes=audio_tokenizer.get_codes(output)
|
|
|
|
|
100 |
audio=audio_tokenizer.get_audio(codes)
|
|
|
|
|
101 |
IPython.display.Audio(audio,rate=24000)
|
102 |
+
torchaudio.save(f"Sample.wav", audio, sample_rate=24000)
|
103 |
|
|
|
|
|
104 |
```
|
105 |
|
106 |
### Simple Nigerian Accented-NewsReader
|