Spaces:
Running
Running
Commit
·
675a486
1
Parent(s):
d430de8
Switch phonemizer
Browse files
- ljspeechimportable.py +5 -6
- styletts2importable.py +8 -8
ljspeechimportable.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
from cached_path import cached_path
|
| 2 |
-
from dp.phonemizer import Phonemizer
|
| 3 |
|
| 4 |
|
| 5 |
import torch
|
|
@@ -68,10 +67,10 @@ def compute_style(ref_dicts):
|
|
| 68 |
return reference_embeddings
|
| 69 |
|
| 70 |
# load phonemizer
|
| 71 |
-
|
| 72 |
-
|
| 73 |
|
| 74 |
-
phonemizer = Phonemizer.from_checkpoint(str(cached_path('https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/DeepPhonemizer/en_us_cmudict_ipa_forward.pt')))
|
| 75 |
|
| 76 |
|
| 77 |
config = yaml.safe_load(open(str(cached_path('hf://yl4579/StyleTTS2-LJSpeech/Models/LJSpeech/config.yml'))))
|
|
@@ -128,7 +127,7 @@ sampler = DiffusionSampler(
|
|
| 128 |
def inference(text, noise, diffusion_steps=5, embedding_scale=1):
|
| 129 |
text = text.strip()
|
| 130 |
text = text.replace('"', '')
|
| 131 |
-
ps =
|
| 132 |
ps = word_tokenize(ps[0])
|
| 133 |
ps = ' '.join(ps)
|
| 134 |
|
|
@@ -177,7 +176,7 @@ def inference(text, noise, diffusion_steps=5, embedding_scale=1):
|
|
| 177 |
def LFinference(text, s_prev, noise, alpha=0.7, diffusion_steps=5, embedding_scale=1):
|
| 178 |
text = text.strip()
|
| 179 |
text = text.replace('"', '')
|
| 180 |
-
ps =
|
| 181 |
ps = word_tokenize(ps[0])
|
| 182 |
ps = ' '.join(ps)
|
| 183 |
|
|
|
|
| 1 |
from cached_path import cached_path
|
|
|
|
| 2 |
|
| 3 |
|
| 4 |
import torch
|
|
|
|
| 67 |
return reference_embeddings
|
| 68 |
|
| 69 |
# load phonemizer
|
| 70 |
+
import phonemizer
|
| 71 |
+
global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True, words_mismatch='ignore')
|
| 72 |
|
| 73 |
+
# phonemizer = Phonemizer.from_checkpoint(str(cached_path('https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/DeepPhonemizer/en_us_cmudict_ipa_forward.pt')))
|
| 74 |
|
| 75 |
|
| 76 |
config = yaml.safe_load(open(str(cached_path('hf://yl4579/StyleTTS2-LJSpeech/Models/LJSpeech/config.yml'))))
|
|
|
|
| 127 |
def inference(text, noise, diffusion_steps=5, embedding_scale=1):
|
| 128 |
text = text.strip()
|
| 129 |
text = text.replace('"', '')
|
| 130 |
+
ps = global_phonemizer.phonemize([text])
|
| 131 |
ps = word_tokenize(ps[0])
|
| 132 |
ps = ' '.join(ps)
|
| 133 |
|
|
|
|
| 176 |
def LFinference(text, s_prev, noise, alpha=0.7, diffusion_steps=5, embedding_scale=1):
|
| 177 |
text = text.strip()
|
| 178 |
text = text.replace('"', '')
|
| 179 |
+
ps = global_phonemizer.phonemize([text])
|
| 180 |
ps = word_tokenize(ps[0])
|
| 181 |
ps = ' '.join(ps)
|
| 182 |
|
styletts2importable.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
from cached_path import cached_path
|
| 2 |
|
| 3 |
-
from dp.phonemizer import Phonemizer
|
| 4 |
print("NLTK")
|
| 5 |
import nltk
|
| 6 |
nltk.download('punkt')
|
|
@@ -73,9 +73,9 @@ elif torch.backends.mps.is_available():
|
|
| 73 |
print("MPS would be available but cannot be used rn")
|
| 74 |
# device = 'mps'
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
phonemizer = Phonemizer.from_checkpoint(str(cached_path('https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/DeepPhonemizer/en_us_cmudict_ipa_forward.pt')))
|
| 79 |
|
| 80 |
|
| 81 |
# config = yaml.safe_load(open("Models/LibriTTS/config.yml"))
|
|
@@ -133,7 +133,7 @@ sampler = DiffusionSampler(
|
|
| 133 |
|
| 134 |
def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):
|
| 135 |
text = text.strip()
|
| 136 |
-
ps =
|
| 137 |
ps = word_tokenize(ps[0])
|
| 138 |
ps = ' '.join(ps)
|
| 139 |
tokens = textclenaer(ps)
|
|
@@ -202,7 +202,7 @@ def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding
|
|
| 202 |
|
| 203 |
def LFinference(text, s_prev, ref_s, alpha = 0.3, beta = 0.7, t = 0.7, diffusion_steps=5, embedding_scale=1):
|
| 204 |
text = text.strip()
|
| 205 |
-
ps =
|
| 206 |
ps = word_tokenize(ps[0])
|
| 207 |
ps = ' '.join(ps)
|
| 208 |
ps = ps.replace('``', '"')
|
|
@@ -279,7 +279,7 @@ def LFinference(text, s_prev, ref_s, alpha = 0.3, beta = 0.7, t = 0.7, diffusion
|
|
| 279 |
|
| 280 |
def STinference(text, ref_s, ref_text, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):
|
| 281 |
text = text.strip()
|
| 282 |
-
ps =
|
| 283 |
ps = word_tokenize(ps[0])
|
| 284 |
ps = ' '.join(ps)
|
| 285 |
|
|
@@ -288,7 +288,7 @@ def STinference(text, ref_s, ref_text, alpha = 0.3, beta = 0.7, diffusion_steps=
|
|
| 288 |
tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
|
| 289 |
|
| 290 |
ref_text = ref_text.strip()
|
| 291 |
-
ps =
|
| 292 |
ps = word_tokenize(ps[0])
|
| 293 |
ps = ' '.join(ps)
|
| 294 |
|
|
|
|
| 1 |
from cached_path import cached_path
|
| 2 |
|
| 3 |
+
# from dp.phonemizer import Phonemizer
|
| 4 |
print("NLTK")
|
| 5 |
import nltk
|
| 6 |
nltk.download('punkt')
|
|
|
|
| 73 |
print("MPS would be available but cannot be used rn")
|
| 74 |
# device = 'mps'
|
| 75 |
|
| 76 |
+
import phonemizer
|
| 77 |
+
global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
|
| 78 |
+
# phonemizer = Phonemizer.from_checkpoint(str(cached_path('https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/DeepPhonemizer/en_us_cmudict_ipa_forward.pt')))
|
| 79 |
|
| 80 |
|
| 81 |
# config = yaml.safe_load(open("Models/LibriTTS/config.yml"))
|
|
|
|
| 133 |
|
| 134 |
def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):
|
| 135 |
text = text.strip()
|
| 136 |
+
ps = global_phonemizer.phonemize([text])
|
| 137 |
ps = word_tokenize(ps[0])
|
| 138 |
ps = ' '.join(ps)
|
| 139 |
tokens = textclenaer(ps)
|
|
|
|
| 202 |
|
| 203 |
def LFinference(text, s_prev, ref_s, alpha = 0.3, beta = 0.7, t = 0.7, diffusion_steps=5, embedding_scale=1):
|
| 204 |
text = text.strip()
|
| 205 |
+
ps = global_phonemizer.phonemize([text])
|
| 206 |
ps = word_tokenize(ps[0])
|
| 207 |
ps = ' '.join(ps)
|
| 208 |
ps = ps.replace('``', '"')
|
|
|
|
| 279 |
|
| 280 |
def STinference(text, ref_s, ref_text, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):
|
| 281 |
text = text.strip()
|
| 282 |
+
ps = global_phonemizer.phonemize([text])
|
| 283 |
ps = word_tokenize(ps[0])
|
| 284 |
ps = ' '.join(ps)
|
| 285 |
|
|
|
|
| 288 |
tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
|
| 289 |
|
| 290 |
ref_text = ref_text.strip()
|
| 291 |
+
ps = global_phonemizer.phonemize([ref_text])
|
| 292 |
ps = word_tokenize(ps[0])
|
| 293 |
ps = ' '.join(ps)
|
| 294 |
|