Spaces:
Runtime error
Runtime error
add device argument
Browse files
- app.py +14 -12
- text/cleaners.py +25 -21
app.py
CHANGED
|
@@ -62,9 +62,9 @@ def create_tts_fn(model, hps, speaker_ids):
|
|
| 62 |
speaker_id = speaker_ids[speaker]
|
| 63 |
stn_tst = get_text(text, hps, is_symbol)
|
| 64 |
with no_grad():
|
| 65 |
-
x_tst = stn_tst.unsqueeze(0)
|
| 66 |
-
x_tst_lengths = LongTensor([stn_tst.size(0)])
|
| 67 |
-
sid = LongTensor([speaker_id])
|
| 68 |
audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
|
| 69 |
length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
|
| 70 |
del stn_tst, x_tst, x_tst_lengths, sid
|
|
@@ -94,10 +94,10 @@ def create_vc_fn(model, hps, speaker_ids):
|
|
| 94 |
y = y.unsqueeze(0)
|
| 95 |
spec = spectrogram_torch(y, hps.data.filter_length,
|
| 96 |
hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
|
| 97 |
-
center=False)
|
| 98 |
-
spec_lengths = LongTensor([spec.size(-1)])
|
| 99 |
-
sid_src = LongTensor([original_speaker_id])
|
| 100 |
-
sid_tgt = LongTensor([target_speaker_id])
|
| 101 |
audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
|
| 102 |
0, 0].data.cpu().float().numpy()
|
| 103 |
del y, spec, spec_lengths, sid_src, sid_tgt
|
|
@@ -125,10 +125,10 @@ def create_soft_vc_fn(model, hps, speaker_ids):
|
|
| 125 |
if sampling_rate != 16000:
|
| 126 |
audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
|
| 127 |
with torch.inference_mode():
|
| 128 |
-
units = hubert.units(torch.FloatTensor(audio).unsqueeze(0).unsqueeze(0))
|
| 129 |
with no_grad():
|
| 130 |
-
unit_lengths = LongTensor([units.size(1)])
|
| 131 |
-
sid = LongTensor([target_speaker_id])
|
| 132 |
audio = model.infer(units, unit_lengths, sid=sid, noise_scale=.667,
|
| 133 |
noise_scale_w=0.8)[0][0, 0].data.cpu().float().numpy()
|
| 134 |
del units, unit_lengths, sid
|
|
@@ -147,9 +147,11 @@ def create_to_symbol_fn(hps):
|
|
| 147 |
|
| 148 |
if __name__ == '__main__':
|
| 149 |
parser = argparse.ArgumentParser()
|
|
|
|
| 150 |
parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
|
| 151 |
args = parser.parse_args()
|
| 152 |
|
|
|
|
| 153 |
models_tts = []
|
| 154 |
models_vc = []
|
| 155 |
models_soft_vc = []
|
|
@@ -171,7 +173,7 @@ if __name__ == '__main__':
|
|
| 171 |
n_speakers=hps.data.n_speakers,
|
| 172 |
**hps.model)
|
| 173 |
utils.load_checkpoint(model_path, model, None)
|
| 174 |
-
model.eval()
|
| 175 |
speaker_ids = [sid for sid, name in enumerate(hps.speakers) if name != "None"]
|
| 176 |
speakers = [name for sid, name in enumerate(hps.speakers) if name != "None"]
|
| 177 |
|
|
@@ -184,7 +186,7 @@ if __name__ == '__main__':
|
|
| 184 |
elif t == "soft-vits-vc":
|
| 185 |
models_soft_vc.append((name, cover_path, speakers, create_soft_vc_fn(model, hps, speaker_ids)))
|
| 186 |
|
| 187 |
-
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True)
|
| 188 |
|
| 189 |
app = gr.Blocks()
|
| 190 |
|
|
|
|
| 62 |
speaker_id = speaker_ids[speaker]
|
| 63 |
stn_tst = get_text(text, hps, is_symbol)
|
| 64 |
with no_grad():
|
| 65 |
+
x_tst = stn_tst.unsqueeze(0).to(device)
|
| 66 |
+
x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
|
| 67 |
+
sid = LongTensor([speaker_id]).to(device)
|
| 68 |
audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
|
| 69 |
length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
|
| 70 |
del stn_tst, x_tst, x_tst_lengths, sid
|
|
|
|
| 94 |
y = y.unsqueeze(0)
|
| 95 |
spec = spectrogram_torch(y, hps.data.filter_length,
|
| 96 |
hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
|
| 97 |
+
center=False).to(device)
|
| 98 |
+
spec_lengths = LongTensor([spec.size(-1)]).to(device)
|
| 99 |
+
sid_src = LongTensor([original_speaker_id]).to(device)
|
| 100 |
+
sid_tgt = LongTensor([target_speaker_id]).to(device)
|
| 101 |
audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
|
| 102 |
0, 0].data.cpu().float().numpy()
|
| 103 |
del y, spec, spec_lengths, sid_src, sid_tgt
|
|
|
|
| 125 |
if sampling_rate != 16000:
|
| 126 |
audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
|
| 127 |
with torch.inference_mode():
|
| 128 |
+
units = hubert.units(torch.FloatTensor(audio).unsqueeze(0).unsqueeze(0).to(device))
|
| 129 |
with no_grad():
|
| 130 |
+
unit_lengths = LongTensor([units.size(1)]).to(device)
|
| 131 |
+
sid = LongTensor([target_speaker_id]).to(device)
|
| 132 |
audio = model.infer(units, unit_lengths, sid=sid, noise_scale=.667,
|
| 133 |
noise_scale_w=0.8)[0][0, 0].data.cpu().float().numpy()
|
| 134 |
del units, unit_lengths, sid
|
|
|
|
| 147 |
|
| 148 |
if __name__ == '__main__':
|
| 149 |
parser = argparse.ArgumentParser()
|
| 150 |
+
parser.add_argument('--device', type=str, default='cpu')
|
| 151 |
parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
|
| 152 |
args = parser.parse_args()
|
| 153 |
|
| 154 |
+
device = torch.device(args.device)
|
| 155 |
models_tts = []
|
| 156 |
models_vc = []
|
| 157 |
models_soft_vc = []
|
|
|
|
| 173 |
n_speakers=hps.data.n_speakers,
|
| 174 |
**hps.model)
|
| 175 |
utils.load_checkpoint(model_path, model, None)
|
| 176 |
+
model.eval().to(device)
|
| 177 |
speaker_ids = [sid for sid, name in enumerate(hps.speakers) if name != "None"]
|
| 178 |
speakers = [name for sid, name in enumerate(hps.speakers) if name != "None"]
|
| 179 |
|
|
|
|
| 186 |
elif t == "soft-vits-vc":
|
| 187 |
models_soft_vc.append((name, cover_path, speakers, create_soft_vc_fn(model, hps, speaker_ids)))
|
| 188 |
|
| 189 |
+
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True).to(device)
|
| 190 |
|
| 191 |
app = gr.Blocks()
|
| 192 |
|
text/cleaners.py
CHANGED
|
@@ -1,4 +1,7 @@
|
|
| 1 |
import re
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
|
| 4 |
def japanese_cleaners(text):
|
|
@@ -36,9 +39,9 @@ def zh_ja_mixture_cleaners(text):
|
|
| 36 |
from text.mandarin import chinese_to_romaji
|
| 37 |
from text.japanese import japanese_to_romaji_with_accent
|
| 38 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
| 39 |
-
lambda x: chinese_to_romaji(x.group(1))+' ', text)
|
| 40 |
text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent(
|
| 41 |
-
x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')+' ', text)
|
| 42 |
text = re.sub(r'\s+$', '', text)
|
| 43 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
| 44 |
return text
|
|
@@ -58,15 +61,15 @@ def cjks_cleaners(text):
|
|
| 58 |
from text.sanskrit import devanagari_to_ipa
|
| 59 |
from text.english import english_to_lazy_ipa
|
| 60 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
| 61 |
-
lambda x: chinese_to_lazy_ipa(x.group(1))+' ', text)
|
| 62 |
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
| 63 |
-
lambda x: japanese_to_ipa(x.group(1))+' ', text)
|
| 64 |
text = re.sub(r'\[KO\](.*?)\[KO\]',
|
| 65 |
-
lambda x: korean_to_lazy_ipa(x.group(1))+' ', text)
|
| 66 |
text = re.sub(r'\[SA\](.*?)\[SA\]',
|
| 67 |
-
lambda x: devanagari_to_ipa(x.group(1))+' ', text)
|
| 68 |
text = re.sub(r'\[EN\](.*?)\[EN\]',
|
| 69 |
-
lambda x: english_to_lazy_ipa(x.group(1))+' ', text)
|
| 70 |
text = re.sub(r'\s+$', '', text)
|
| 71 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
| 72 |
return text
|
|
@@ -78,13 +81,13 @@ def cjke_cleaners(text):
|
|
| 78 |
from text.korean import korean_to_ipa
|
| 79 |
from text.english import english_to_ipa2
|
| 80 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace(
|
| 81 |
-
'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')+' ', text)
|
| 82 |
text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace(
|
| 83 |
-
'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')+' ', text)
|
| 84 |
text = re.sub(r'\[KO\](.*?)\[KO\]',
|
| 85 |
-
lambda x: korean_to_ipa(x.group(1))+' ', text)
|
| 86 |
text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace(
|
| 87 |
-
'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')+' ', text)
|
| 88 |
text = re.sub(r'\s+$', '', text)
|
| 89 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
| 90 |
return text
|
|
@@ -96,13 +99,13 @@ def cjke_cleaners2(text):
|
|
| 96 |
from text.korean import korean_to_ipa
|
| 97 |
from text.english import english_to_ipa2
|
| 98 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
| 99 |
-
lambda x: chinese_to_ipa(x.group(1))+' ', text)
|
| 100 |
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
| 101 |
-
lambda x: japanese_to_ipa2(x.group(1))+' ', text)
|
| 102 |
text = re.sub(r'\[KO\](.*?)\[KO\]',
|
| 103 |
-
lambda x: korean_to_ipa(x.group(1))+' ', text)
|
| 104 |
text = re.sub(r'\[EN\](.*?)\[EN\]',
|
| 105 |
-
lambda x: english_to_ipa2(x.group(1))+' ', text)
|
| 106 |
text = re.sub(r'\s+$', '', text)
|
| 107 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
| 108 |
return text
|
|
@@ -130,17 +133,18 @@ def chinese_dialect_cleaners(text):
|
|
| 130 |
from text.english import english_to_lazy_ipa2
|
| 131 |
from text.ngu_dialect import ngu_dialect_to_ipa
|
| 132 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
| 133 |
-
lambda x: chinese_to_ipa2(x.group(1))+' ', text)
|
| 134 |
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
| 135 |
-
lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text)
|
| 136 |
text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5',
|
| 137 |
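-
'˧˧˦').replace('6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace('ᴀ', 'ɐ').replace('ᴇ', 'e')+' ', text)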
|
|
|
|
| 138 |
text = re.sub(r'\[GD\](.*?)\[GD\]',
|
| 139 |
-
lambda x: cantonese_to_ipa(x.group(1))+' ', text)
|
| 140 |
text = re.sub(r'\[EN\](.*?)\[EN\]',
|
| 141 |
-
lambda x: english_to_lazy_ipa2(x.group(1))+' ', text)
|
| 142 |
text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group(
|
| 143 |
-
1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ')+' ', text)
|
| 144 |
text = re.sub(r'\s+$', '', text)
|
| 145 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
| 146 |
return text
|
|
|
|
| 1 |
import re
|
| 2 |
+
import pyopenjtalk
|
| 3 |
+
|
| 4 |
+
pyopenjtalk._lazy_init()
|
| 5 |
|
| 6 |
|
| 7 |
def japanese_cleaners(text):
|
|
|
|
| 39 |
from text.mandarin import chinese_to_romaji
|
| 40 |
from text.japanese import japanese_to_romaji_with_accent
|
| 41 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
| 42 |
+
lambda x: chinese_to_romaji(x.group(1)) + ' ', text)
|
| 43 |
text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent(
|
| 44 |
+
x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…') + ' ', text)
|
| 45 |
text = re.sub(r'\s+$', '', text)
|
| 46 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
| 47 |
return text
|
|
|
|
| 61 |
from text.sanskrit import devanagari_to_ipa
|
| 62 |
from text.english import english_to_lazy_ipa
|
| 63 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
| 64 |
+
lambda x: chinese_to_lazy_ipa(x.group(1)) + ' ', text)
|
| 65 |
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
| 66 |
+
lambda x: japanese_to_ipa(x.group(1)) + ' ', text)
|
| 67 |
text = re.sub(r'\[KO\](.*?)\[KO\]',
|
| 68 |
+
lambda x: korean_to_lazy_ipa(x.group(1)) + ' ', text)
|
| 69 |
text = re.sub(r'\[SA\](.*?)\[SA\]',
|
| 70 |
+
lambda x: devanagari_to_ipa(x.group(1)) + ' ', text)
|
| 71 |
text = re.sub(r'\[EN\](.*?)\[EN\]',
|
| 72 |
+
lambda x: english_to_lazy_ipa(x.group(1)) + ' ', text)
|
| 73 |
text = re.sub(r'\s+$', '', text)
|
| 74 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
| 75 |
return text
|
|
|
|
| 81 |
from text.korean import korean_to_ipa
|
| 82 |
from text.english import english_to_ipa2
|
| 83 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace(
|
| 84 |
+
'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn') + ' ', text)
|
| 85 |
text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace(
|
| 86 |
+
'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz') + ' ', text)
|
| 87 |
text = re.sub(r'\[KO\](.*?)\[KO\]',
|
| 88 |
+
lambda x: korean_to_ipa(x.group(1)) + ' ', text)
|
| 89 |
text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace(
|
| 90 |
+
'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u') + ' ', text)
|
| 91 |
text = re.sub(r'\s+$', '', text)
|
| 92 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
| 93 |
return text
|
|
|
|
| 99 |
from text.korean import korean_to_ipa
|
| 100 |
from text.english import english_to_ipa2
|
| 101 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
| 102 |
+
lambda x: chinese_to_ipa(x.group(1)) + ' ', text)
|
| 103 |
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
| 104 |
+
lambda x: japanese_to_ipa2(x.group(1)) + ' ', text)
|
| 105 |
text = re.sub(r'\[KO\](.*?)\[KO\]',
|
| 106 |
+
lambda x: korean_to_ipa(x.group(1)) + ' ', text)
|
| 107 |
text = re.sub(r'\[EN\](.*?)\[EN\]',
|
| 108 |
+
lambda x: english_to_ipa2(x.group(1)) + ' ', text)
|
| 109 |
text = re.sub(r'\s+$', '', text)
|
| 110 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
| 111 |
return text
|
|
|
|
| 133 |
from text.english import english_to_lazy_ipa2
|
| 134 |
from text.ngu_dialect import ngu_dialect_to_ipa
|
| 135 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
| 136 |
+
lambda x: chinese_to_ipa2(x.group(1)) + ' ', text)
|
| 137 |
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
| 138 |
+
lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ') + ' ', text)
|
| 139 |
text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5',
|
| 140 |
+
'˧˧˦').replace(
|
| 141 |
+
'6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace('ᴀ', 'ɐ').replace('ᴇ', 'e') + ' ', text)
|
| 142 |
text = re.sub(r'\[GD\](.*?)\[GD\]',
|
| 143 |
+
lambda x: cantonese_to_ipa(x.group(1)) + ' ', text)
|
| 144 |
text = re.sub(r'\[EN\](.*?)\[EN\]',
|
| 145 |
+
lambda x: english_to_lazy_ipa2(x.group(1)) + ' ', text)
|
| 146 |
text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group(
|
| 147 |
+
1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ') + ' ', text)
|
| 148 |
text = re.sub(r'\s+$', '', text)
|
| 149 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
| 150 |
return text
|