add model

Files changed:
- app.py +38 -20
- saved_model/{config.json → 0/config.json} +0 -0
- saved_model/{model.pth → 0/model.pth} +0 -0
- saved_model/1/config.json +3 -0
- saved_model/1/model.pth +3 -0
- text/cleaners.py +47 -40
app.py
CHANGED

@@ -14,7 +14,7 @@ from text import text_to_sequence
 from mel_processing import spectrogram_torch
 
 
-def get_text(text):
+def get_text(text, hps):
     text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
     if hps.data.add_blank:
         text_norm = commons.intersperse(text_norm, 0)
@@ -22,10 +22,12 @@ def get_text(text):
     return text_norm
 
 
-def tts_fn(text, speaker_id):
+def tts_fn(text, speaker):
     if len(text) > 150:
         return "Error: Text is too long", None
-    stn_tst = get_text(text)
+    model, hps = models[model_idx[speaker]]
+    speaker_id = speaker_idx[speaker]
+    stn_tst = get_text(text, hps)
     with no_grad():
         x_tst = stn_tst.unsqueeze(0)
         x_tst_lengths = LongTensor([stn_tst.size(0)])
@@ -35,13 +37,20 @@ def tts_fn(text, speaker_id):
     return "Success", (hps.data.sampling_rate, audio)
 
 
-def vc_fn(original_speaker_id, target_speaker_id, input_audio):
+def vc_fn(original_speaker, target_speaker, input_audio):
     if input_audio is None:
         return "You need to upload an audio", None
     sampling_rate, audio = input_audio
     duration = audio.shape[0] / sampling_rate
     if duration > 30:
         return "Error: Audio is too long", None
+    if model_idx[original_speaker] != model_idx[target_speaker]:
+        return "Error: Can not convert voice between different model", None
+
+    model, hps = models[model_idx[original_speaker]]
+    original_speaker_id = speaker_idx[original_speaker]
+    target_speaker_id = speaker_idx[target_speaker]
+
     audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
     if len(audio.shape) > 1:
         audio = librosa.to_mono(audio.transpose(1, 0))
@@ -62,17 +71,26 @@ def vc_fn(original_speaker_id, target_speaker_id, input_audio):
 
 
 if __name__ == '__main__':
-    …
+    models = []
+    model_idx = []
+    speaker_idx = []
+    speakers = []
+    for i in range(0, 2):
+        config_path = f"saved_model/{i}/config.json"
+        model_path = f"saved_model/{i}/model.pth"
+        hps = utils.get_hparams_from_file(config_path)
+        model = SynthesizerTrn(
+            len(hps.symbols),
+            hps.data.filter_length // 2 + 1,
+            hps.train.segment_size // hps.data.hop_length,
+            n_speakers=hps.data.n_speakers,
+            **hps.model)
+        utils.load_checkpoint(model_path, model, None)
+        model.eval()
+        models.append((model, hps))
+        speakers = speakers + [f"model{i}/{x}" for x in hps.speakers]
+        model_idx = model_idx + [i] * len(hps.speakers)
+        speaker_idx = speaker_idx + list(range(0, len(hps.speakers)))
 
     app = gr.Blocks()
 
@@ -85,16 +103,16 @@ if __name__ == '__main__':
         with gr.TabItem("TTS"):
             with gr.Column():
                 tts_input1 = gr.TextArea(label="Text (150 words limitation)", value="こんにちは。")
-                tts_input2 = gr.Dropdown(label="Speaker", choices=…
+                tts_input2 = gr.Dropdown(label="Speaker", choices=speakers, type="index", value=speakers[0])
                 tts_submit = gr.Button("Generate", variant="primary")
                 tts_output1 = gr.Textbox(label="Output Message")
                 tts_output2 = gr.Audio(label="Output Audio")
         with gr.TabItem("Voice Conversion"):
             with gr.Column():
-                vc_input1 = gr.Dropdown(label="Original Speaker", choices=…
-                                        value=…
-                vc_input2 = gr.Dropdown(label="Target Speaker", choices=…
-                                        value=…
+                vc_input1 = gr.Dropdown(label="Original Speaker", choices=speakers, type="index",
+                                        value=speakers[0])
+                vc_input2 = gr.Dropdown(label="Target Speaker", choices=speakers, type="index",
+                                        value=speakers[1])
                 vc_input3 = gr.Audio(label="Input Audio (30s limitation)")
                 vc_submit = gr.Button("Convert", variant="primary")
                 vc_output1 = gr.Textbox(label="Output Message")
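The new __main__ block flattens every speaker of every loaded model into a single speakers list, and the dropdowns are created with type="index", so tts_fn and vc_fn receive a plain integer that is mapped back through model_idx and speaker_idx; vc_fn uses the same mapping to reject conversions between speakers that belong to different models. A minimal standalone sketch of that mapping, with made-up speaker names standing in for the real hps.speakers:

# Minimal sketch; the speaker names are illustrative, not the real hps.speakers lists.
per_model_speakers = [["Aki", "Haru"], ["Mio", "Rin", "Yui"]]

speakers, model_idx, speaker_idx = [], [], []
for i, names in enumerate(per_model_speakers):
    speakers += [f"model{i}/{x}" for x in names]   # labels shown in the Gradio dropdowns
    model_idx += [i] * len(names)                  # which loaded model owns this entry
    speaker_idx += list(range(len(names)))         # speaker id inside that model

choice = speakers.index("model1/Rin")              # Gradio passes this index (type="index")
print(model_idx[choice], speaker_idx[choice])      # -> 1 1 (second model, speaker id 1)

Because both Voice Conversion dropdowns index into the same flat list, comparing model_idx[original_speaker] with model_idx[target_speaker] is enough to detect a cross-model request.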
saved_model/{config.json → 0/config.json}
RENAMED
File without changes

saved_model/{model.pth → 0/model.pth}
RENAMED
File without changes
saved_model/1/config.json
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8022ffb2ae81ff2c84edde380bbdfc60b9ad933f767c5187d4fcfd5c964315b1
+size 1302
saved_model/1/model.pth
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f07377ad8af65adaad59315b40efe67c020f51dc526da66f4e11f812687392e
+size 158884173
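Both new files under saved_model/1/ are Git LFS pointers: three key-value lines giving the spec version, the SHA-256 object id, and the byte size of the payload that the Hub stores out of band. A small hypothetical helper (not part of this commit) showing how such a pointer could be parsed:

# Hypothetical helper, not part of this repository: parse a Git LFS pointer file
# (e.g. saved_model/1/model.pth above) into a dict of its "key value" lines.
def read_lfs_pointer(path):
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    return fields

# read_lfs_pointer("saved_model/1/model.pth")
# -> {'version': 'https://git-lfs.github.com/spec/v1', 'oid': 'sha256:8f07...', 'size': '158884173'}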
text/cleaners.py
CHANGED

@@ -1,51 +1,58 @@
 import re
 from unidecode import unidecode
 import pyopenjtalk
+
 pyopenjtalk._lazy_init()
 
 # Regular expression matching Japanese without punctuation marks:
-_japanese_characters = re.compile(…
+_japanese_characters = re.compile(
+    r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
 
 # Regular expression matching non-Japanese characters or punctuation marks:
-_japanese_marks = re.compile(…
+_japanese_marks = re.compile(
+    r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
 
 
 def japanese_cleaners(text):
-    …
+    '''Pipeline for notating accent in Japanese text.'''
+    '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html'''
+    sentences = re.split(_japanese_marks, text)
+    marks = re.findall(_japanese_marks, text)
+    text = ''
+    for i, sentence in enumerate(sentences):
+        if re.match(_japanese_characters, sentence):
+            if text != '':
+                text += ' '
+            labels = pyopenjtalk.extract_fullcontext(sentence)
+            for n, label in enumerate(labels):
+                phoneme = re.search(r'\-([^\+]*)\+', label).group(1)
+                if phoneme not in ['sil', 'pau']:
+                    text += phoneme.replace('ch', 'ʧ').replace('sh', 'ʃ').replace('cl', 'Q')
+                else:
+                    continue
+                n_moras = int(re.search(r'/F:(\d+)_', label).group(1))
+                a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
+                a2 = int(re.search(r"\+(\d+)\+", label).group(1))
+                a3 = int(re.search(r"\+(\d+)/", label).group(1))
+                if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil', 'pau']:
+                    a2_next = -1
+                else:
+                    a2_next = int(re.search(r"\+(\d+)\+", labels[n + 1]).group(1))
+                # Accent phrase boundary
+                if a3 == 1 and a2_next == 1:
+                    text += ' '
+                # Falling
+                elif a1 == 0 and a2_next == a2 + 1 and a2 != n_moras:
+                    text += '↓'
+                # Rising
+                elif a2 == 1 and a2_next == 2:
+                    text += '↑'
+        if i < len(marks):
+            text += unidecode(marks[i]).replace(' ', '')
+    if re.match('[A-Za-z]', text[-1]):
+        text += '.'
+    return text
+
+
+def japanese_cleaners2(text):
+    return japanese_cleaners(text).replace('ts', 'ʦ')
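The rewritten japanese_cleaners relies on pyopenjtalk.extract_fullcontext, which returns one HTS full-context label per phoneme; the regexes pull out the current phoneme (between '-' and '+'), the accent fields a1/a2/a3 from the /A: block, and the mora count from /F:. A schematic example of that parsing, using a stripped-down stand-in label rather than real pyopenjtalk output:

import re

# Schematic stand-in for one HTS full-context label (NOT real pyopenjtalk output;
# real labels carry many more fields). It only keeps the parts japanese_cleaners reads.
label = "xx^sil-k+o=N/A:-3+1+4/B:xx/F:4_2#0/G:xx"

phoneme = re.search(r'\-([^\+]*)\+', label).group(1)      # current phoneme -> 'k'
n_moras = int(re.search(r'/F:(\d+)_', label).group(1))    # moras in the accent phrase -> 4
a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))  # position relative to the accent nucleus -> -3
a2 = int(re.search(r"\+(\d+)\+", label).group(1))         # mora position within the phrase -> 1
a3 = int(re.search(r"\+(\d+)/", label).group(1))          # moras remaining to the phrase end -> 4
print(phoneme, n_moras, a1, a2, a3)                       # k 4 -3 1 4

Those values drive the '↑'/'↓' pitch marks and the accent-phrase spaces that the cleaner inserts into the output text.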