init

- Dockerfile +7 -0
- melotts_training.py +799 -0
- requirements.txt +1 -0
Dockerfile
ADDED
@@ -0,0 +1,7 @@
FROM python:3.10-slim
WORKDIR /app
COPY . .
RUN pip install --no-cache-dir -r requirements.txt
EXPOSE 7860
ENV GRADIO_SERVER_NAME="0.0.0.0"
CMD ["python", "app.py"]
melotts_training.py
ADDED
@@ -0,0 +1,799 @@
# -*- coding: utf-8 -*-
"""melotts training.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1srmto1Bf7xQl7la1-5cTZOvbTnL-KWDG
"""

# Fetch notebook helper modules (`notebook_utils`, `cmd_helper`, `pip_helper`)
import requests
from pathlib import Path

if not Path("notebook_utils.py").exists():
    r = requests.get(
        url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py",
    )
    open("notebook_utils.py", "w").write(r.text)

if not Path("cmd_helper.py").exists():
    r = requests.get(
        url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py",
    )
    open("cmd_helper.py", "w").write(r.text)

if not Path("pip_helper.py").exists():
    r = requests.get(
        url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/pip_helper.py",
    )
    open("pip_helper.py", "w").write(r.text)

# !!! have to restart the session here

from pathlib import Path

from cmd_helper import clone_repo
from pip_helper import pip_install
import platform


repo_dir = Path("OpenVoice")

clone_repo("https://github.com/myshell-ai/OpenVoice")
orig_english_path = Path("OpenVoice/openvoice/text/_orig_english.py")
english_path = Path("OpenVoice/openvoice/text/english.py")

# Patch the OpenVoice text frontend to use `anyascii` instead of `unidecode`
if not orig_english_path.exists():
    english_path.rename(orig_english_path)

    with orig_english_path.open("r") as f:
        data = f.read()
    data = data.replace("unidecode", "anyascii")
    with english_path.open("w") as out_f:
        out_f.write(data)


# fix a problem with silero downloading and installing: pin the silero VAD version
with Path("OpenVoice/openvoice/se_extractor.py").open("r") as orig_file:
    data = orig_file.read()
data = data.replace('method="silero"', 'method="silero:3.0"')
with Path("OpenVoice/openvoice/se_extractor.py").open("w") as out_f:
    out_f.write(data)

# clone MeloTTS
clone_repo("https://github.com/myshell-ai/MeloTTS")

pip_install(
    "--no-deps",
    "librosa==0.9.1",
    "pydub==0.25.1",
    "tqdm",
    "inflect==7.0.0",
    "pypinyin==0.50.0",
    "openvino>=2025.0",
)
# Since we don't convert Japanese models, we have removed many heavy Japanese-related pip dependencies.
# If you want to try them, we recommend a Python 3.10 environment on Ubuntu and uncommenting the relevant lines.
pip_install(
    "--extra-index-url",
    "https://download.pytorch.org/whl/cpu",
    # "mecab-python3==1.0.9",
    "nncf",
    "wavmark>=0.0.3",
    "faster-whisper>=0.9.0",
    "eng_to_ipa==0.0.2",
    "cn2an==0.5.22",
    "jieba==0.42.1",
    "langid==1.1.6",
    "ipywebrtc",
    "anyascii==0.3.2",
    "torch>=2.1",
    "torchaudio",
    "cached_path",
    "transformers>=4.38,<5.0",
    "num2words==0.5.12",
    # "unidic_lite==1.0.8",
    # "unidic==1.1.0",
    "pykakasi==2.2.1",
    # "fugashi==1.3.0",
    "g2p_en==2.1.0",
    "jamo==0.4.1",
    "gruut[de,es,fr]==2.2.3",
    "g2pkk>=0.1.1",
    "dtw-python",
    "more-itertools",
    "tiktoken",
    "tensorboard==2.16.2",
    "loguru==0.7.2",
    "nltk",
    "gradio",
)
pip_install("--no-deps", "whisper-timestamped>=1.14.2", "openai-whisper")

if platform.system() == "Darwin":
    pip_install("numpy<2.0")

# fix the problem of `module 'botocore.exceptions' has no attribute 'HTTPClientError'`
pip_install("--upgrade", "botocore")

# download nltk data
import nltk

nltk.download("averaged_perceptron_tagger_eng")

# install unidic (only needed for Japanese)
# !python -m unidic download

# remove Japanese-related modules in MeloTTS to fix dependency issues
# If you want to use Japanese, please do not modify these files
import re

with Path("MeloTTS/melo/text/english.py").open("r", encoding="utf-8") as orig_file:
    data = orig_file.read()
japanese_import = "from .japanese import distribute_phone"
replacement_function = """
def distribute_phone(n_phone, n_word):
    phones_per_word = [0] * n_word
    for task in range(n_phone):
        min_tasks = min(phones_per_word)
        min_index = phones_per_word.index(min_tasks)
        phones_per_word[min_index] += 1
    return phones_per_word
"""
data = data.replace(japanese_import, replacement_function)  # replace the Japanese import with a local copy of the function
with Path("MeloTTS/melo/text/english.py").open("w", encoding="utf-8") as out_f:
    out_f.write(data)
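
# `distribute_phone` spreads n_phone phones as evenly as possible over n_word
# words by always topping up the currently least-loaded word, e.g.
#   distribute_phone(5, 3) -> [2, 2, 1]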

with Path("MeloTTS/melo/text/__init__.py").open("r", encoding="utf-8") as orig_file:
    data = orig_file.read()
data = data.replace("from .japanese_bert import get_bert_feature as jp_bert", "")
data = data.replace("from .spanish_bert import get_bert_feature as sp_bert", "")
data = data.replace("from .french_bert import get_bert_feature as fr_bert", "")
data = data.replace("from .korean import get_bert_feature as kr_bert", "")
# Replace the lang_bert_func_map dictionary, keeping only the keys ZH, EN, and ZH_MIX_EN
pattern = re.compile(r"lang_bert_func_map\s*=\s*\{[^}]+\}", re.DOTALL)

replacement = """lang_bert_func_map = {
    "ZH": zh_bert,
    "EN": en_bert,
    "ZH_MIX_EN": zh_mix_en_bert,
}"""
data = pattern.sub(replacement, data)

with Path("MeloTTS/melo/text/__init__.py").open("w", encoding="utf-8") as out_f:
    out_f.write(data)
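
# Note: the character class `[^}]+` already matches newlines, so the multi-line
# dict body is captured even without re.DOTALL (which only changes how `.`
# behaves); the flag is kept as harmless belt-and-braces.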

# clean the modules
for filename in ["japanese.py", "japanese_bert.py"]:
    Path(f"MeloTTS/melo/text/{filename}").write_text("", encoding="utf-8")

import os
import torch
import openvino as ov
import ipywidgets as widgets
from IPython.display import Audio, display  # `display` is needed for the widgets below
from notebook_utils import download_file, device_widget

core = ov.Core()

from openvoice.api import ToneColorConverter, OpenVoiceBaseClass
import openvoice.se_extractor as se_extractor
from melo.api import TTS

CKPT_BASE_PATH = Path("checkpoints")

base_speakers_suffix = CKPT_BASE_PATH / "base_speakers" / "ses"
converter_suffix = CKPT_BASE_PATH / "converter"

melotts_chinese_suffix = CKPT_BASE_PATH / "MeloTTS-Chinese"
melotts_english_suffix = CKPT_BASE_PATH / "MeloTTS-English-v3"


def download_from_hf_hub(repo_id, filename, local_dir="./"):
    from huggingface_hub import hf_hub_download

    local_path = Path(local_dir)
    hf_hub_download(repo_id=repo_id, filename=filename, local_dir=local_path)
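
# Note: when `local_dir` is given, hf_hub_download keeps the repo-relative
# path of the file, so e.g. "converter/checkpoint.pth" lands at
# checkpoints/converter/checkpoint.pth, which is what the *_suffix paths above expect.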


# Download OpenVoice2
download_from_hf_hub("myshell-ai/OpenVoiceV2", "converter/checkpoint.pth", CKPT_BASE_PATH)
download_from_hf_hub("myshell-ai/OpenVoiceV2", "converter/config.json", CKPT_BASE_PATH)

download_from_hf_hub("myshell-ai/OpenVoiceV2", "base_speakers/ses/en-newest.pth", CKPT_BASE_PATH)
download_from_hf_hub("myshell-ai/OpenVoiceV2", "base_speakers/ses/zh.pth", CKPT_BASE_PATH)

# Download MeloTTS
download_from_hf_hub("myshell-ai/MeloTTS-Chinese", "checkpoint.pth", melotts_chinese_suffix)
download_from_hf_hub("myshell-ai/MeloTTS-Chinese", "config.json", melotts_chinese_suffix)
download_from_hf_hub("myshell-ai/MeloTTS-English-v3", "checkpoint.pth", melotts_english_suffix)
download_from_hf_hub("myshell-ai/MeloTTS-English-v3", "config.json", melotts_english_suffix)


class OVSynthesizerTTSWrapper(torch.nn.Module):
    """
    Wrapper for the SynthesizerTrn model from MeloTTS that exposes its
    `infer` method through a Torch-style `forward` call.
    """

    def __init__(self, model, language):
        super().__init__()
        self.model = model
        self.language = language

    def forward(
        self,
        x,
        x_lengths,
        sid,
        tone,
        language,
        bert,
        ja_bert,
        noise_scale,
        length_scale,
        noise_scale_w,
        sdp_ratio,
    ):
        """
        Forward call to the underlying SynthesizerTrn model: the positional
        inputs are passed straight through to its inference method.
        """
        return self.model.infer(
            x,
            x_lengths,
            sid,
            tone,
            language,
            bert,
            ja_bert,
            sdp_ratio=sdp_ratio,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
        )

    def get_example_input(self):
        """
        Return a tuple of example inputs for tracing/export or debugging.
        The SynthesizerTrn model has been found to be very sensitive to the
        example_input used for model conversion, so we either follow simple
        rules that mimic real inputs or use real input data.
        """

        def gen_interleaved_random_tensor(length, value_range):
            """Generate a tensor in the format [0, val, 0, val, ..., 0], val ∈ [low, high)."""
            values = [0 if i % 2 == 0 else torch.randint(*value_range, (1,)).item() for i in range(length)]
            return torch.tensor([values], dtype=torch.int64).to(pt_device)

        def gen_interleaved_fixed_tensor(length, fixed_value):
            """Generate a tensor in the format [0, val, 0, val, ..., 0]."""
            interleaved = [0 if i % 2 == 0 else fixed_value for i in range(length)]
            return torch.tensor([interleaved], dtype=torch.int64).to(pt_device)
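
        # The zero interleaving mimics what the MeloTTS text frontend feeds the
        # model: symbol ids interspersed with blank (0) tokens, VITS-style, e.g.
        #   gen_interleaved_fixed_tensor(5, 2) -> tensor([[0, 2, 0, 2, 0]])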

        if self.language == "EN_NEWEST":
            seq_len = 73
            x_tst = gen_interleaved_random_tensor(seq_len, (14, 220))
            x_tst[0, :3] = 0  # zero out the leading positions of the single batch row
            x_tst[0, -3:] = 0  # ... and the trailing ones
            x_tst_lengths = torch.tensor([seq_len], dtype=torch.int64).to(pt_device)
            speakers = torch.tensor([0], dtype=torch.int64).to(pt_device)  # this model has only one fixed speaker id
            tones = gen_interleaved_random_tensor(seq_len, (5, 10))
            lang_ids = gen_interleaved_fixed_tensor(seq_len, 2)  # lang_id for English
            bert = torch.randn((1, 1024, seq_len), dtype=torch.float32).to(pt_device)
            ja_bert = torch.randn((1, 768, seq_len), dtype=torch.float32).to(pt_device)
            sdp_ratio = torch.tensor(0.2).to(pt_device)
            noise_scale = torch.tensor(0.6).to(pt_device)
            noise_scale_w = torch.tensor(0.8).to(pt_device)
            length_scale = torch.tensor(1.0).to(pt_device)
        elif self.language == "ZH":
            seq_len = 37
            x_tst = gen_interleaved_random_tensor(seq_len, (7, 100))
            x_tst[0, :3] = 0
            x_tst[0, -3:] = 0
            x_tst_lengths = torch.tensor([seq_len], dtype=torch.int64).to(pt_device)
            speakers = torch.tensor([1], dtype=torch.int64).to(pt_device)  # this model has only one fixed speaker id
            tones = gen_interleaved_random_tensor(seq_len, (4, 9))
            lang_ids = gen_interleaved_fixed_tensor(seq_len, 3)  # lang_id for Chinese
            bert = torch.zeros((1, 1024, seq_len), dtype=torch.float32).to(pt_device)
            ja_bert = torch.randn((1, 768, seq_len), dtype=torch.float32).to(pt_device)
            sdp_ratio = torch.tensor(0.2).to(pt_device)
            noise_scale = torch.tensor(0.6).to(pt_device)
            noise_scale_w = torch.tensor(0.8).to(pt_device)
            length_scale = torch.tensor(1.0).to(pt_device)
        else:
            raise ValueError(f"No example input defined for language {self.language!r}")
        return (
            x_tst,
            x_tst_lengths,
            speakers,
            tones,
            lang_ids,
            bert,
            ja_bert,
            noise_scale,
            length_scale,
            noise_scale_w,
            sdp_ratio,
        )


class OVOpenVoiceConverter(torch.nn.Module):
    def __init__(self, voice_model: OpenVoiceBaseClass):
        super().__init__()
        self.voice_model = voice_model
        for par in voice_model.model.parameters():
            par.requires_grad = False

    def get_example_input(self):
        y = torch.randn([1, 513, 238], dtype=torch.float32)
        y_lengths = torch.LongTensor([y.size(-1)])
        target_se = torch.randn(1, 256, 1)
        source_se = torch.randn(1, 256, 1)
        tau = torch.tensor(0.3)
        return (y, y_lengths, source_se, target_se, tau)

    def forward(self, y, y_lengths, sid_src, sid_tgt, tau):
        """
        Wraps the model's `voice_conversion` method as `forward`.
        """
        return self.voice_model.model.voice_conversion(y, y_lengths, sid_src, sid_tgt, tau)
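
# Example input for the converter: a (1, 513, 238) spectrogram-like tensor
# (presumably 513 frequency bins by 238 frames), two 256-dim speaker
# embeddings, and the conversion strength tau.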

pt_device = "cpu"

melo_tts_en_newest = TTS(
    "EN_NEWEST",
    pt_device,
    use_hf=False,
    config_path=melotts_english_suffix / "config.json",
    ckpt_path=melotts_english_suffix / "checkpoint.pth",
)
melo_tts_zh = TTS(
    "ZH",
    pt_device,
    use_hf=False,
    config_path=melotts_chinese_suffix / "config.json",
    ckpt_path=melotts_chinese_suffix / "checkpoint.pth",
)

tone_color_converter = ToneColorConverter(converter_suffix / "config.json", device=pt_device)
tone_color_converter.load_ckpt(converter_suffix / "checkpoint.pth")
print(f"ToneColorConverter version: {tone_color_converter.version}")

import nncf


IRS_PATH = Path("openvino_irs/")
EN_TTS_IR = IRS_PATH / "melo_tts_en_newest.xml"
ZH_TTS_IR = IRS_PATH / "melo_tts_zh.xml"
VOICE_CONVERTER_IR = IRS_PATH / "openvoice2_tone_conversion.xml"

paths = [EN_TTS_IR, ZH_TTS_IR, VOICE_CONVERTER_IR]
models = [
    OVSynthesizerTTSWrapper(melo_tts_en_newest.model, "EN_NEWEST"),
    OVSynthesizerTTSWrapper(melo_tts_zh.model, "ZH"),
    OVOpenVoiceConverter(tone_color_converter),
]

ov_models = []

for model, path in zip(models, paths):
    if not path.exists():
        ov_model = ov.convert_model(model, example_input=model.get_example_input())
        ov_model = nncf.compress_weights(ov_model)
        ov.save_model(ov_model, path)
    else:
        ov_model = core.read_model(path)
    ov_models.append(ov_model)

ov_en_tts, ov_zh_tts, ov_voice_conversion = ov_models
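
# Conversion flow: trace each wrapper with its example input via
# ov.convert_model, compress the weights with nncf.compress_weights
# (weight-only compression, INT8 by default), and save the IR; later runs
# skip conversion and just read the cached IR back with core.read_model.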

core = ov.Core()

device = device_widget("CPU", exclude=["NPU"])
device

REFERENCE_VOICES_PATH = f"{repo_dir}/resources/"
reference_speakers = [
    *[path for path in os.listdir(REFERENCE_VOICES_PATH) if os.path.splitext(path)[-1] == ".mp3"],
    "record_manually",
    "load_manually",
]

ref_speaker = widgets.Dropdown(
    options=reference_speakers,
    value=reference_speakers[0],
    description="reference voice from which tone color will be copied",
    disabled=False,
)

ref_speaker

OUTPUT_DIR = Path("outputs/")
OUTPUT_DIR.mkdir(exist_ok=True)

ref_speaker_path = f"{REFERENCE_VOICES_PATH}/{ref_speaker.value}"
allowed_audio_types = ".mp4,.mp3,.wav,.wma,.aac,.m4a,.m4b,.webm"

if ref_speaker.value == "record_manually":
    ref_speaker_path = OUTPUT_DIR / "custom_example_sample.webm"
    from ipywebrtc import AudioRecorder, CameraStream

    camera = CameraStream(constraints={"audio": True, "video": False})
    recorder = AudioRecorder(stream=camera, filename=ref_speaker_path, autosave=True)
    display(recorder)
elif ref_speaker.value == "load_manually":
    upload_ref = widgets.FileUpload(
        accept=allowed_audio_types,
        multiple=False,
        description="Select audio with reference voice",
    )
    display(upload_ref)


def save_audio(voice_source: widgets.FileUpload, out_path: str):
    assert len(voice_source.value) > 0, "Please select an audio file"
    with open(out_path, "wb") as output_file:
        output_file.write(voice_source.value[0]["content"])


if ref_speaker.value == "load_manually":
    ref_speaker_path = f"{OUTPUT_DIR}/{upload_ref.value[0].name}"
    save_audio(upload_ref, ref_speaker_path)

Audio(ref_speaker_path)

# Commented out IPython magic to ensure Python compatibility.

torch_hub_local = Path("torch_hub_local/")
# %env TORCH_HOME={str(torch_hub_local.absolute())}

# second step to fix a problem with silero downloading and installing
import os
import zipfile

url = "https://github.com/snakers4/silero-vad/zipball/v3.0"

torch_hub_dir = torch_hub_local / "hub"
torch.hub.set_dir(torch_hub_dir.as_posix())

zip_filename = "v3.0.zip"
output_path = torch_hub_dir / "v3.0"
if not (torch_hub_dir / zip_filename).exists():
    download_file(url, directory=torch_hub_dir, filename=zip_filename)
with zipfile.ZipFile((torch_hub_dir / zip_filename).as_posix(), "r") as zip_ref:
    zip_ref.extractall(path=output_path.as_posix())

v3_dirs = [d for d in output_path.iterdir() if "snakers4-silero-vad" in d.as_posix()]
if len(v3_dirs) > 0 and not (torch_hub_dir / "snakers4_silero-vad_v3.0").exists():
    v3_dir = str(v3_dirs[0])
    os.rename(v3_dir, (torch_hub_dir / "snakers4_silero-vad_v3.0").as_posix())
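
# torch.hub caches GitHub repos under <hub_dir>/<owner>_<repo>_<ref>, so the
# extracted zipball directory (named like "snakers4-silero-vad-<hash>") is
# renamed to "snakers4_silero-vad_v3.0" to make torch.hub treat silero-vad
# v3.0 as already downloaded instead of fetching it at runtime.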

en_source_newest_se = torch.load(base_speakers_suffix / "en-newest.pth")
zh_source_se = torch.load(base_speakers_suffix / "zh.pth")

target_se, audio_name = se_extractor.get_se(ref_speaker_path, tone_color_converter, target_dir=OUTPUT_DIR, vad=True)
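
# se_extractor.get_se runs VAD over the reference recording, keeps the voiced
# segments, and extracts the speaker ("tone color") embedding that the
# converter later pushes the generated audio towards.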


def get_patched_infer(ov_model: ov.Model, device: str) -> callable:
    compiled_model = core.compile_model(ov_model, device)

    def infer_impl(
        x,
        x_lengths,
        sid,
        tone,
        language,
        bert,
        ja_bert,
        noise_scale,
        length_scale,
        noise_scale_w,
        max_len=None,
        sdp_ratio=1.0,
        y=None,
        g=None,
    ):
        ov_output = compiled_model(
            (
                x,
                x_lengths,
                sid,
                tone,
                language,
                bert,
                ja_bert,
                noise_scale,
                length_scale,
                noise_scale_w,
                sdp_ratio,
            )
        )
        return (torch.tensor(ov_output[0]),)

    return infer_impl


def get_patched_voice_conversion(ov_model: ov.Model, device: str) -> callable:
    compiled_model = core.compile_model(ov_model, device)

    def voice_conversion_impl(y, y_lengths, sid_src, sid_tgt, tau):
        ov_output = compiled_model((y, y_lengths, sid_src, sid_tgt, tau))
        return (torch.tensor(ov_output[0]),)

    return voice_conversion_impl


melo_tts_en_newest.model.infer = get_patched_infer(ov_en_tts, device.value)
melo_tts_zh.model.infer = get_patched_infer(ov_zh_tts, device.value)
tone_color_converter.model.voice_conversion = get_patched_voice_conversion(ov_voice_conversion, device.value)
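
# Monkey-patching: infer_impl and voice_conversion_impl mirror the signatures
# of the original torch methods and return tuples of torch tensors, so
# tts_to_file and ToneColorConverter.convert below run on the compiled
# OpenVINO models without further changes.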

voice_source = widgets.Dropdown(
    options=["use TTS", "choose_manually"],
    value="use TTS",
    description="Voice source",
    disabled=False,
)

voice_source

if voice_source.value == "choose_manually":
    upload_orig_voice = widgets.FileUpload(
        accept=allowed_audio_types,
        multiple=False,
        description="audio whose tone will be replaced",
    )
    display(upload_orig_voice)

from IPython.display import Audio, display

if voice_source.value == "choose_manually":
    orig_voice_path = f"{OUTPUT_DIR}/{upload_orig_voice.value[0].name}"
    save_audio(upload_orig_voice, orig_voice_path)
    source_se, _ = se_extractor.get_se(orig_voice_path, tone_color_converter, target_dir=OUTPUT_DIR, vad=True)
else:
    en_text = """
    I love going to school by bus
    """
    # source_se = en_source_newest_se
    en_orig_voice_path = OUTPUT_DIR / "output_ov_en-newest.wav"
    print("use output_ov_en-newest.wav")
    speaker_id = 0  # speaker id expected by the English model
    melo_tts_en_newest.tts_to_file(en_text, speaker_id, en_orig_voice_path, speed=1.0)
    zh_text = """
    OpenVINO 是一个全面的开发工具集,旨在快速开发和部署各类应用程序及解决方案,可用于模仿人类视觉、自动语音识别、自然语言处理、
    推荐系统等多种任务。
    """
    # Chinese demo text: "OpenVINO is a comprehensive development toolset for quickly building and
    # deploying applications and solutions for tasks such as emulating human vision, automatic
    # speech recognition, natural language processing, and recommender systems."
    # source_se = zh_source_se
    zh_orig_voice_path = OUTPUT_DIR / "output_ov_zh.wav"
    print("use output_ov_zh.wav")
    speaker_id = 1  # speaker id expected by the Chinese model
    melo_tts_zh.tts_to_file(zh_text, speaker_id, zh_orig_voice_path, speed=1.0)
    print("Playing English Original voice")
    display(Audio(en_orig_voice_path))
    print("Playing Chinese Original voice")
    display(Audio(zh_orig_voice_path))

tau_slider = widgets.FloatSlider(
    value=0.3,
    min=0.01,
    max=2.0,
    step=0.01,
    description="tau",
    disabled=False,
    readout_format=".2f",
)
tau_slider
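
# The slider value is forwarded unchanged to ToneColorConverter.convert below;
# 0.3 is the default tau used throughout this script.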

from IPython.display import Audio, display

if voice_source.value == "choose_manually":
    resulting_voice_path = OUTPUT_DIR / "output_ov_cloned.wav"
    tone_color_converter.convert(
        audio_src_path=orig_voice_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=resulting_voice_path,
        tau=tau_slider.value,
        message="@MyShell",
    )
    print("Playing manually chosen cloned voice:")
    display(Audio(resulting_voice_path))
else:
    en_resulting_voice_path = OUTPUT_DIR / "output_ov_en-newest_cloned.wav"
    zh_resulting_voice_path = OUTPUT_DIR / "output_ov_zh_cloned.wav"

    tone_color_converter.convert(
        audio_src_path=en_orig_voice_path,
        src_se=en_source_newest_se,
        tgt_se=target_se,
        output_path=en_resulting_voice_path,
        tau=tau_slider.value,
        message="@MyShell",
    )
    tone_color_converter.convert(
        audio_src_path=zh_orig_voice_path,
        src_se=zh_source_se,
        tgt_se=target_se,
        output_path=zh_resulting_voice_path,
        tau=tau_slider.value,
        message="@MyShell",
    )
    print("Playing English cloned voice:")
    display(Audio(en_resulting_voice_path))
    print("Playing Chinese cloned voice:")
    display(Audio(zh_resulting_voice_path))
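
# The `message` string is embedded into the converted audio as an audio
# watermark (OpenVoice uses the wavmark package installed above for this).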

import gradio as gr
import langid

supported_languages = ["zh", "en"]
supported_styles = {
    "zh": [
        "zh_default",
    ],
    "en": [
        "en_latest",
    ],
}


def predict_impl(
    prompt,
    style,
    audio_file_pth,
    agree,
    output_dir,
    tone_color_converter,
    en_tts_model,
    zh_tts_model,
    en_source_se,
    zh_source_se,
):
    text_hint = ""
    if not agree:
        text_hint += "[ERROR] Please accept the Terms & Conditions!\n"
        gr.Warning("Please accept the Terms & Conditions!")
        return (
            text_hint,
            None,
            None,
        )

    language_predicted = langid.classify(prompt)[0].strip()  # langid.classify returns (language code, score)

    if language_predicted not in supported_languages:
        text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our supported languages: {supported_languages}\n"
        gr.Warning(f"The detected language {language_predicted} for your input text is not in our supported languages: {supported_languages}")

        return (
            text_hint,
            None,
            None,
        )

    # check the style
    if style not in supported_styles[language_predicted]:
        text_hint += f"[WARNING] The style {style} is not supported for detected language {language_predicted}. For language {language_predicted}, we support styles: {supported_styles[language_predicted]}. Using the wrong style may result in unexpected behavior.\n"
        gr.Warning(
            f"[WARNING] The style {style} is not supported for detected language {language_predicted}. For language {language_predicted}, we support styles: {supported_styles[language_predicted]}. Using the wrong style may result in unexpected behavior."
        )

    if len(prompt.split()) < 2:
        text_hint += "[ERROR] Please give a longer prompt text\n"
        gr.Warning("Please give a longer prompt text")
        return (
            text_hint,
            None,
            None,
        )
    if len(prompt.split()) > 50:
        text_hint += "[ERROR] Text length is limited to 50 words for this demo; please try a shorter text. You can clone our open-source repo or try it on our website https://app.myshell.ai/robot-workshop/widget/174760057433406749\n"
        gr.Warning(
            "Text length is limited to 50 words for this demo; please try a shorter text. You can clone our open-source repo or try it on our website https://app.myshell.ai/robot-workshop/widget/174760057433406749"
        )
        return (
            text_hint,
            None,
            None,
        )

    speaker_wav = audio_file_pth

    if language_predicted == "zh":
        tts_model = zh_tts_model
        if zh_tts_model is None:
            gr.Warning("TTS model for the Chinese language was not loaded")
            return (
                text_hint,
                None,
                None,
            )
        source_se = zh_source_se
        speaker_id = 1

    else:
        tts_model = en_tts_model
        if en_tts_model is None:
            gr.Warning("TTS model for the English language was not loaded")
            return (
                text_hint,
                None,
                None,
            )
        source_se = en_source_se
        speaker_id = 0

    try:
        target_se, audio_name = se_extractor.get_se(speaker_wav, tone_color_converter, target_dir=output_dir, vad=True)
    except Exception as e:
        text_hint += f"[ERROR] Get target tone color error {str(e)}\n"
        gr.Warning(f"[ERROR] Get target tone color error {str(e)}")
        return (
            text_hint,
            None,
            None,
        )

    src_path = f"{output_dir}/tmp.wav"
    tts_model.tts_to_file(prompt, speaker_id, src_path, speed=1.0)

    if tone_color_converter is None or source_se is None:
        gr.Warning("Tone Color Converter model was not loaded")
        return (
            text_hint,
            None,
            None,
        )
    save_path = f"{output_dir}/output.wav"
    encode_message = "@MyShell"
    tone_color_converter.convert(
        audio_src_path=src_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=save_path,
        tau=0.3,
        message=encode_message,
    )

    text_hint += "Got the response successfully\n"

    return (
        text_hint,
        src_path,
        save_path,
    )
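
# predict_impl pipeline: validate the request, detect the prompt language with
# langid, synthesize base speech with the matching MeloTTS model, extract the
# target tone color from the uploaded reference audio, and convert the base
# speech towards it. Returns (status text, base wav path, cloned wav path).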

from functools import partial


predict = partial(
    predict_impl,
    output_dir=OUTPUT_DIR,
    tone_color_converter=tone_color_converter,
    en_tts_model=melo_tts_en_newest,
    zh_tts_model=melo_tts_zh,
    en_source_se=en_source_newest_se,
    zh_source_se=zh_source_se,
)
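
# functools.partial pre-binds the models, embeddings, and output directory,
# so the Gradio callback only supplies (prompt, style, audio_file_pth, agree),
# which is presumably the signature that make_demo wires to the UI controls.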

import sys

if "gradio_helper" in sys.modules:
    del sys.modules["gradio_helper"]

if not Path("gradio_helper.py").exists():
    r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/openvoice/gradio_helper.py")
    open("gradio_helper.py", "w").write(r.text)

from gradio_helper import make_demo

demo = make_demo(fn=predict)

# demo.queue(max_size=1).launch(share=True, debug=True, height=1000)

demo.queue(max_size=1).launch(server_name="0.0.0.0", server_port=7860)

# try:
#     demo.queue(max_size=1).launch(debug=True, height=1000)
# except Exception:
#     demo.queue(max_size=1).launch(share=True, debug=True, height=1000)
# if you are launching remotely, specify server_name and server_port
# demo.launch(server_name='your server name', server_port='server port in int')
# Read more in the docs: https://gradio.app/docs/
requirements.txt
ADDED
@@ -0,0 +1 @@
requests