Spaces:
Runtime error
Runtime error
glide-the
committed on
Commit
·
8953210
1
Parent(s):
ffe5bd7
Add large files to Git LFS
Browse files- Dockerfile +21 -0
- README.md +11 -0
- requirements.txt +52 -0
- setup.py +36 -0
- start.py +4 -0
- util.py +107 -0
Dockerfile
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Base image: official CPython 3.10.
FROM python:3.10

# System packages needed by the Python dependencies and the app:
#   - cmake / gcc: building native extensions from requirements.txt
#   - portaudio19-dev: PortAudio development files (pyaudio is in requirements.txt)
#   - ffmpeg: util.py execs the `ffmpeg` binary to convert edge-tts MP3 to WAV
# apt-get is used instead of apt because apt's CLI is not meant for scripts;
# the package lists are removed afterwards to keep the layer small.
RUN apt-get update && \
    apt-get install -y cmake gcc portaudio19-dev ffmpeg && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /code

# Point numba's on-disk cache at /tmp/.
ENV NUMBA_CACHE_DIR=/tmp/

# Install requirements before copying the sources so this layer is reused
# when only application code changes.
COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

COPY . /code/

# Install the project itself in editable mode.
RUN pip install -e .

# Build the monotonic_align extension. The nested directory is created before
# the build and its contents are moved next to the sources afterwards --
# presumably because build_ext --inplace writes its output under that nested
# package path (TODO confirm against vits/monotonic_align/setup.py).
RUN cd /code/vits/monotonic_align && \
    mkdir -p /code/vits/monotonic_align/vits/monotonic_align/ && \
    python setup.py build_ext --inplace && \
    mv /code/vits/monotonic_align/vits/monotonic_align/* /code/vits/monotonic_align/

# Start the speakers package in web mode.
CMD ["python", "-m", "speakers", "--verbose", "--mode", "web"]

# 7860 is the default application port for Hugging Face Docker Spaces.
EXPOSE 7860
|
README.md
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: RVC Speakers
|
| 3 |
+
emoji: 📚
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: red
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
license: bsd-3-clause
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
requirements.txt
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Cython==0.29.21
|
| 2 |
+
# vits_text
|
| 3 |
+
unidecode
|
| 4 |
+
# pre uninstall cmake
|
| 5 |
+
pyopenjtalk
|
| 6 |
+
jamo
|
| 7 |
+
pypinyin
|
| 8 |
+
jieba
|
| 9 |
+
cn2an
|
| 10 |
+
# vits_text
|
| 11 |
+
|
| 12 |
+
nest_asyncio
|
| 13 |
+
gradio==3.33.1
|
| 14 |
+
fairseq
|
| 15 |
+
torch
|
| 16 |
+
torchaudio
|
| 17 |
+
soundfile
|
| 18 |
+
scipy==1.9.3
|
| 19 |
+
librosa==0.9.1
|
| 20 |
+
musicdl
|
| 21 |
+
pyaudio
|
| 22 |
+
torchcrepe==0.0.20
|
| 23 |
+
praat-parselmouth>=0.4.2
|
| 24 |
+
pyworld==0.3.2
|
| 25 |
+
faiss-cpu==1.7.3
|
| 26 |
+
numpy==1.23.5
|
| 27 |
+
nltk
|
| 28 |
+
|
| 29 |
+
edge-tts
|
| 30 |
+
IPython
|
| 31 |
+
tqdm
|
| 32 |
+
pandas
|
| 33 |
+
|
| 34 |
+
## bark
|
| 35 |
+
transformers
|
| 36 |
+
encodec
|
| 37 |
+
huggingface-hub>=0.14.1
|
| 38 |
+
funcy
|
| 39 |
+
|
| 40 |
+
# config manage
|
| 41 |
+
omegaconf
|
| 42 |
+
pydantic
|
| 43 |
+
|
| 44 |
+
# log
|
| 45 |
+
colorama
|
| 46 |
+
|
| 47 |
+
# server
|
| 48 |
+
fastapi~=0.99.1
|
| 49 |
+
starlette~=0.27.0
|
| 50 |
+
uvicorn~=0.23.1
|
| 51 |
+
requests
|
| 52 |
+
oscrypto
|
setup.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Copyright (c) 2022, salesforce.com, inc.
|
| 3 |
+
All rights reserved.
|
| 4 |
+
SPDX-License-Identifier: BSD-3-Clause
|
| 5 |
+
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from setuptools import setup, find_namespace_packages
|
| 9 |
+
import platform
|
| 10 |
+
|
| 11 |
+
# Extra download location handed to setup(); populated only on Windows,
# where it points at PyTorch's stable wheel listing.
_TORCH_WHEEL_INDEX = "https://download.pytorch.org/whl/torch_stable.html"
DEPENDENCY_LINKS = (
    [_TORCH_WHEEL_INDEX] if platform.system() == "Windows" else []
)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def fetch_requirements(filename):
    """Read a pip requirements file and return its requirement specifiers.

    Args:
        filename: Path to a requirements file with one entry per line.

    Returns:
        A list of whitespace-stripped lines in file order. Blank lines and
        full-line ``#`` comments are dropped because they are not requirement
        specifiers and should not be passed on as ``install_requires`` entries.
    """
    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(filename, encoding="utf-8") as f:
        stripped = (ln.strip() for ln in f.read().split("\n"))
        return [ln for ln in stripped if ln and not ln.startswith("#")]
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# Read the long description up front so the README file handle is closed
# deterministically instead of leaking from an inline open(...).read().
with open("README.md", "r", encoding="utf-8") as _readme:
    _LONG_DESCRIPTION = _readme.read()

# Package metadata and install configuration for the "speakers" distribution.
setup(
    name="speakers",
    version="0.0.1",
    author="glide-the",
    description="Ready Voice Controller , generate for End-to-End Text-to-Speech,with Multi-engine integration",
    long_description=_LONG_DESCRIPTION,
    long_description_content_type="text/markdown",
    keywords="Speakers, Multimodal, Ready Voice Controller",
    license="3-Clause BSD",
    # NOTE(review): setuptools documents ``include`` as a sequence of patterns.
    # A bare string appears to be unpacked character by character, in which
    # case the lone "*" matches every package. Left unchanged to preserve what
    # currently gets packaged -- confirm whether ["speakers", "speakers.*"]
    # was intended.
    packages=find_namespace_packages(include="speakers.*"),
    # Runtime dependencies come straight from requirements.txt.
    install_requires=fetch_requirements("requirements.txt"),
    python_requires=">=3.9.0",
    include_package_data=True,
    dependency_links=DEPENDENCY_LINKS,
    zip_safe=False,
)
|
start.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from speakers.__main__ import main
|
| 2 |
+
|
| 3 |
+
# Invoke the speakers entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
|
util.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import asyncio
|
| 3 |
+
from io import BytesIO
|
| 4 |
+
|
| 5 |
+
from fairseq import checkpoint_utils
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
|
| 9 |
+
import edge_tts
|
| 10 |
+
import librosa
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI/blob/main/config.py#L43-L55 # noqa
|
| 14 |
+
def has_mps() -> bool:
    """Report whether the Apple Metal (MPS) torch backend is actually usable.

    Returns:
        ``True`` only on macOS when torch reports MPS support *and* a tiny
        tensor can really be moved to the ``mps`` device; ``False`` otherwise.
    """
    if sys.platform != "darwin":
        return False

    # Prefer the supported query API. ``torch.has_mps`` is a legacy attribute;
    # when a torch build lacks it, ``getattr(..., False)`` would silently
    # report "no MPS". It is kept only as a fallback for builds that have no
    # ``torch.backends.mps``.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None:
        if not mps_backend.is_available():
            return False
    elif not getattr(torch, "has_mps", False):
        return False

    # The availability flag alone is not trusted: probe with a real allocation.
    try:
        torch.zeros(1).to(torch.device("mps"))
    except Exception:
        return False
    return True
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def is_half(device: str) -> bool:
    """Decide whether models on ``device`` should run in half precision.

    Args:
        device: A torch device string such as ``"cpu"``, ``"cuda"`` or
            ``"cuda:1"``.

    Returns:
        ``False`` for any non-CUDA device and for CUDA GPUs whose name matches
        the deny-list below; ``True`` for every other CUDA GPU.
    """
    if not device.startswith('cuda'):
        return False

    # A bare "cuda" carries no index (``torch.device("cuda").index`` is None).
    # Passing None makes torch report the current device, instead of crashing
    # on ``int("cuda")`` as the previous ``device.split(':')[-1]`` parsing did.
    gpu_name = torch.cuda.get_device_name(torch.device(device).index).upper()

    # NOTE(review): the '16' check presumably targets GTX 16xx cards, with
    # V100 exempted because names such as "V100-...-16GB" also contain "16"
    # -- confirm against the upstream RVC config this was copied from.
    if '16' in gpu_name and 'V100' not in gpu_name:
        return False

    fp16_denylist = ('P40', '1060', '1070', '1080')
    return not any(marker in gpu_name for marker in fp16_denylist)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def load_hubert_model(device: str, model_path: str = 'hubert_base.pt'):
    """Load a fairseq checkpoint and return its first model on ``device``.

    Args:
        device: Torch device string the model is moved to.
        model_path: Path of the checkpoint file to load.

    Returns:
        The first model of the loaded ensemble, cast to half precision when
        ``is_half(device)`` is true and to float32 otherwise.
    """
    # First element of the loader's result is the ensemble (list of models).
    ensemble = checkpoint_utils.load_model_ensemble_and_task([model_path])[0]
    hubert = ensemble[0].to(device)

    return hubert.half() if is_half(device) else hubert.float()
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
async def _collect_tts_audio(tts_com) -> bytes:
    """Drain an edge-tts stream and return the concatenated audio payload."""
    audio_chunks = []
    async for chunk in tts_com.stream():
        # Only chunks typed 'audio' are kept; every other chunk type is skipped.
        if chunk['type'] == 'audio':
            audio_chunks.append(chunk['data'])
    # Join once instead of growing a bytes object with += on every chunk.
    return b''.join(audio_chunks)


async def _mp3_to_wav(mp3_bytes: bytes) -> bytes:
    """Convert MP3 bytes to WAV bytes by piping them through ``ffmpeg``."""
    ffmpeg_proc = await asyncio.create_subprocess_exec(
        'ffmpeg',
        '-f', 'mp3',
        '-i', '-',
        '-f', 'wav',
        '-loglevel', 'error',
        '-',
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE
    )
    wav_bytes, _ = await ffmpeg_proc.communicate(mp3_bytes)

    # Fail loudly here rather than handing truncated or empty output to the
    # decoder, where the resulting error would be much harder to interpret.
    if ffmpeg_proc.returncode != 0:
        raise RuntimeError(
            f'ffmpeg exited with status {ffmpeg_proc.returncode} '
            'while converting TTS audio to wav'
        )
    return wav_bytes


async def _synthesize(tts_com):
    """Run one edge-tts request end to end and decode the result with librosa."""
    tts_raw = await _collect_tts_audio(tts_com)
    tts_wav = await _mp3_to_wav(tts_raw)
    return librosa.load(BytesIO(tts_wav))


async def call_edge_tts(speaker_name: str, text: str):
    """Synthesize ``text`` with edge-tts using the voice ``speaker_name``.

    Args:
        speaker_name: Voice identifier passed to ``edge_tts.Communicate``.
        text: Text to synthesize.

    Returns:
        The result of ``librosa.load`` on the WAV-converted audio.

    Raises:
        RuntimeError: If the ffmpeg conversion exits with a non-zero status.
    """
    return await _synthesize(edge_tts.Communicate(text, speaker_name))


async def call_edge_tts_config(speaker_name: str, text: str, rate: str, volume: str):
    """Synthesize ``text`` with edge-tts using explicit rate and volume.

    Args:
        speaker_name: Voice identifier passed to ``edge_tts.Communicate``.
        text: Text to synthesize.
        rate: Speaking-rate setting forwarded unchanged to edge-tts.
        volume: Volume setting forwarded unchanged to edge-tts.

    Returns:
        The result of ``librosa.load`` on the WAV-converted audio.

    Raises:
        RuntimeError: If the ffmpeg conversion exits with a non-zero status.
    """
    tts_com = edge_tts.Communicate(text=text, voice=speaker_name, rate=rate, volume=volume)
    return await _synthesize(tts_com)
|