Duplicate from konverner/deep-voice-cloning
Co-authored-by: Konstantin Verner <[email protected]>
- .gitignore +169 -0
- Dockerfile +4 -0
- LICENSE +21 -0
- README.md +10 -0
- app.py +29 -0
- build/lib/deep_voice_cloning/__init__.py +0 -0
- build/lib/deep_voice_cloning/cloning/__init__.py +0 -0
- build/lib/deep_voice_cloning/cloning/config.json +7 -0
- build/lib/deep_voice_cloning/cloning/model.py +57 -0
- build/lib/deep_voice_cloning/data/__init__.py +0 -0
- build/lib/deep_voice_cloning/data/collator.py +45 -0
- build/lib/deep_voice_cloning/data/dataset.py +63 -0
- build/lib/deep_voice_cloning/transcriber/__init__.py +0 -0
- build/lib/deep_voice_cloning/transcriber/config.json +7 -0
- build/lib/deep_voice_cloning/transcriber/model.py +22 -0
- models/.gitkeep +0 -0
- notebooks/.gitkeep +0 -0
- notebooks/CLI_Example.ipynb +0 -0
- pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/classifier.ckpt +1 -0
- pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/embedding_model.ckpt +1 -0
- pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/hyperparams.yaml +1 -0
- pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/label_encoder.ckpt +1 -0
- pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/mean_var_norm_emb.ckpt +1 -0
- requirements.txt +64 -0
- scripts/cloning_inference.py +30 -0
- scripts/inference_config.json +7 -0
- scripts/input/hank.mp3 +0 -0
- scripts/input/homer.mp3 +0 -0
- scripts/output/.gitkeep +0 -0
- scripts/train.py +71 -0
- scripts/training_config.json +9 -0
- setup.py +106 -0
- src/deep_voice_cloning/__init__.py +0 -0
- src/deep_voice_cloning/cloning/__init__.py +0 -0
- src/deep_voice_cloning/cloning/config.json +7 -0
- src/deep_voice_cloning/cloning/model.py +57 -0
- src/deep_voice_cloning/data/__init__.py +0 -0
- src/deep_voice_cloning/data/collator.py +45 -0
- src/deep_voice_cloning/data/dataset.py +63 -0
- src/deep_voice_cloning/transcriber/__init__.py +0 -0
- src/deep_voice_cloning/transcriber/config.json +7 -0
- src/deep_voice_cloning/transcriber/model.py +22 -0
.gitignore
ADDED
@@ -0,0 +1,169 @@
+# Initially taken from Github's Python gitignore file
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# tests and logs
+tests/fixtures/cached_*_text.txt
+logs/
+lightning_logs/
+lang_code_data/
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# vscode
+.vs
+.vscode
+
+# Pycharm
+.idea
+
+# TF code
+tensorflow_code
+
+# Models
+proc_data
+
+# examples
+runs
+/runs_old
+/wandb
+/examples/runs
+/examples/**/*.args
+/examples/rag/sweep
+
+# data
+/data
+serialization_dir
+
+# emacs
+*.*~
+debug.env
+
+# vim
+.*.swp
+
+#ctags
+tags
+
+# pre-commit
+.pre-commit*
+
+# .lock
+*.lock
+
+# DS_Store (MacOS)
+.DS_Store
+
+# ruff
+.ruff_cache
Dockerfile
ADDED
@@ -0,0 +1,4 @@
+FROM python:3.9
+MAINTAINER Konstantin Verner <[email protected]>
+COPY . .
+RUN pip install .
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Konstantin Verner
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
ADDED
@@ -0,0 +1,10 @@
+---
+license: openrail
+title: Deep Voice Cloning
+sdk: gradio
+emoji: 🌖
+colorFrom: yellow
+colorTo: purple
+pinned: true
+duplicated_from: konverner/deep-voice-cloning
+---
app.py
ADDED
@@ -0,0 +1,29 @@
+import os
+from pathlib import Path
+
+import gradio as gr
+
+
+os.system('pip install .')
+
+def greet(text, audio_file_path, progress=gr.Progress()):
+    text = "%s" % text
+    audio_file_path = "%s" % audio_file_path
+    out_path = Path("scripts/output/audio.wav")
+    progress(0.2, desc="Training voice embedding... (aprx 20 mins)")
+    os.system(f'python scripts/train.py --audio_path {audio_file_path}\
+               --output_dir "models"')
+    progress(0.9, desc="Generating voice...")
+    os.system(f'python scripts/cloning_inference.py --model_path "models/microsoft_speecht5_tts_{Path(audio_file_path).stem}"\
+               --input_text "{text}" --output_path "{str(out_path)}"')
+    return out_path
+
+
+demo = gr.Interface(
+    fn=greet,
+    inputs=[gr.Textbox(label='What would you like the voice to say? (max. 2000 characters per request)'),
+            gr.Audio(type="filepath", source="upload", label='Upload a voice to clone (max. 50mb)')],
+    outputs="audio",
+    title="Deep Voice Cloning Tool"
+)
+demo.launch()
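app.py drives the whole pipeline by shelling out to scripts/train.py and scripts/cloning_inference.py (both appear later in this diff). Below is a minimal sketch of the same two-step flow using subprocess with explicit argument lists; the helper function is hypothetical and not part of this commit, and the model-directory name simply mirrors the naming used by app.py and train.py.

    # Sketch only: same train-then-synthesize flow as app.py, via subprocess instead of os.system.
    import subprocess
    from pathlib import Path

    def clone_and_speak(text: str, audio_file_path: str,
                        output_wav: str = "scripts/output/audio.wav") -> Path:
        # Fine-tune SpeechT5 on the uploaded voice sample.
        subprocess.run(["python", "scripts/train.py",
                        "--audio_path", audio_file_path,
                        "--output_dir", "models"], check=True)
        # train.py names the model dir "<base model with / replaced by _>_<audio stem>".
        model_dir = f"models/microsoft_speecht5_tts_{Path(audio_file_path).stem}"
        # Synthesize the requested text with the fine-tuned voice.
        subprocess.run(["python", "scripts/cloning_inference.py",
                        "--model_path", model_dir,
                        "--input_text", text,
                        "--output_path", output_wav], check=True)
        return Path(output_wav)

Passing arguments as a list sidesteps the shell-quoting issues that os.system has with user-supplied text.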
build/lib/deep_voice_cloning/__init__.py
ADDED
File without changes
build/lib/deep_voice_cloning/cloning/__init__.py
ADDED
File without changes
build/lib/deep_voice_cloning/cloning/config.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "en": {
+    "model_path": "microsoft/speecht5_tts",
+    "vocoder_name": "microsoft/speecht5_hifigan",
+    "speaker_model_name": "speechbrain/spkrec-xvect-voxceleb"
+  }
+}
build/lib/deep_voice_cloning/cloning/model.py
ADDED
@@ -0,0 +1,57 @@
+import os
+import json
+from typing import Dict
+from pathlib import Path
+
+import numpy as np
+import torch
+from speechbrain.pretrained import EncoderClassifier
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+
+
+class CloningModel:
+    def __init__(self, config: Dict[str, Dict[str, str]] = None, lang: str = 'en'):
+        super(CloningModel, self).__init__()
+        if config is None:
+            self.speaker_embedding = None
+            with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
+                self.config = json.load(f)[lang]
+        else:
+            self.config = config
+            self.speaker_embedding = torch.load(Path(self.config['model_path']) / "speaker_embedding.pt")[0]
+        self.processor = SpeechT5Processor.from_pretrained(self.config['model_path'])
+        self.model = SpeechT5ForTextToSpeech.from_pretrained(self.config['model_path'])
+        self.vocoder = SpeechT5HifiGan.from_pretrained(self.config['vocoder_name'])
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.speaker_model = EncoderClassifier.from_hparams(source=self.config['speaker_model_name'])
+        self.to(self.device)
+
+
+
+    def to(self, device: torch.device):
+        self.model = self.model.to(device)
+        self.vocoder = self.vocoder.to(device)
+
+    def save_pretrained(self, save_directory: str):
+        self.model.save_pretrained(save_directory)
+        self.processor.save_pretrained(save_directory)
+        torch.save(self.speaker_embedding, Path(save_directory) / "speaker_embedding.pt")
+
+    def forward(self, text: str) -> np.array:
+        # tokenize text
+        inputs = self.processor(text=text, return_tensors="pt")
+        # generate spectrogram using backbone model
+        spectrogram = self.model.generate_speech(inputs["input_ids"].to(self.device),
+                                                 self.speaker_embedding.to(self.device))
+        # decode spectrogram into waveform using vocoder
+        with torch.no_grad():
+            waveform_array = self.vocoder(spectrogram).detach().cpu().numpy()
+        return waveform_array
+
+    def create_speaker_embedding(self, waveform: torch.tensor) -> torch.tensor:
+        with torch.no_grad():
+            speaker_embeddings = self.speaker_model.encode_batch(waveform)
+            speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
+            self.speaker_embedding = speaker_embeddings
+            speaker_embeddings = speaker_embeddings.squeeze()
+        return speaker_embeddings
build/lib/deep_voice_cloning/data/__init__.py
ADDED
File without changes
build/lib/deep_voice_cloning/data/collator.py
ADDED
@@ -0,0 +1,45 @@
+import torch
+from typing import Any, Dict, List, Union
+
+
+class TTSDataCollatorWithPadding:
+
+    def __init__(self, model, processor):
+        self.model = model
+        self.processor = processor
+
+    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+        input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
+        label_features = [{"input_values": feature["labels"]} for feature in features]
+        speaker_features = [feature["speaker_embeddings"] for feature in features]
+
+        # collate the inputs and targets into a batch
+        batch = self.processor.pad(
+            input_ids=input_ids,
+            labels=label_features,
+            return_tensors="pt",
+        )
+
+        # replace padding with -100 to ignore loss correctly
+        batch["labels"] = batch["labels"].masked_fill(
+            batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100
+        )
+
+        # not used during fine-tuning
+        del batch["decoder_attention_mask"]
+
+        # round down target lengths to multiple of reduction factor
+        if self.model.config.reduction_factor > 1:
+            target_lengths = torch.tensor([
+                len(feature["input_values"]) for feature in label_features
+            ])
+            target_lengths = target_lengths.new([
+                length - length % self.model.config.reduction_factor for length in target_lengths
+            ])
+            max_length = max(target_lengths)
+            batch["labels"] = batch["labels"][:, :max_length]
+
+        # add the speaker embeddings
+        batch["speaker_embeddings"] = torch.tensor(speaker_features)
+
+        return batch
build/lib/deep_voice_cloning/data/dataset.py
ADDED
@@ -0,0 +1,63 @@
+from typing import Dict, Any
+
+import torch
+import librosa
+import numpy as np
+from datasets import Dataset
+
+from ..cloning.model import CloningModel
+from ..transcriber.model import TranscriberModel
+
+
+def prepare_dataset(example: Dict[str, Any], model: CloningModel) -> Dict[str, Any]:
+    """
+    Prepare a single example for training
+    """
+    # feature extraction and tokenization
+    processed_example = model.processor(
+        text=example["normalized_text"],
+        audio_target=example["audio"]["array"],
+        sampling_rate=16000,
+        return_attention_mask=False,
+    )
+
+    # strip off the batch dimension
+    if len(torch.tensor(processed_example['input_ids']).shape) > 1:
+        processed_example['input_ids'] = processed_example['input_ids'][0]
+
+    processed_example["labels"] = processed_example["labels"][0]
+
+    # use SpeechBrain to obtain x-vector
+    processed_example["speaker_embeddings"] = model.create_speaker_embedding(
+        torch.tensor(example["audio"]["array"])
+    ).numpy()
+
+    return processed_example
+
+
+def get_cloning_dataset(input_audio_path: str,
+                        transcriber_model: TranscriberModel,
+                        cloning_model: CloningModel,
+                        sampling_rate: int = 16000,
+                        window_size_secs: int = 5) -> Dataset:
+    """
+    Create dataset by transcribing an audio file using a pretrained Wav2Vec2 model.
+    """
+    speech_array, _ = librosa.load(input_audio_path, sr=sampling_rate)
+
+    # split a waveform into splits of 5 secs each
+    speech_arrays = np.split(speech_array, range(0, len(speech_array), window_size_secs * sampling_rate))[1:]
+    texts = [transcriber_model.forward(speech_array, sampling_rate=sampling_rate)
+             for speech_array in speech_arrays]
+
+    dataset = Dataset.from_list([
+        {'audio': {'array': speech_arrays[i]}, 'normalized_text': texts[i]}
+        for i in range(len(speech_arrays))]
+    )
+
+    dataset = dataset.map(
+        prepare_dataset, fn_kwargs={'model': cloning_model},
+        remove_columns=dataset.column_names,
+    )
+
+    return dataset
build/lib/deep_voice_cloning/transcriber/__init__.py
ADDED
File without changes
build/lib/deep_voice_cloning/transcriber/config.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "language_model_names": {
+    "en": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
+    "fr": "jonatasgrosman/wav2vec2-large-xlsr-53-french",
+    "de": "jonatasgrosman/wav2vec2-large-xlsr-53-german"
+  }
+}
build/lib/deep_voice_cloning/transcriber/model.py
ADDED
@@ -0,0 +1,22 @@
+import os
+import json
+
+import numpy as np
+import torch
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+
+
+class TranscriberModel:
+    def __init__(self, lang: str = 'en'):
+        with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
+            config = json.load(f)
+        self.processor = Wav2Vec2Processor.from_pretrained(config['language_model_names'][lang])
+        self.model = Wav2Vec2ForCTC.from_pretrained(config['language_model_names'][lang])
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    def forward(self, speech_array: np.array, sampling_rate: int = 16000) -> str:
+        model_input = self.processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
+        with torch.no_grad():
+            logits = self.model(model_input.input_values, attention_mask=model_input.attention_mask).logits
+        predicted_ids = torch.argmax(logits, dim=-1)
+        return self.processor.batch_decode(predicted_ids)
models/.gitkeep
ADDED
File without changes
notebooks/.gitkeep
ADDED
File without changes
notebooks/CLI_Example.ipynb
ADDED
The diff for this file is too large to render.
pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/classifier.ckpt
ADDED
@@ -0,0 +1 @@
+C:/Users/konst/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/e2cc27f853f99bd5d539432f0cba3f124c059f71/classifier.ckpt
pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/embedding_model.ckpt
ADDED
@@ -0,0 +1 @@
+C:/Users/konst/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/e2cc27f853f99bd5d539432f0cba3f124c059f71/embedding_model.ckpt
pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/hyperparams.yaml
ADDED
@@ -0,0 +1 @@
+C:/Users/konst/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/e2cc27f853f99bd5d539432f0cba3f124c059f71/hyperparams.yaml
pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/label_encoder.ckpt
ADDED
@@ -0,0 +1 @@
+C:/Users/konst/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/e2cc27f853f99bd5d539432f0cba3f124c059f71/label_encoder.txt
pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/mean_var_norm_emb.ckpt
ADDED
@@ -0,0 +1 @@
+C:/Users/konst/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/e2cc27f853f99bd5d539432f0cba3f124c059f71/mean_var_norm_emb.ckpt
requirements.txt
ADDED
@@ -0,0 +1,64 @@
+accelerate==0.21.0
+aiohttp==3.8.4
+aiosignal==1.3.1
+appdirs==1.4.4
+async-timeout==4.0.2
+attrs==23.1.0
+audioread==3.0.0
+certifi==2023.5.7
+cffi==1.15.1
+charset-normalizer==3.2.0
+colorama==0.4.6
+datasets==2.13.1
+decorator>=4.0.2
+dill==0.3.6
+filelock==3.12.2
+frozenlist==1.4.0
+fsspec==2023.6.0
+huggingface-hub==0.16.4
+HyperPyYAML==1.2.1
+idna==3.4
+Jinja2==3.1.2
+joblib==1.3.1
+lazy_loader==0.3
+librosa==0.10.0.post2
+llvmlite==0.40.1
+MarkupSafe==2.1.3
+mpmath==1.3.0
+msgpack==1.0.5
+multidict==6.0.4
+multiprocess==0.70.14
+networkx==3.1
+numba==0.57.1
+numpy>=1.22
+packaging==23.1
+pandas>=1.5.3
+pooch==1.6.0
+psutil==5.9.5
+pyarrow>=3.0.0
+pycparser==2.21
+python-dateutil==2.8.2
+pytz==2023.3
+PyYAML==6.0
+ruamel.yaml==0.17.28
+ruamel.yaml.clib==0.2.7
+safetensors==0.3.1
+scikit-learn==1.3.0
+scipy==1.11.1
+sentencepiece==0.1.99
+six==1.16.0
+soundfile==0.12.1
+soxr==0.3.5
+speechbrain==0.5.14
+sympy==1.12
+threadpoolctl==3.2.0
+tokenizers==0.13.3
+torch==2.0.1
+torchaudio==2.0.2
+tqdm==4.65.0
+transformers==4.30.2
+typing_extensions==4.7.1
+tzdata==2023.3
+urllib3==2.0.3
+xxhash==3.2.0
+yarl==1.9.2
scripts/cloning_inference.py
ADDED
@@ -0,0 +1,30 @@
+import argparse
+import json
+import os
+
+import soundfile as sf
+
+from deep_voice_cloning.cloning.model import CloningModel
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_path", type=str, default=None, help="Path to model directory")
+    parser.add_argument("--input_text", type=str, default=None, help="Text to be synthesized")
+    parser.add_argument("--output_path", type=str, default=None, help="Path to output audio file")
+    args = parser.parse_args()
+
+    with open(os.path.join(os.path.dirname(__file__), "inference_config.json")) as f:
+        config = json.load(f)
+
+    if args.model_path is not None:
+        config['model_path'] = args.model_path
+    if args.input_text is not None:
+        config['input_text'] = args.input_text
+    if args.output_path is not None:
+        config['output_path'] = args.output_path
+
+    cloning_model = CloningModel(config)
+    waveform_array = cloning_model.forward(config["input_text"])
+
+    sf.write(config['output_path'], waveform_array, samplerate=16000)
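The script merges any CLI flags over scripts/inference_config.json and then drives CloningModel directly. A minimal sketch of doing the same from Python, assuming a fine-tuned model directory produced by scripts/train.py (the model path below is a placeholder):

    # Sketch only: programmatic equivalent of scripts/cloning_inference.py.
    import soundfile as sf
    from deep_voice_cloning.cloning.model import CloningModel

    config = {
        "model_path": "models/microsoft_speecht5_tts_hank",           # placeholder fine-tuned model dir
        "speaker_model_name": "speechbrain/spkrec-xvect-voxceleb",
        "vocoder_name": "microsoft/speecht5_hifigan",
    }
    model = CloningModel(config)            # loads processor, SpeechT5, vocoder and the saved speaker embedding
    waveform = model.forward("Hello from a cloned voice.")
    sf.write("scripts/output/hello.wav", waveform, samplerate=16000)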
scripts/inference_config.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "model_path": "/content/deep-voice-cloning/models/microsoft_speecht5_tts_hank_hill",
+  "speaker_model_name": "speechbrain/spkrec-xvect-voxceleb",
+  "vocoder_name": "microsoft/speecht5_hifigan",
+  "input_text": "do the things, not because they are easy, but because they are hard",
+  "output_path": "/content/deep-voice-cloning/scripts/output/do_the_things.wav"
+}
scripts/input/hank.mp3
ADDED
Binary file (526 kB).
scripts/input/homer.mp3
ADDED
Binary file (913 kB).
scripts/output/.gitkeep
ADDED
File without changes
scripts/train.py
ADDED
@@ -0,0 +1,71 @@
+import argparse
+import json
+import os
+from pathlib import Path
+
+import torch
+from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
+
+from deep_voice_cloning.cloning.model import CloningModel
+from deep_voice_cloning.transcriber.model import TranscriberModel
+from deep_voice_cloning.data.collator import TTSDataCollatorWithPadding
+from deep_voice_cloning.data.dataset import get_cloning_dataset
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--lang", type=str, default=None, help="Language of speech samples")
+    parser.add_argument("--audio_path", type=str, default=None, help="Path to training audio file")
+    parser.add_argument("--output_dir", type=str, default=None, help="Path to output directory for trained model")
+    args = parser.parse_args()
+
+    with open(os.path.join(os.path.dirname(__file__), "training_config.json")) as f:
+        training_config = json.load(f)
+
+    if args.lang is not None:
+        training_config['lang'] = args.lang
+    if args.audio_path is not None:
+        training_config['audio_path'] = Path(args.audio_path)
+    if args.output_dir is not None:
+        training_config['output_dir'] = Path(args.output_dir)
+
+    transcriber_model = TranscriberModel(lang=training_config['lang'])
+    cloning_model = CloningModel(lang=training_config['lang'])
+
+    dataset = get_cloning_dataset(training_config['audio_path'], transcriber_model, cloning_model)
+    data_collator = TTSDataCollatorWithPadding(processor=cloning_model.processor, model=cloning_model.model)
+
+    training_args = Seq2SeqTrainingArguments(
+        output_dir=training_config["output_dir"],
+        per_device_train_batch_size=training_config['batch_size'],
+        gradient_accumulation_steps=2,
+        overwrite_output_dir=True,
+        learning_rate=training_config['learning_rate'],
+        warmup_steps=training_config['warmup_steps'],
+        max_steps=training_config['max_steps'],
+        gradient_checkpointing=True,
+        fp16=transcriber_model.device == torch.device("cuda"),
+        evaluation_strategy="steps",
+        per_device_eval_batch_size=8,
+        save_strategy="no",
+        eval_steps=100,
+        logging_steps=20,
+        load_best_model_at_end=False,
+        greater_is_better=False,
+        label_names=["labels"],
+    )
+
+    trainer = Seq2SeqTrainer(
+        args=training_args,
+        model=cloning_model.model,
+        train_dataset=dataset,
+        eval_dataset=dataset,
+        data_collator=data_collator,
+        tokenizer=cloning_model.processor.tokenizer,
+    )
+
+    trainer.train()
+    cloning_model.save_pretrained(Path(training_config["output_dir"]) /
+                                  Path(cloning_model.config['model_path'].replace('/', '_')
+                                       + '_' + Path(training_config['audio_path']).stem)
+                                  )
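After training, the script saves the fine-tuned model under a directory named from the base checkpoint and the stem of the training audio file. A small sketch of that path logic, using the defaults from scripts/training_config.json:

    # Sketch only: reproduces the output-directory naming at the end of scripts/train.py.
    from pathlib import Path

    output_dir = Path("/content/deep-voice-cloning/models")
    base_model = "microsoft/speecht5_tts"
    audio_path = Path("/content/deep-voice-cloning/scripts/input/hank_hill.mp3")

    model_dir = output_dir / (base_model.replace("/", "_") + "_" + audio_path.stem)
    print(model_dir)  # /content/deep-voice-cloning/models/microsoft_speecht5_tts_hank_hill

That resulting directory is exactly what scripts/inference_config.json points at as model_path.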
scripts/training_config.json
ADDED
@@ -0,0 +1,9 @@
+{
+  "audio_path": "/content/deep-voice-cloning/scripts/input/hank_hill.mp3",
+  "output_dir": "/content/deep-voice-cloning/models",
+  "lang": "en",
+  "batch_size": 2,
+  "learning_rate": 1e-4,
+  "max_steps": 300,
+  "warmup_steps": 30
+}
setup.py
ADDED
@@ -0,0 +1,106 @@
+from pathlib import Path
+
+from setuptools import find_packages, setup
+
+README_TEXT = (Path(__file__).parent / "README.md").read_text(encoding="utf-8")
+
+MAINTAINER = "Konstantin Verner"
+MAINTAINER_EMAIL = "[email protected]"
+REQUIRED_PKGS = ["accelerate==0.21.0",
+                 "aiohttp==3.8.4",
+                 "aiosignal==1.3.1",
+                 "appdirs==1.4.4",
+                 "async-timeout==4.0.2",
+                 "attrs==23.1.0",
+                 "audioread==3.0.0",
+                 "certifi==2023.5.7",
+                 "cffi==1.15.1",
+                 "charset-normalizer==3.2.0",
+                 "colorama==0.4.6",
+                 "datasets==2.13.1",
+                 "decorator>=4.0.2",
+                 "dill==0.3.6",
+                 "filelock==3.12.2",
+                 "frozenlist==1.4.0",
+                 "fsspec==2023.6.0",
+                 "huggingface-hub==0.16.4",
+                 "HyperPyYAML==1.2.1",
+                 "idna==3.4",
+                 "Jinja2==3.1.2",
+                 "joblib==1.3.1",
+                 "lazy_loader==0.3",
+                 "librosa==0.10.0.post2",
+                 "llvmlite==0.40.1",
+                 "MarkupSafe==2.1.3",
+                 "mpmath==1.3.0",
+                 "msgpack==1.0.5",
+                 "multidict==6.0.4",
+                 "multiprocess==0.70.14",
+                 "networkx==3.1",
+                 "numba==0.57.1",
+                 "numpy>=1.22",
+                 "packaging==23.1",
+                 "pandas>=1.5.3",
+                 "pooch==1.6.0",
+                 "psutil==5.9.5",
+                 "pyarrow>=3.0.0",
+                 "pycparser==2.21",
+                 "python-dateutil==2.8.2",
+                 "pytz==2023.3",
+                 "PyYAML==6.0",
+                 "ruamel.yaml==0.17.28",
+                 "ruamel.yaml.clib==0.2.7",
+                 "safetensors==0.3.1",
+                 "scikit-learn==1.3.0",
+                 "scipy==1.11.1",
+                 "sentencepiece==0.1.99",
+                 "six==1.16.0",
+                 "soundfile==0.12.1",
+                 "soxr==0.3.5",
+                 "speechbrain==0.5.14",
+                 "sympy==1.12",
+                 "threadpoolctl==3.2.0",
+                 "tokenizers==0.13.3",
+                 "torch==2.0.1",
+                 "torchaudio==2.0.2",
+                 "tqdm==4.65.0",
+                 "transformers==4.30.2",
+                 "typing_extensions==4.7.1",
+                 "tzdata==2023.3",
+                 "urllib3==2.0.3",
+                 "xxhash==3.2.0",
+                 "yarl==1.9.2"]
+
+print(find_packages("src"))
+
+setup(
+    name="deep_voice_cloning",
+    version="0.1.0",
+    description="Few-Shot Voice Cloning",
+    long_description=README_TEXT,
+    long_description_content_type="text/markdown",
+    maintainer=MAINTAINER,
+    maintainer_email=MAINTAINER_EMAIL,
+    url="",
+    download_url="",
+    license="MIT",
+    package_dir={"": "src"},
+    packages=find_packages("src"),
+    include_package_data=True,
+    package_data={"": ["*.json"]},
+    install_requires=REQUIRED_PKGS,
+    classifiers=[
+        "Development Status :: 1 - Planning",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Education",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: MIT",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    ],
+    keywords="asr, machine learning, fewshot learning, transformers",
+    zip_safe=False,  # Required for mypy to find the py.typed file
+)
src/deep_voice_cloning/__init__.py
ADDED
File without changes
src/deep_voice_cloning/cloning/__init__.py
ADDED
File without changes
src/deep_voice_cloning/cloning/config.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "en": {
+    "model_path": "microsoft/speecht5_tts",
+    "vocoder_name": "microsoft/speecht5_hifigan",
+    "speaker_model_name": "speechbrain/spkrec-xvect-voxceleb"
+  }
+}
src/deep_voice_cloning/cloning/model.py
ADDED
@@ -0,0 +1,57 @@
+import os
+import json
+from typing import Dict
+from pathlib import Path
+
+import numpy as np
+import torch
+from speechbrain.pretrained import EncoderClassifier
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+
+
+class CloningModel:
+    def __init__(self, config: Dict[str, Dict[str, str]] = None, lang: str = 'en'):
+        super(CloningModel, self).__init__()
+        if config is None:
+            self.speaker_embedding = None
+            with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
+                self.config = json.load(f)[lang]
+        else:
+            self.config = config
+            self.speaker_embedding = torch.load(Path(self.config['model_path']) / "speaker_embedding.pt")[0]
+        self.processor = SpeechT5Processor.from_pretrained(self.config['model_path'])
+        self.model = SpeechT5ForTextToSpeech.from_pretrained(self.config['model_path'])
+        self.vocoder = SpeechT5HifiGan.from_pretrained(self.config['vocoder_name'])
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.speaker_model = EncoderClassifier.from_hparams(source=self.config['speaker_model_name'])
+        self.to(self.device)
+
+
+
+    def to(self, device: torch.device):
+        self.model = self.model.to(device)
+        self.vocoder = self.vocoder.to(device)
+
+    def save_pretrained(self, save_directory: str):
+        self.model.save_pretrained(save_directory)
+        self.processor.save_pretrained(save_directory)
+        torch.save(self.speaker_embedding, Path(save_directory) / "speaker_embedding.pt")
+
+    def forward(self, text: str) -> np.array:
+        # tokenize text
+        inputs = self.processor(text=text, return_tensors="pt")
+        # generate spectrogram using backbone model
+        spectrogram = self.model.generate_speech(inputs["input_ids"].to(self.device),
+                                                 self.speaker_embedding.to(self.device))
+        # decode spectrogram into waveform using vocoder
+        with torch.no_grad():
+            waveform_array = self.vocoder(spectrogram).detach().cpu().numpy()
+        return waveform_array
+
+    def create_speaker_embedding(self, waveform: torch.tensor) -> torch.tensor:
+        with torch.no_grad():
+            speaker_embeddings = self.speaker_model.encode_batch(waveform)
+            speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
+            self.speaker_embedding = speaker_embeddings
+            speaker_embeddings = speaker_embeddings.squeeze()
+        return speaker_embeddings
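A minimal usage sketch for CloningModel without any fine-tuning: embed a reference voice with create_speaker_embedding, then synthesize. This assumes a 16 kHz file named reference.wav (a placeholder), and it re-indexes the stored embedding because create_speaker_embedding keeps the un-squeezed batch output while the fine-tuned loading path loads the [0]-indexed tensor:

    # Sketch only: zero-shot conditioning with CloningModel, no training step.
    import librosa
    import soundfile as sf
    import torch
    from deep_voice_cloning.cloning.model import CloningModel

    model = CloningModel(lang="en")                          # base microsoft/speecht5_tts + HiFi-GAN vocoder
    waveform, _ = librosa.load("reference.wav", sr=16000)    # placeholder reference clip
    model.create_speaker_embedding(torch.tensor(waveform))   # x-vector stored on the model (batched shape)
    model.speaker_embedding = model.speaker_embedding[0]     # same indexing the fine-tuned path applies on load
    audio = model.forward("This voice is conditioned on the reference sample.")
    sf.write("zero_shot.wav", audio, samplerate=16000)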
src/deep_voice_cloning/data/__init__.py
ADDED
File without changes
src/deep_voice_cloning/data/collator.py
ADDED
@@ -0,0 +1,45 @@
+import torch
+from typing import Any, Dict, List, Union
+
+
+class TTSDataCollatorWithPadding:
+
+    def __init__(self, model, processor):
+        self.model = model
+        self.processor = processor
+
+    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+        input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
+        label_features = [{"input_values": feature["labels"]} for feature in features]
+        speaker_features = [feature["speaker_embeddings"] for feature in features]
+
+        # collate the inputs and targets into a batch
+        batch = self.processor.pad(
+            input_ids=input_ids,
+            labels=label_features,
+            return_tensors="pt",
+        )
+
+        # replace padding with -100 to ignore loss correctly
+        batch["labels"] = batch["labels"].masked_fill(
+            batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100
+        )
+
+        # not used during fine-tuning
+        del batch["decoder_attention_mask"]
+
+        # round down target lengths to multiple of reduction factor
+        if self.model.config.reduction_factor > 1:
+            target_lengths = torch.tensor([
+                len(feature["input_values"]) for feature in label_features
+            ])
+            target_lengths = target_lengths.new([
+                length - length % self.model.config.reduction_factor for length in target_lengths
+            ])
+            max_length = max(target_lengths)
+            batch["labels"] = batch["labels"][:, :max_length]
+
+        # add the speaker embeddings
+        batch["speaker_embeddings"] = torch.tensor(speaker_features)
+
+        return batch
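The key step in the collator is masking padded spectrogram frames with -100 so the loss ignores them. A tiny self-contained illustration of that masked_fill pattern on toy tensors:

    # Sketch only: the label-masking trick the collator relies on, shown on toy data.
    import torch

    labels = torch.rand(2, 4, 3)                        # [batch, frames, n_mels] toy spectrograms
    decoder_attention_mask = torch.tensor([[1, 1, 1, 0],
                                           [1, 1, 0, 0]])
    masked = labels.masked_fill(decoder_attention_mask.unsqueeze(-1).ne(1), -100)
    print(masked[0, 3], masked[1, 2])                   # padded frames are all -100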
src/deep_voice_cloning/data/dataset.py
ADDED
@@ -0,0 +1,63 @@
+from typing import Dict, Any
+
+import torch
+import librosa
+import numpy as np
+from datasets import Dataset
+
+from ..cloning.model import CloningModel
+from ..transcriber.model import TranscriberModel
+
+
+def prepare_dataset(example: Dict[str, Any], model: CloningModel) -> Dict[str, Any]:
+    """
+    Prepare a single example for training
+    """
+    # feature extraction and tokenization
+    processed_example = model.processor(
+        text=example["normalized_text"],
+        audio_target=example["audio"]["array"],
+        sampling_rate=16000,
+        return_attention_mask=False,
+    )
+
+    # strip off the batch dimension
+    if len(torch.tensor(processed_example['input_ids']).shape) > 1:
+        processed_example['input_ids'] = processed_example['input_ids'][0]
+
+    processed_example["labels"] = processed_example["labels"][0]
+
+    # use SpeechBrain to obtain x-vector
+    processed_example["speaker_embeddings"] = model.create_speaker_embedding(
+        torch.tensor(example["audio"]["array"])
+    ).numpy()
+
+    return processed_example
+
+
+def get_cloning_dataset(input_audio_path: str,
+                        transcriber_model: TranscriberModel,
+                        cloning_model: CloningModel,
+                        sampling_rate: int = 16000,
+                        window_size_secs: int = 5) -> Dataset:
+    """
+    Create dataset by transcribing an audio file using a pretrained Wav2Vec2 model.
+    """
+    speech_array, _ = librosa.load(input_audio_path, sr=sampling_rate)
+
+    # split a waveform into splits of 5 secs each
+    speech_arrays = np.split(speech_array, range(0, len(speech_array), window_size_secs * sampling_rate))[1:]
+    texts = [transcriber_model.forward(speech_array, sampling_rate=sampling_rate)
+             for speech_array in speech_arrays]
+
+    dataset = Dataset.from_list([
+        {'audio': {'array': speech_arrays[i]}, 'normalized_text': texts[i]}
+        for i in range(len(speech_arrays))]
+    )
+
+    dataset = dataset.map(
+        prepare_dataset, fn_kwargs={'model': cloning_model},
+        remove_columns=dataset.column_names,
+    )
+
+    return dataset
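get_cloning_dataset cuts the waveform into fixed 5-second windows before transcription. A small self-contained example of that np.split windowing; the leading empty chunk produced by splitting at offset 0 is the reason for the [1:]:

    # Sketch only: the windowing used in get_cloning_dataset, on a synthetic waveform.
    import numpy as np

    sampling_rate, window_size_secs = 16000, 5
    speech_array = np.zeros(7 * sampling_rate)           # pretend 7 seconds of audio
    chunks = np.split(speech_array, range(0, len(speech_array), window_size_secs * sampling_rate))[1:]
    print([len(c) / sampling_rate for c in chunks])      # [5.0, 2.0] -> one full window plus the remainder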
src/deep_voice_cloning/transcriber/__init__.py
ADDED
File without changes
src/deep_voice_cloning/transcriber/config.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "language_model_names": {
+    "en": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
+    "fr": "jonatasgrosman/wav2vec2-large-xlsr-53-french",
+    "de": "jonatasgrosman/wav2vec2-large-xlsr-53-german"
+  }
+}
src/deep_voice_cloning/transcriber/model.py
ADDED
@@ -0,0 +1,22 @@
+import os
+import json
+
+import numpy as np
+import torch
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+
+
+class TranscriberModel:
+    def __init__(self, lang: str = 'en'):
+        with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
+            config = json.load(f)
+        self.processor = Wav2Vec2Processor.from_pretrained(config['language_model_names'][lang])
+        self.model = Wav2Vec2ForCTC.from_pretrained(config['language_model_names'][lang])
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    def forward(self, speech_array: np.array, sampling_rate: int = 16000) -> str:
+        model_input = self.processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
+        with torch.no_grad():
+            logits = self.model(model_input.input_values, attention_mask=model_input.attention_mask).logits
+        predicted_ids = torch.argmax(logits, dim=-1)
+        return self.processor.batch_decode(predicted_ids)
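A minimal usage sketch for TranscriberModel on one of the bundled clips; batch_decode returns a list, so the transcript is its first element:

    # Sketch only: transcribing a 16 kHz mono clip with TranscriberModel.
    import librosa
    from deep_voice_cloning.transcriber.model import TranscriberModel

    transcriber = TranscriberModel(lang="en")            # wav2vec2-large-xlsr-53-english per config.json
    speech, _ = librosa.load("scripts/input/hank.mp3", sr=16000)
    text = transcriber.forward(speech, sampling_rate=16000)
    print(text[0])                                       # first (and only) transcript in the returned list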