Duplicate from konverner/deep-voice-cloning
Co-authored-by: Konstantin Verner <[email protected]>
- .gitignore +169 -0
- Dockerfile +4 -0
- LICENSE +21 -0
- README.md +10 -0
- app.py +29 -0
- build/lib/deep_voice_cloning/__init__.py +0 -0
- build/lib/deep_voice_cloning/cloning/__init__.py +0 -0
- build/lib/deep_voice_cloning/cloning/config.json +7 -0
- build/lib/deep_voice_cloning/cloning/model.py +57 -0
- build/lib/deep_voice_cloning/data/__init__.py +0 -0
- build/lib/deep_voice_cloning/data/collator.py +45 -0
- build/lib/deep_voice_cloning/data/dataset.py +63 -0
- build/lib/deep_voice_cloning/transcriber/__init__.py +0 -0
- build/lib/deep_voice_cloning/transcriber/config.json +7 -0
- build/lib/deep_voice_cloning/transcriber/model.py +22 -0
- models/.gitkeep +0 -0
- notebooks/.gitkeep +0 -0
- notebooks/CLI_Example.ipynb +0 -0
- pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/classifier.ckpt +1 -0
- pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/embedding_model.ckpt +1 -0
- pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/hyperparams.yaml +1 -0
- pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/label_encoder.ckpt +1 -0
- pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/mean_var_norm_emb.ckpt +1 -0
- requirements.txt +64 -0
- scripts/cloning_inference.py +30 -0
- scripts/inference_config.json +7 -0
- scripts/input/hank.mp3 +0 -0
- scripts/input/homer.mp3 +0 -0
- scripts/output/.gitkeep +0 -0
- scripts/train.py +71 -0
- scripts/training_config.json +9 -0
- setup.py +106 -0
- src/deep_voice_cloning/__init__.py +0 -0
- src/deep_voice_cloning/cloning/__init__.py +0 -0
- src/deep_voice_cloning/cloning/config.json +7 -0
- src/deep_voice_cloning/cloning/model.py +57 -0
- src/deep_voice_cloning/data/__init__.py +0 -0
- src/deep_voice_cloning/data/collator.py +45 -0
- src/deep_voice_cloning/data/dataset.py +63 -0
- src/deep_voice_cloning/transcriber/__init__.py +0 -0
- src/deep_voice_cloning/transcriber/config.json +7 -0
- src/deep_voice_cloning/transcriber/model.py +22 -0
.gitignore
ADDED
@@ -0,0 +1,169 @@
+# Initially taken from Github's Python gitignore file
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# tests and logs
+tests/fixtures/cached_*_text.txt
+logs/
+lightning_logs/
+lang_code_data/
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# vscode
+.vs
+.vscode
+
+# Pycharm
+.idea
+
+# TF code
+tensorflow_code
+
+# Models
+proc_data
+
+# examples
+runs
+/runs_old
+/wandb
+/examples/runs
+/examples/**/*.args
+/examples/rag/sweep
+
+# data
+/data
+serialization_dir
+
+# emacs
+*.*~
+debug.env
+
+# vim
+.*.swp
+
+#ctags
+tags
+
+# pre-commit
+.pre-commit*
+
+# .lock
+*.lock
+
+# DS_Store (MacOS)
+.DS_Store
+
+# ruff
+.ruff_cache
Dockerfile
ADDED
@@ -0,0 +1,4 @@
+FROM python:3.9
+MAINTAINER Konstantin Verner <[email protected]>
+COPY . .
+RUN pip install .
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Konstantin Verner
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
ADDED
@@ -0,0 +1,10 @@
+---
+license: openrail
+title: Deep Voice Cloning
+sdk: gradio
+emoji: 🌖
+colorFrom: yellow
+colorTo: purple
+pinned: true
+duplicated_from: konverner/deep-voice-cloning
+---
app.py
ADDED
@@ -0,0 +1,29 @@
+import os
+from pathlib import Path
+
+import gradio as gr
+
+
+os.system('pip install .')
+
+def greet(text, audio_file_path, progress=gr.Progress()):
+    text = "%s" % text
+    audio_file_path = "%s" % audio_file_path
+    out_path = Path("scripts/output/audio.wav")
+    progress(0.2, desc="Training voice embedding... (aprx 20 mins)")
+    os.system(f'python scripts/train.py --audio_path {audio_file_path}\
+               --output_dir "models"')
+    progress(0.9, desc="Generating voice...")
+    os.system(f'python scripts/cloning_inference.py --model_path "models/microsoft_speecht5_tts_{Path(audio_file_path).stem}"\
+               --input_text "{text}" --output_path "{str(out_path)}"')
+    return out_path
+
+
+demo = gr.Interface(
+    fn=greet,
+    inputs=[gr.Textbox(label='What would you like the voice to say? (max. 2000 characters per request)'),
+            gr.Audio(type="filepath", source="upload", label='Upload a voice to clone (max. 50mb)')],
+    outputs="audio",
+    title="Deep Voice Cloning Tool"
+)
+demo.launch()
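app.py drives the whole pipeline by shelling out to scripts/train.py and scripts/cloning_inference.py (both appear later in this diff). Below is a minimal sketch of the same two-step flow using subprocess with explicit argument lists; the helper function is hypothetical and not part of this commit, and the model-directory name simply mirrors the naming used by app.py and train.py.

    # Sketch only: same train-then-synthesize flow as app.py, via subprocess instead of os.system.
    import subprocess
    from pathlib import Path

    def clone_and_speak(text: str, audio_file_path: str,
                        output_wav: str = "scripts/output/audio.wav") -> Path:
        # Fine-tune SpeechT5 on the uploaded voice sample.
        subprocess.run(["python", "scripts/train.py",
                        "--audio_path", audio_file_path,
                        "--output_dir", "models"], check=True)
        # train.py names the model dir "<base model with / replaced by _>_<audio stem>".
        model_dir = f"models/microsoft_speecht5_tts_{Path(audio_file_path).stem}"
        # Synthesize the requested text with the fine-tuned voice.
        subprocess.run(["python", "scripts/cloning_inference.py",
                        "--model_path", model_dir,
                        "--input_text", text,
                        "--output_path", output_wav], check=True)
        return Path(output_wav)

Passing arguments as a list sidesteps the shell-quoting issues that os.system has with user-supplied text.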
build/lib/deep_voice_cloning/__init__.py
ADDED
File without changes
build/lib/deep_voice_cloning/cloning/__init__.py
ADDED
File without changes
build/lib/deep_voice_cloning/cloning/config.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "en": {
+    "model_path": "microsoft/speecht5_tts",
+    "vocoder_name": "microsoft/speecht5_hifigan",
+    "speaker_model_name": "speechbrain/spkrec-xvect-voxceleb"
+  }
+}
build/lib/deep_voice_cloning/cloning/model.py
ADDED
@@ -0,0 +1,57 @@
+import os
+import json
+from typing import Dict
+from pathlib import Path
+
+import numpy as np
+import torch
+from speechbrain.pretrained import EncoderClassifier
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+
+
+class CloningModel:
+    def __init__(self, config: Dict[str, Dict[str, str]] = None, lang: str = 'en'):
+        super(CloningModel, self).__init__()
+        if config is None:
+            self.speaker_embedding = None
+            with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
+                self.config = json.load(f)[lang]
+        else:
+            self.config = config
+            self.speaker_embedding = torch.load(Path(self.config['model_path']) / "speaker_embedding.pt")[0]
+        self.processor = SpeechT5Processor.from_pretrained(self.config['model_path'])
+        self.model = SpeechT5ForTextToSpeech.from_pretrained(self.config['model_path'])
+        self.vocoder = SpeechT5HifiGan.from_pretrained(self.config['vocoder_name'])
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.speaker_model = EncoderClassifier.from_hparams(source=self.config['speaker_model_name'])
+        self.to(self.device)
+
+
+
+    def to(self, device: torch.device):
+        self.model = self.model.to(device)
+        self.vocoder = self.vocoder.to(device)
+
+    def save_pretrained(self, save_directory: str):
+        self.model.save_pretrained(save_directory)
+        self.processor.save_pretrained(save_directory)
+        torch.save(self.speaker_embedding, Path(save_directory) / "speaker_embedding.pt")
+
+    def forward(self, text: str) -> np.array:
+        # tokenize text
+        inputs = self.processor(text=text, return_tensors="pt")
+        # generate spectrogram using backbone model
+        spectrogram = self.model.generate_speech(inputs["input_ids"].to(self.device),
+                                                 self.speaker_embedding.to(self.device))
+        # decode spectrogram into waveform using vocoder
+        with torch.no_grad():
+            waveform_array = self.vocoder(spectrogram).detach().cpu().numpy()
+        return waveform_array
+
+    def create_speaker_embedding(self, waveform: torch.tensor) -> torch.tensor:
+        with torch.no_grad():
+            speaker_embeddings = self.speaker_model.encode_batch(waveform)
+            speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
+            self.speaker_embedding = speaker_embeddings
+            speaker_embeddings = speaker_embeddings.squeeze()
+        return speaker_embeddings
build/lib/deep_voice_cloning/data/__init__.py
ADDED
File without changes
build/lib/deep_voice_cloning/data/collator.py
ADDED
@@ -0,0 +1,45 @@
+import torch
+from typing import Any, Dict, List, Union
+
+
+class TTSDataCollatorWithPadding:
+
+    def __init__(self, model, processor):
+        self.model = model
+        self.processor = processor
+
+    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+        input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
+        label_features = [{"input_values": feature["labels"]} for feature in features]
+        speaker_features = [feature["speaker_embeddings"] for feature in features]
+
+        # collate the inputs and targets into a batch
+        batch = self.processor.pad(
+            input_ids=input_ids,
+            labels=label_features,
+            return_tensors="pt",
+        )
+
+        # replace padding with -100 to ignore loss correctly
+        batch["labels"] = batch["labels"].masked_fill(
+            batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100
+        )
+
+        # not used during fine-tuning
+        del batch["decoder_attention_mask"]
+
+        # round down target lengths to multiple of reduction factor
+        if self.model.config.reduction_factor > 1:
+            target_lengths = torch.tensor([
+                len(feature["input_values"]) for feature in label_features
+            ])
+            target_lengths = target_lengths.new([
+                length - length % self.model.config.reduction_factor for length in target_lengths
+            ])
+            max_length = max(target_lengths)
+            batch["labels"] = batch["labels"][:, :max_length]
+
+        # add the speaker embeddings
+        batch["speaker_embeddings"] = torch.tensor(speaker_features)
+
+        return batch
build/lib/deep_voice_cloning/data/dataset.py
ADDED
@@ -0,0 +1,63 @@
+from typing import Dict, Any
+
+import torch
+import librosa
+import numpy as np
+from datasets import Dataset
+
+from ..cloning.model import CloningModel
+from ..transcriber.model import TranscriberModel
+
+
+def prepare_dataset(example: Dict[str, Any], model: CloningModel) -> Dict[str, Any]:
+    """
+    Prepare a single example for training
+    """
+    # feature extraction and tokenization
+    processed_example = model.processor(
+        text=example["normalized_text"],
+        audio_target=example["audio"]["array"],
+        sampling_rate=16000,
+        return_attention_mask=False,
+    )
+
+    # strip off the batch dimension
+    if len(torch.tensor(processed_example['input_ids']).shape) > 1:
+        processed_example['input_ids'] = processed_example['input_ids'][0]
+
+    processed_example["labels"] = processed_example["labels"][0]
+
+    # use SpeechBrain to obtain x-vector
+    processed_example["speaker_embeddings"] = model.create_speaker_embedding(
+        torch.tensor(example["audio"]["array"])
+    ).numpy()
+
+    return processed_example
+
+
+def get_cloning_dataset(input_audio_path: str,
+                        transcriber_model: TranscriberModel,
+                        cloning_model: CloningModel,
+                        sampling_rate: int = 16000,
+                        window_size_secs: int = 5) -> Dataset:
+    """
+    Create dataset by transcribing an audio file using a pretrained Wav2Vec2 model.
+    """
+    speech_array, _ = librosa.load(input_audio_path, sr=sampling_rate)
+
+    # split a waveform into splits of 5 secs each
+    speech_arrays = np.split(speech_array, range(0, len(speech_array), window_size_secs * sampling_rate))[1:]
+    texts = [transcriber_model.forward(speech_array, sampling_rate=sampling_rate)
+             for speech_array in speech_arrays]
+
+    dataset = Dataset.from_list([
+        {'audio': {'array': speech_arrays[i]}, 'normalized_text': texts[i]}
+        for i in range(len(speech_arrays))]
+    )
+
+    dataset = dataset.map(
+        prepare_dataset, fn_kwargs={'model': cloning_model},
+        remove_columns=dataset.column_names,
+    )
+
+    return dataset
build/lib/deep_voice_cloning/transcriber/__init__.py
ADDED
File without changes
build/lib/deep_voice_cloning/transcriber/config.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "language_model_names": {
+    "en": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
+    "fr": "jonatasgrosman/wav2vec2-large-xlsr-53-french",
+    "de": "jonatasgrosman/wav2vec2-large-xlsr-53-german"
+  }
+}
build/lib/deep_voice_cloning/transcriber/model.py
ADDED
@@ -0,0 +1,22 @@
+import os
+import json
+
+import numpy as np
+import torch
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+
+
+class TranscriberModel:
+    def __init__(self, lang: str = 'en'):
+        with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
+            config = json.load(f)
+        self.processor = Wav2Vec2Processor.from_pretrained(config['language_model_names'][lang])
+        self.model = Wav2Vec2ForCTC.from_pretrained(config['language_model_names'][lang])
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    def forward(self, speech_array: np.array, sampling_rate: int = 16000) -> str:
+        model_input = self.processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
+        with torch.no_grad():
+            logits = self.model(model_input.input_values, attention_mask=model_input.attention_mask).logits
+        predicted_ids = torch.argmax(logits, dim=-1)
+        return self.processor.batch_decode(predicted_ids)
models/.gitkeep
ADDED
File without changes
notebooks/.gitkeep
ADDED
File without changes
notebooks/CLI_Example.ipynb
ADDED
The diff for this file is too large to render.
pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/classifier.ckpt
ADDED
@@ -0,0 +1 @@
+C:/Users/konst/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/e2cc27f853f99bd5d539432f0cba3f124c059f71/classifier.ckpt
pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/embedding_model.ckpt
ADDED
@@ -0,0 +1 @@
+C:/Users/konst/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/e2cc27f853f99bd5d539432f0cba3f124c059f71/embedding_model.ckpt
pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/hyperparams.yaml
ADDED
@@ -0,0 +1 @@
+C:/Users/konst/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/e2cc27f853f99bd5d539432f0cba3f124c059f71/hyperparams.yaml
pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/label_encoder.ckpt
ADDED
@@ -0,0 +1 @@
+C:/Users/konst/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/e2cc27f853f99bd5d539432f0cba3f124c059f71/label_encoder.txt
pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/mean_var_norm_emb.ckpt
ADDED
@@ -0,0 +1 @@
+C:/Users/konst/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/e2cc27f853f99bd5d539432f0cba3f124c059f71/mean_var_norm_emb.ckpt
requirements.txt
ADDED
@@ -0,0 +1,64 @@
+accelerate==0.21.0
+aiohttp==3.8.4
+aiosignal==1.3.1
+appdirs==1.4.4
+async-timeout==4.0.2
+attrs==23.1.0
+audioread==3.0.0
+certifi==2023.5.7
+cffi==1.15.1
+charset-normalizer==3.2.0
+colorama==0.4.6
+datasets==2.13.1
+decorator>=4.0.2
+dill==0.3.6
+filelock==3.12.2
+frozenlist==1.4.0
+fsspec==2023.6.0
+huggingface-hub==0.16.4
+HyperPyYAML==1.2.1
+idna==3.4
+Jinja2==3.1.2
+joblib==1.3.1
+lazy_loader==0.3
+librosa==0.10.0.post2
+llvmlite==0.40.1
+MarkupSafe==2.1.3
+mpmath==1.3.0
+msgpack==1.0.5
+multidict==6.0.4
+multiprocess==0.70.14
+networkx==3.1
+numba==0.57.1
+numpy>=1.22
+packaging==23.1
+pandas>=1.5.3
+pooch==1.6.0
+psutil==5.9.5
+pyarrow>=3.0.0
+pycparser==2.21
+python-dateutil==2.8.2
+pytz==2023.3
+PyYAML==6.0
+ruamel.yaml==0.17.28
+ruamel.yaml.clib==0.2.7
+safetensors==0.3.1
+scikit-learn==1.3.0
+scipy==1.11.1
+sentencepiece==0.1.99
+six==1.16.0
+soundfile==0.12.1
+soxr==0.3.5
+speechbrain==0.5.14
+sympy==1.12
+threadpoolctl==3.2.0
+tokenizers==0.13.3
+torch==2.0.1
+torchaudio==2.0.2
+tqdm==4.65.0
+transformers==4.30.2
+typing_extensions==4.7.1
+tzdata==2023.3
+urllib3==2.0.3
+xxhash==3.2.0
+yarl==1.9.2
scripts/cloning_inference.py
ADDED
@@ -0,0 +1,30 @@
+import argparse
+import json
+import os
+
+import soundfile as sf
+
+from deep_voice_cloning.cloning.model import CloningModel
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_path", type=str, default=None, help="Path to model directory")
+    parser.add_argument("--input_text", type=str, default=None, help="Text to be synthesized")
+    parser.add_argument("--output_path", type=str, default=None, help="Path to output audio file")
+    args = parser.parse_args()
+
+    with open(os.path.join(os.path.dirname(__file__), "inference_config.json")) as f:
+        config = json.load(f)
+
+    if args.model_path is not None:
+        config['model_path'] = args.model_path
+    if args.input_text is not None:
+        config['input_text'] = args.input_text
+    if args.output_path is not None:
+        config['output_path'] = args.output_path
+
+    cloning_model = CloningModel(config)
+    waveform_array = cloning_model.forward(config["input_text"])
+
+    sf.write(config['output_path'], waveform_array, samplerate=16000)
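The script merges any CLI flags over scripts/inference_config.json and then drives CloningModel directly. A minimal sketch of doing the same from Python, assuming a fine-tuned model directory produced by scripts/train.py (the model path below is a placeholder):

    # Sketch only: programmatic equivalent of scripts/cloning_inference.py.
    import soundfile as sf
    from deep_voice_cloning.cloning.model import CloningModel

    config = {
        "model_path": "models/microsoft_speecht5_tts_hank",           # placeholder fine-tuned model dir
        "speaker_model_name": "speechbrain/spkrec-xvect-voxceleb",
        "vocoder_name": "microsoft/speecht5_hifigan",
    }
    model = CloningModel(config)            # loads processor, SpeechT5, vocoder and the saved speaker embedding
    waveform = model.forward("Hello from a cloned voice.")
    sf.write("scripts/output/hello.wav", waveform, samplerate=16000)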
scripts/inference_config.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "model_path": "/content/deep-voice-cloning/models/microsoft_speecht5_tts_hank_hill",
+  "speaker_model_name": "speechbrain/spkrec-xvect-voxceleb",
+  "vocoder_name": "microsoft/speecht5_hifigan",
+  "input_text": "do the things, not because they are easy, but because they are hard",
+  "output_path": "/content/deep-voice-cloning/scripts/output/do_the_things.wav"
+}
scripts/input/hank.mp3
ADDED
Binary file (526 kB).
scripts/input/homer.mp3
ADDED
Binary file (913 kB).
scripts/output/.gitkeep
ADDED
File without changes
scripts/train.py
ADDED
@@ -0,0 +1,71 @@
+import argparse
+import json
+import os
+from pathlib import Path
+
+import torch
+from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
+
+from deep_voice_cloning.cloning.model import CloningModel
+from deep_voice_cloning.transcriber.model import TranscriberModel
+from deep_voice_cloning.data.collator import TTSDataCollatorWithPadding
+from deep_voice_cloning.data.dataset import get_cloning_dataset
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--lang", type=str, default=None, help="Language of speech samples")
+    parser.add_argument("--audio_path", type=str, default=None, help="Path to training audio file")
+    parser.add_argument("--output_dir", type=str, default=None, help="Path to output directory for trained model")
+    args = parser.parse_args()
+
+    with open(os.path.join(os.path.dirname(__file__), "training_config.json")) as f:
+        training_config = json.load(f)
+
+    if args.lang is not None:
+        training_config['lang'] = args.lang
+    if args.audio_path is not None:
+        training_config['audio_path'] = Path(args.audio_path)
+    if args.output_dir is not None:
+        training_config['output_dir'] = Path(args.output_dir)
+
+    transcriber_model = TranscriberModel(lang=training_config['lang'])
+    cloning_model = CloningModel(lang=training_config['lang'])
+
+    dataset = get_cloning_dataset(training_config['audio_path'], transcriber_model, cloning_model)
+    data_collator = TTSDataCollatorWithPadding(processor=cloning_model.processor, model=cloning_model.model)
+
+    training_args = Seq2SeqTrainingArguments(
+        output_dir=training_config["output_dir"],
+        per_device_train_batch_size=training_config['batch_size'],
+        gradient_accumulation_steps=2,
+        overwrite_output_dir=True,
+        learning_rate=training_config['learning_rate'],
+        warmup_steps=training_config['warmup_steps'],
+        max_steps=training_config['max_steps'],
+        gradient_checkpointing=True,
+        fp16=transcriber_model.device == torch.device("cuda"),
+        evaluation_strategy="steps",
+        per_device_eval_batch_size=8,
+        save_strategy="no",
+        eval_steps=100,
+        logging_steps=20,
+        load_best_model_at_end=False,
+        greater_is_better=False,
+        label_names=["labels"],
+    )
+
+    trainer = Seq2SeqTrainer(
+        args=training_args,
+        model=cloning_model.model,
+        train_dataset=dataset,
+        eval_dataset=dataset,
+        data_collator=data_collator,
+        tokenizer=cloning_model.processor.tokenizer,
+    )
+
+    trainer.train()
+    cloning_model.save_pretrained(Path(training_config["output_dir"]) /
+                                  Path(cloning_model.config['model_path'].replace('/', '_')
+                                       + '_' + Path(training_config['audio_path']).stem)
+                                  )
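After training, the script saves the fine-tuned model under a directory named from the base checkpoint and the stem of the training audio file. A small sketch of that path logic, using the defaults from scripts/training_config.json:

    # Sketch only: reproduces the output-directory naming at the end of scripts/train.py.
    from pathlib import Path

    output_dir = Path("/content/deep-voice-cloning/models")
    base_model = "microsoft/speecht5_tts"
    audio_path = Path("/content/deep-voice-cloning/scripts/input/hank_hill.mp3")

    model_dir = output_dir / (base_model.replace("/", "_") + "_" + audio_path.stem)
    print(model_dir)  # /content/deep-voice-cloning/models/microsoft_speecht5_tts_hank_hill

That resulting directory is exactly what scripts/inference_config.json points at as model_path.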
scripts/training_config.json
ADDED
@@ -0,0 +1,9 @@
+{
+  "audio_path": "/content/deep-voice-cloning/scripts/input/hank_hill.mp3",
+  "output_dir": "/content/deep-voice-cloning/models",
+  "lang": "en",
+  "batch_size": 2,
+  "learning_rate": 1e-4,
+  "max_steps": 300,
+  "warmup_steps": 30
+}
setup.py
ADDED
@@ -0,0 +1,106 @@
+from pathlib import Path
+
+from setuptools import find_packages, setup
+
+README_TEXT = (Path(__file__).parent / "README.md").read_text(encoding="utf-8")
+
+MAINTAINER = "Konstantin Verner"
+MAINTAINER_EMAIL = "[email protected]"
+REQUIRED_PKGS = ["accelerate==0.21.0",
+                 "aiohttp==3.8.4",
+                 "aiosignal==1.3.1",
+                 "appdirs==1.4.4",
+                 "async-timeout==4.0.2",
+                 "attrs==23.1.0",
+                 "audioread==3.0.0",
+                 "certifi==2023.5.7",
+                 "cffi==1.15.1",
+                 "charset-normalizer==3.2.0",
+                 "colorama==0.4.6",
+                 "datasets==2.13.1",
+                 "decorator>=4.0.2",
+                 "dill==0.3.6",
+                 "filelock==3.12.2",
+                 "frozenlist==1.4.0",
+                 "fsspec==2023.6.0",
+                 "huggingface-hub==0.16.4",
+                 "HyperPyYAML==1.2.1",
+                 "idna==3.4",
+                 "Jinja2==3.1.2",
+                 "joblib==1.3.1",
+                 "lazy_loader==0.3",
+                 "librosa==0.10.0.post2",
+                 "llvmlite==0.40.1",
+                 "MarkupSafe==2.1.3",
+                 "mpmath==1.3.0",
+                 "msgpack==1.0.5",
+                 "multidict==6.0.4",
+                 "multiprocess==0.70.14",
+                 "networkx==3.1",
+                 "numba==0.57.1",
+                 "numpy>=1.22",
+                 "packaging==23.1",
+                 "pandas>=1.5.3",
+                 "pooch==1.6.0",
+                 "psutil==5.9.5",
+                 "pyarrow>=3.0.0",
+                 "pycparser==2.21",
+                 "python-dateutil==2.8.2",
+                 "pytz==2023.3",
+                 "PyYAML==6.0",
+                 "ruamel.yaml==0.17.28",
+                 "ruamel.yaml.clib==0.2.7",
+                 "safetensors==0.3.1",
+                 "scikit-learn==1.3.0",
+                 "scipy==1.11.1",
+                 "sentencepiece==0.1.99",
+                 "six==1.16.0",
+                 "soundfile==0.12.1",
+                 "soxr==0.3.5",
+                 "speechbrain==0.5.14",
+                 "sympy==1.12",
+                 "threadpoolctl==3.2.0",
+                 "tokenizers==0.13.3",
+                 "torch==2.0.1",
+                 "torchaudio==2.0.2",
+                 "tqdm==4.65.0",
+                 "transformers==4.30.2",
+                 "typing_extensions==4.7.1",
+                 "tzdata==2023.3",
+                 "urllib3==2.0.3",
+                 "xxhash==3.2.0",
+                 "yarl==1.9.2"]
+
+print(find_packages("src"))
+
+setup(
+    name="deep_voice_cloning",
+    version="0.1.0",
+    description="Few-Shot Voice Cloning",
+    long_description=README_TEXT,
+    long_description_content_type="text/markdown",
+    maintainer=MAINTAINER,
+    maintainer_email=MAINTAINER_EMAIL,
+    url="",
+    download_url="",
+    license="MIT",
+    package_dir={"": "src"},
+    packages=find_packages("src"),
+    include_package_data=True,
+    package_data={"": ["*.json"]},
+    install_requires=REQUIRED_PKGS,
+    classifiers=[
+        "Development Status :: 1 - Planning",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Education",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: MIT",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    ],
+    keywords="asr, machine learning, fewshot learning, transformers",
+    zip_safe=False,  # Required for mypy to find the py.typed file
+)
src/deep_voice_cloning/__init__.py
ADDED
File without changes
src/deep_voice_cloning/cloning/__init__.py
ADDED
File without changes
src/deep_voice_cloning/cloning/config.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "en": {
+    "model_path": "microsoft/speecht5_tts",
+    "vocoder_name": "microsoft/speecht5_hifigan",
+    "speaker_model_name": "speechbrain/spkrec-xvect-voxceleb"
+  }
+}
src/deep_voice_cloning/cloning/model.py
ADDED
@@ -0,0 +1,57 @@
+import os
+import json
+from typing import Dict
+from pathlib import Path
+
+import numpy as np
+import torch
+from speechbrain.pretrained import EncoderClassifier
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+
+
+class CloningModel:
+    def __init__(self, config: Dict[str, Dict[str, str]] = None, lang: str = 'en'):
+        super(CloningModel, self).__init__()
+        if config is None:
+            self.speaker_embedding = None
+            with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
+                self.config = json.load(f)[lang]
+        else:
+            self.config = config
+            self.speaker_embedding = torch.load(Path(self.config['model_path']) / "speaker_embedding.pt")[0]
+        self.processor = SpeechT5Processor.from_pretrained(self.config['model_path'])
+        self.model = SpeechT5ForTextToSpeech.from_pretrained(self.config['model_path'])
+        self.vocoder = SpeechT5HifiGan.from_pretrained(self.config['vocoder_name'])
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.speaker_model = EncoderClassifier.from_hparams(source=self.config['speaker_model_name'])
+        self.to(self.device)
+
+
+
+    def to(self, device: torch.device):
+        self.model = self.model.to(device)
+        self.vocoder = self.vocoder.to(device)
+
+    def save_pretrained(self, save_directory: str):
+        self.model.save_pretrained(save_directory)
+        self.processor.save_pretrained(save_directory)
+        torch.save(self.speaker_embedding, Path(save_directory) / "speaker_embedding.pt")
+
+    def forward(self, text: str) -> np.array:
+        # tokenize text
+        inputs = self.processor(text=text, return_tensors="pt")
+        # generate spectrogram using backbone model
+        spectrogram = self.model.generate_speech(inputs["input_ids"].to(self.device),
+                                                 self.speaker_embedding.to(self.device))
+        # decode spectrogram into waveform using vocoder
+        with torch.no_grad():
+            waveform_array = self.vocoder(spectrogram).detach().cpu().numpy()
+        return waveform_array
+
+    def create_speaker_embedding(self, waveform: torch.tensor) -> torch.tensor:
+        with torch.no_grad():
+            speaker_embeddings = self.speaker_model.encode_batch(waveform)
+            speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
+            self.speaker_embedding = speaker_embeddings
+            speaker_embeddings = speaker_embeddings.squeeze()
+        return speaker_embeddings
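A minimal usage sketch for CloningModel without any fine-tuning: embed a reference voice with create_speaker_embedding, then synthesize. This assumes a 16 kHz file named reference.wav (a placeholder), and it re-indexes the stored embedding because create_speaker_embedding keeps the un-squeezed batch output while the fine-tuned loading path loads the [0]-indexed tensor:

    # Sketch only: zero-shot conditioning with CloningModel, no training step.
    import librosa
    import soundfile as sf
    import torch
    from deep_voice_cloning.cloning.model import CloningModel

    model = CloningModel(lang="en")                          # base microsoft/speecht5_tts + HiFi-GAN vocoder
    waveform, _ = librosa.load("reference.wav", sr=16000)    # placeholder reference clip
    model.create_speaker_embedding(torch.tensor(waveform))   # x-vector stored on the model (batched shape)
    model.speaker_embedding = model.speaker_embedding[0]     # same indexing the fine-tuned path applies on load
    audio = model.forward("This voice is conditioned on the reference sample.")
    sf.write("zero_shot.wav", audio, samplerate=16000)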
src/deep_voice_cloning/data/__init__.py
ADDED
File without changes
src/deep_voice_cloning/data/collator.py
ADDED
@@ -0,0 +1,45 @@
+import torch
+from typing import Any, Dict, List, Union
+
+
+class TTSDataCollatorWithPadding:
+
+    def __init__(self, model, processor):
+        self.model = model
+        self.processor = processor
+
+    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+        input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
+        label_features = [{"input_values": feature["labels"]} for feature in features]
+        speaker_features = [feature["speaker_embeddings"] for feature in features]
+
+        # collate the inputs and targets into a batch
+        batch = self.processor.pad(
+            input_ids=input_ids,
+            labels=label_features,
+            return_tensors="pt",
+        )
+
+        # replace padding with -100 to ignore loss correctly
+        batch["labels"] = batch["labels"].masked_fill(
+            batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100
+        )
+
+        # not used during fine-tuning
+        del batch["decoder_attention_mask"]
+
+        # round down target lengths to multiple of reduction factor
+        if self.model.config.reduction_factor > 1:
+            target_lengths = torch.tensor([
+                len(feature["input_values"]) for feature in label_features
+            ])
+            target_lengths = target_lengths.new([
+                length - length % self.model.config.reduction_factor for length in target_lengths
+            ])
+            max_length = max(target_lengths)
+            batch["labels"] = batch["labels"][:, :max_length]
+
+        # add the speaker embeddings
+        batch["speaker_embeddings"] = torch.tensor(speaker_features)
+
+        return batch
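The key step in the collator is masking padded spectrogram frames with -100 so the loss ignores them. A tiny self-contained illustration of that masked_fill pattern on toy tensors:

    # Sketch only: the label-masking trick the collator relies on, shown on toy data.
    import torch

    labels = torch.rand(2, 4, 3)                        # [batch, frames, n_mels] toy spectrograms
    decoder_attention_mask = torch.tensor([[1, 1, 1, 0],
                                           [1, 1, 0, 0]])
    masked = labels.masked_fill(decoder_attention_mask.unsqueeze(-1).ne(1), -100)
    print(masked[0, 3], masked[1, 2])                   # padded frames are all -100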
src/deep_voice_cloning/data/dataset.py
ADDED
@@ -0,0 +1,63 @@
+from typing import Dict, Any
+
+import torch
+import librosa
+import numpy as np
+from datasets import Dataset
+
+from ..cloning.model import CloningModel
+from ..transcriber.model import TranscriberModel
+
+
+def prepare_dataset(example: Dict[str, Any], model: CloningModel) -> Dict[str, Any]:
+    """
+    Prepare a single example for training
+    """
+    # feature extraction and tokenization
+    processed_example = model.processor(
+        text=example["normalized_text"],
+        audio_target=example["audio"]["array"],
+        sampling_rate=16000,
+        return_attention_mask=False,
+    )
+
+    # strip off the batch dimension
+    if len(torch.tensor(processed_example['input_ids']).shape) > 1:
+        processed_example['input_ids'] = processed_example['input_ids'][0]
+
+    processed_example["labels"] = processed_example["labels"][0]
+
+    # use SpeechBrain to obtain x-vector
+    processed_example["speaker_embeddings"] = model.create_speaker_embedding(
+        torch.tensor(example["audio"]["array"])
+    ).numpy()
+
+    return processed_example
+
+
+def get_cloning_dataset(input_audio_path: str,
+                        transcriber_model: TranscriberModel,
+                        cloning_model: CloningModel,
+                        sampling_rate: int = 16000,
+                        window_size_secs: int = 5) -> Dataset:
+    """
+    Create dataset by transcribing an audio file using a pretrained Wav2Vec2 model.
+    """
+    speech_array, _ = librosa.load(input_audio_path, sr=sampling_rate)
+
+    # split a waveform into splits of 5 secs each
+    speech_arrays = np.split(speech_array, range(0, len(speech_array), window_size_secs * sampling_rate))[1:]
+    texts = [transcriber_model.forward(speech_array, sampling_rate=sampling_rate)
+             for speech_array in speech_arrays]
+
+    dataset = Dataset.from_list([
+        {'audio': {'array': speech_arrays[i]}, 'normalized_text': texts[i]}
+        for i in range(len(speech_arrays))]
+    )
+
+    dataset = dataset.map(
+        prepare_dataset, fn_kwargs={'model': cloning_model},
+        remove_columns=dataset.column_names,
+    )
+
+    return dataset
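get_cloning_dataset cuts the waveform into fixed 5-second windows before transcription. A small self-contained example of that np.split windowing; the leading empty chunk produced by splitting at offset 0 is the reason for the [1:]:

    # Sketch only: the windowing used in get_cloning_dataset, on a synthetic waveform.
    import numpy as np

    sampling_rate, window_size_secs = 16000, 5
    speech_array = np.zeros(7 * sampling_rate)           # pretend 7 seconds of audio
    chunks = np.split(speech_array, range(0, len(speech_array), window_size_secs * sampling_rate))[1:]
    print([len(c) / sampling_rate for c in chunks])      # [5.0, 2.0] -> one full window plus the remainder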
src/deep_voice_cloning/transcriber/__init__.py
ADDED
File without changes
src/deep_voice_cloning/transcriber/config.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "language_model_names": {
+    "en": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
+    "fr": "jonatasgrosman/wav2vec2-large-xlsr-53-french",
+    "de": "jonatasgrosman/wav2vec2-large-xlsr-53-german"
+  }
+}
src/deep_voice_cloning/transcriber/model.py
ADDED
@@ -0,0 +1,22 @@
+import os
+import json
+
+import numpy as np
+import torch
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+
+
+class TranscriberModel:
+    def __init__(self, lang: str = 'en'):
+        with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
+            config = json.load(f)
+        self.processor = Wav2Vec2Processor.from_pretrained(config['language_model_names'][lang])
+        self.model = Wav2Vec2ForCTC.from_pretrained(config['language_model_names'][lang])
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    def forward(self, speech_array: np.array, sampling_rate: int = 16000) -> str:
+        model_input = self.processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
+        with torch.no_grad():
+            logits = self.model(model_input.input_values, attention_mask=model_input.attention_mask).logits
+        predicted_ids = torch.argmax(logits, dim=-1)
+        return self.processor.batch_decode(predicted_ids)
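A minimal usage sketch for TranscriberModel on one of the bundled clips; batch_decode returns a list, so the transcript is its first element:

    # Sketch only: transcribing a 16 kHz mono clip with TranscriberModel.
    import librosa
    from deep_voice_cloning.transcriber.model import TranscriberModel

    transcriber = TranscriberModel(lang="en")            # wav2vec2-large-xlsr-53-english per config.json
    speech, _ = librosa.load("scripts/input/hank.mp3", sr=16000)
    text = transcriber.forward(speech, sampling_rate=16000)
    print(text[0])                                       # first (and only) transcript in the returned list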