import os

import torch
from trainer import Trainer, TrainerArgs

from TTS.config import load_config
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.managers import save_file
from tqdm import tqdm
import json
import gdown  # used below to download the CML-TTS checkpoint
import tarfile

torch.set_num_threads(24)


def nemo(root_path, meta_file, **kwargs):
    """
    Normalizes NeMo-style JSON-lines manifest files to the Coqui TTS sample format.
    """
    meta_path = os.path.join(root_path, meta_file)
    items = []
    with open(meta_path, "r", encoding="utf-8") as ttf:
        for line in ttf:
            cols = json.loads(line)
            wav_file = cols["audio_filepath"]
            text = cols["text"]
            speaker_name = cols.get("speaker_name", "one")
            language = cols.get("language", "")
            items.append(
                {
                    "text": text,
                    "audio_file": wav_file,
                    "speaker_name": speaker_name,
                    "language": language,
                    "root_path": root_path,
                }
            )
    return items
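
# Example usage of the formatter (commented out; paths are illustrative and match dataset_conf below):
# samples = nemo("data/hausa/tts_data", "manifest_train.jsonl")
# print(samples[0])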


def compute_embeddings(
    model_path,
    config_path,
    output_path,
    old_speakers_file=None,
    old_append=False,
    config_dataset_path=None,
    formatter=None,
    dataset_name=None,
    dataset_path=None,
    meta_file_train=None,
    meta_file_val=None,
    disable_cuda=False,
    no_eval=False,
):
    use_cuda = torch.cuda.is_available() and not disable_cuda

    if config_dataset_path is not None:
        c_dataset = load_config(config_dataset_path)
        meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not no_eval)
    else:
        c_dataset = BaseDatasetConfig()
        c_dataset.dataset_name = dataset_name
        c_dataset.path = dataset_path
        if meta_file_train is not None:
            c_dataset.meta_file_train = meta_file_train
        if meta_file_val is not None:
            c_dataset.meta_file_val = meta_file_val
        meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not no_eval, formatter=formatter)

    if meta_data_eval is None:
        samples = meta_data_train
    else:
        samples = meta_data_train + meta_data_eval

    encoder_manager = SpeakerManager(
        encoder_model_path=model_path,
        encoder_config_path=config_path,
        d_vectors_file_path=old_speakers_file,
        use_cuda=use_cuda,
    )

    class_name_key = encoder_manager.encoder_config.class_name_key

    # compute speaker embeddings
    if old_speakers_file is not None and old_append:
        speaker_mapping = encoder_manager.embeddings
    else:
        speaker_mapping = {}

    for fields in tqdm(samples):
        class_name = fields[class_name_key]
        audio_file = fields["audio_file"]
        embedding_key = fields["audio_unique_name"]

        # Only update the speaker name when the embedding is already in the old file.
        if embedding_key in speaker_mapping:
            speaker_mapping[embedding_key]["name"] = class_name
            continue

        if old_speakers_file is not None and embedding_key in encoder_manager.clip_ids:
            # get the embedding from the old file
            embedd = encoder_manager.get_embedding_by_clip(embedding_key)
        else:
            # extract the embedding
            embedd = encoder_manager.compute_embedding_from_clip(audio_file)

        # add the new embedding to the mapping
        speaker_mapping[embedding_key] = {}
        speaker_mapping[embedding_key]["name"] = class_name
        speaker_mapping[embedding_key]["embedding"] = embedd

    if speaker_mapping:
        # save the speaker mapping
        if os.path.isdir(output_path):
            mapping_file_path = os.path.join(output_path, "speakers.pth")
        else:
            mapping_file_path = output_path

        if os.path.dirname(mapping_file_path) != "":
            os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)

        save_file(speaker_mapping, mapping_file_path)
        print("Speaker embeddings saved at:", mapping_file_path)

OUT_PATH = "yourtts_hausa"
LANG_NAME = "hausa"
ISO = "ha"

# Name of the run for the Trainer
RUN_NAME = f"YourTTS-{LANG_NAME.capitalize()}"

# To do transfer learning and speed up training, set here the path to the CML-TTS checkpoint, which can be downloaded from: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
RESTORE_PATH = os.path.join(OUT_PATH, "checkpoints_yourtts_cml_tts_dataset/best_model.pth")

URL = "https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p"
OUTPUT_CHECKPOINTS_FILEPATH = os.path.join(OUT_PATH, "checkpoints_yourtts_cml_tts_dataset.tar.bz")

# Download the CML-TTS checkpoint if it does not exist
if not os.path.exists(RESTORE_PATH):
    print(f"Downloading the CML-TTS checkpoint from {URL}")
    gdown.download(url=URL, output=OUTPUT_CHECKPOINTS_FILEPATH, quiet=False, fuzzy=True)
    with tarfile.open(OUTPUT_CHECKPOINTS_FILEPATH, "r:bz2") as tar:
        tar.extractall(OUT_PATH)
else:
    print(f"Checkpoint already exists at {RESTORE_PATH}")

# This parameter is useful for debugging: it skips the training epochs and only runs evaluation and synthesizes the test sentences
SKIP_TRAIN_EPOCH = False

# Set here the batch size to be used in training and evaluation
BATCH_SIZE = 4

# Note: If you add new datasets, make sure their sampling rate matches this parameter; otherwise, resample your audio
SAMPLE_RATE = 24000

# Max audio length in seconds to be used in training
MAX_AUDIO_LEN_IN_SECONDS = 11
# Min audio length in seconds to be used in training
MIN_AUDIO_LEN_IN_SECONDS = 0.8

dataset_conf = BaseDatasetConfig(
    dataset_name=f"{ISO}_openbible",
    meta_file_train="manifest_train.jsonl",
    meta_file_val="manifest_dev.jsonl",
    language=ISO,
    path="data/hausa/tts_data"
)
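# Assumes data/hausa/tts_data contains manifest_train.jsonl and manifest_dev.jsonl; wav paths are taken
# from each manifest line's "audio_filepath" field (see the nemo formatter above).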

### Extract speaker embeddings
SPEAKER_ENCODER_CHECKPOINT_PATH = (
    "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
)
SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"

D_VECTOR_FILES = []  # List of speaker embeddings/d-vectors to be used during the training

# Check whether the speaker embeddings have already been computed; if not, compute them
embeddings_file = os.path.join(dataset_conf.path, "speakers.pth")
if not os.path.isfile(embeddings_file):
    print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
    compute_embeddings(
        SPEAKER_ENCODER_CHECKPOINT_PATH,
        SPEAKER_ENCODER_CONFIG_PATH,
        embeddings_file,
        formatter=nemo,
        dataset_name=dataset_conf.dataset_name,
        dataset_path=dataset_conf.path,
        meta_file_train=dataset_conf.meta_file_train,
        meta_file_val=dataset_conf.meta_file_val,
    )
D_VECTOR_FILES.append(embeddings_file)

# Audio config used in training.
audio_config = VitsAudioConfig(
    sample_rate=SAMPLE_RATE,
    hop_length=256,
    win_length=1024,
    fft_size=1024,
    mel_fmin=0.0,
    mel_fmax=None,
    num_mels=80,
)
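# These values are assumed to match the CML-TTS checkpoint used for transfer learning; the sample rate
# must also match the dataset audio (see the note on SAMPLE_RATE above).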

# Init VitsArgs with the arguments needed for the YourTTS model
model_args = VitsArgs(
    spec_segment_size=62,
    hidden_channels=192,
    hidden_channels_ffn_text_encoder=768,
    num_heads_text_encoder=2,
    num_layers_text_encoder=10,
    kernel_size_text_encoder=3,
    dropout_p_text_encoder=0.1,
    d_vector_file=D_VECTOR_FILES,
    use_d_vector_file=True,
    d_vector_dim=512,
    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
    resblock_type_decoder="2",  # The YourTTS paper model was accidentally trained with ResBlock type 2; you can use ResBlock type 1 as in the original VITS model
    # Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
    use_speaker_encoder_as_loss=False,
    # Useful parameters to enable multilingual training
    use_language_embedding=True,
    embedded_language_dim=4,
)
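# d_vector_dim=512 is assumed to match the embedding size produced by the speaker encoder referenced in
# SPEAKER_ENCODER_CHECKPOINT_PATH above.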

CHARS = ["'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'r', 's', 't', 'u', 'w', 'y', 'z', 'ā', 'ă', 'ū', 'ƙ', 'ɓ', 'ɗ', '’']
PUNCT = [' ', '!', ',', '.', ':', ';', '?']

TEST_SENTENCES = [
    ["umarnai don zaman tsarki.", "two", None, "ha"],
    ["wanda kuma ya faɗa mana ƙaunar da kuke yi cikin ruhu.", "one", None, "ha"],
    ["gama mun ji labarin bangaskiyarku a cikin yesu kiristi da kuma ƙaunar da kuke yi saboda dukan tsarkaka.", "two", None, "ha"],
]
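# Each test sentence is [text, speaker_name, style_wav, language]; style_wav is left as None here.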

# General training config; here you can change the batch size and other useful parameters
config = VitsConfig(
    output_path=OUT_PATH,
    model_args=model_args,
    run_name=RUN_NAME,
    project_name="YourTTS",
    run_description=f"""
            - YourTTS trained using the {LANG_NAME.capitalize()} OpenBible dataset.
        """,
    dashboard_logger="tensorboard",
    logger_uri=None,
    audio=audio_config,
    batch_size=BATCH_SIZE,
    batch_group_size=4,
    eval_batch_size=BATCH_SIZE,
    num_loader_workers=8,
    # eval_split_max_size=256,
    print_step=50,
    plot_step=100,
    # log_model_step=1000,
    save_step=1000,
    save_n_checkpoints=2,
    save_checkpoints=True,
    target_loss="loss_1",
    print_eval=True,
    compute_input_seq_cache=True,
    add_blank=True,
    text_cleaner="no_cleaners",
    characters=CharactersConfig(
        characters_class="TTS.tts.models.vits.VitsCharacters",
        pad="_",
        eos="&",
        bos="*",
        blank=None,
        characters="".join(CHARS),
        punctuations="".join(PUNCT),
    ),
    phoneme_cache_path=None,
    precompute_num_workers=12,
    start_by_longest=True,
    datasets=[dataset_conf],
    cudnn_benchmark=False,
    min_audio_len=int(SAMPLE_RATE * MIN_AUDIO_LEN_IN_SECONDS),
    max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
    mixed_precision=True,
    test_sentences=TEST_SENTENCES,
    # Enable the weighted sampler
    # use_weighted_sampler=True,
    # Ensures that all speakers appear equally often in the training batches, no matter how many samples each speaker has
    # weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
    # weighted_sampler_attrs={"language": 1.0},
    # weighted_sampler_multipliers={
    #     # "speaker_name": {
    #     # You can force the batching scheme to give a higher weight to a certain speaker so that it
    #     # appears more frequently in the batch, which speeds up speaker adaptation. For example, with the
    #     # CML train dataset and "new_speaker" as the speaker you want to adapt to, the line below makes the
    #     # balancer treat "new_speaker" as 106 speakers, i.e. 1/4 of the speakers present in the CML dataset.
    #     # 'new_speaker': 106, # (CML tot. train speaker)/4 = (424/4) = 106
    #     # }
    # },
    # Sets the Speaker Consistency Loss (SCL) α to 9, as in the YourTTS paper
    speaker_encoder_loss_alpha=9.0,
)

# Load all the dataset samples and split training and evaluation sets
train_samples, eval_samples = load_tts_samples(
    config.datasets,
    eval_split=True,
    formatter=nemo,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)
print(f"Loaded {len(train_samples)} train samples")
print(f"Loaded {len(eval_samples)} eval samples")

# Init the model
model = Vits.init_from_config(config)

# Init the trainer and 🚀
trainer = Trainer(
    TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH),
    config,
    output_path=OUT_PATH,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)
trainer.fit()