|
import json
import os
import tarfile

import gdown
import torch
from tqdm import tqdm
from trainer import Trainer, TrainerArgs

from TTS.config import load_config
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig
from TTS.tts.utils.managers import save_file
from TTS.tts.utils.speakers import SpeakerManager

torch.set_num_threads(24)
|
|
|
|
|
def nemo(root_path, meta_file, **kwargs):
    """Normalizes NeMo-style JSON-lines manifests to the Coqui TTS sample format."""
    meta_path = os.path.join(root_path, meta_file)
    items = []
    with open(meta_path, "r", encoding="utf-8") as ttf:
        for line in ttf:
            cols = json.loads(line)
            items.append(
                {
                    "text": cols["text"],
                    "audio_file": cols["audio_filepath"],
                    "speaker_name": cols.get("speaker_name", "one"),
                    "language": cols.get("language", ""),
                    "root_path": root_path,
                }
            )
    return items
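
# A minimal example of the JSON-lines records this formatter parses
# (the filename and speaker label here are hypothetical):
#   {"audio_filepath": "wavs/utt_0001.wav", "text": "umarnai don zaman tsarki.",
#    "speaker_name": "one", "language": "ha"}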
|
|
|
|
|
def compute_embeddings(
    model_path,
    config_path,
    output_path,
    old_speakers_file=None,
    old_append=False,
    config_dataset_path=None,
    formatter=None,
    dataset_name=None,
    dataset_path=None,
    meta_file_train=None,
    meta_file_val=None,
    disable_cuda=False,
    no_eval=False,
):
    use_cuda = torch.cuda.is_available() and not disable_cuda

    # Load samples either from a full dataset config file or from the
    # individual dataset fields passed in as arguments.
    if config_dataset_path is not None:
        c_dataset = load_config(config_dataset_path)
        meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not no_eval)
    else:
        c_dataset = BaseDatasetConfig()
        c_dataset.dataset_name = dataset_name
        c_dataset.path = dataset_path
        if meta_file_train is not None:
            c_dataset.meta_file_train = meta_file_train
        if meta_file_val is not None:
            c_dataset.meta_file_val = meta_file_val
        meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not no_eval, formatter=formatter)

    if meta_data_eval is None:
        samples = meta_data_train
    else:
        samples = meta_data_train + meta_data_eval

    encoder_manager = SpeakerManager(
        encoder_model_path=model_path,
        encoder_config_path=config_path,
        d_vectors_file_path=old_speakers_file,
        use_cuda=use_cuda,
    )

    class_name_key = encoder_manager.encoder_config.class_name_key

    # Optionally extend the mapping from a previous speakers file instead of
    # starting from scratch.
    if old_speakers_file is not None and old_append:
        speaker_mapping = encoder_manager.embeddings
    else:
        speaker_mapping = {}

    for fields in tqdm(samples):
        class_name = fields[class_name_key]
        audio_file = fields["audio_file"]
        embedding_key = fields["audio_unique_name"]

        if embedding_key in speaker_mapping:
            speaker_mapping[embedding_key]["name"] = class_name
            continue

        if old_speakers_file is not None and embedding_key in encoder_manager.clip_ids:
            # Reuse the embedding stored in the old speakers file.
            embedding = encoder_manager.get_embedding_by_clip(embedding_key)
        else:
            # Extract the embedding directly from the audio clip.
            embedding = encoder_manager.compute_embedding_from_clip(audio_file)

        speaker_mapping[embedding_key] = {"name": class_name, "embedding": embedding}

    if speaker_mapping:
        # Save the mapping; output_path may be a directory or a file path.
        if os.path.isdir(output_path):
            mapping_file_path = os.path.join(output_path, "speakers.pth")
        else:
            mapping_file_path = output_path

        if os.path.dirname(mapping_file_path) != "":
            os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)

        save_file(speaker_mapping, mapping_file_path)
        print("Speaker embeddings saved at:", mapping_file_path)
|
|
|
# Output path and language settings for this run.
OUT_PATH = "yourtts_hausa"
LANG_NAME = "hausa"
ISO = "ha"

RUN_NAME = f"YourTTS-{LANG_NAME.capitalize()}"

# Fine-tune from the multilingual YourTTS checkpoint trained on CML-TTS.
RESTORE_PATH = os.path.join(OUT_PATH, "checkpoints_yourtts_cml_tts_dataset/best_model.pth")

URL = "https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p"
OUTPUT_CHECKPOINTS_FILEPATH = os.path.join(OUT_PATH, "checkpoints_yourtts_cml_tts_dataset.tar.bz")

# Download and extract the pretrained checkpoint if it is not already present.
if not os.path.exists(RESTORE_PATH):
    print(f"Downloading the CML-TTS checkpoint from {URL}")
    os.makedirs(OUT_PATH, exist_ok=True)
    gdown.download(url=URL, output=OUTPUT_CHECKPOINTS_FILEPATH, quiet=False, fuzzy=True)
    with tarfile.open(OUTPUT_CHECKPOINTS_FILEPATH, "r:bz2") as tar:
        tar.extractall(OUT_PATH)
else:
    print(f"Checkpoint already exists at {RESTORE_PATH}")
|
|
|
|
|
# Skip training and run only evaluation (useful for debugging).
SKIP_TRAIN_EPOCH = False

# Batch size used for both training and evaluation.
BATCH_SIZE = 4

# Sample rate of the dataset audio; it must match the restored checkpoint.
SAMPLE_RATE = 24000

# Clips outside this duration range are discarded during training.
MAX_AUDIO_LEN_IN_SECONDS = 11
MIN_AUDIO_LEN_IN_SECONDS = 0.8
|
|
|
dataset_conf = BaseDatasetConfig(
    dataset_name=f"{ISO}_openbible",
    meta_file_train="manifest_train.jsonl",
    meta_file_val="manifest_dev.jsonl",
    language=ISO,
    path="data/hausa/tts_data",
)
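
# The manifests are expected at the dataset path configured above, i.e.:
#   data/hausa/tts_data/manifest_train.jsonl
#   data/hausa/tts_data/manifest_dev.jsonl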
|
|
|
|
|
# Pretrained speaker encoder checkpoint and config released with Coqui TTS.
SPEAKER_ENCODER_CHECKPOINT_PATH = (
    "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
)
SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"

# Paths to precomputed d-vector (speaker embedding) files; filled in below.
D_VECTOR_FILES = []
|
|
|
|
|
# Compute the speaker embeddings once and cache them alongside the dataset.
embeddings_file = os.path.join(dataset_conf.path, "speakers.pth")
if not os.path.isfile(embeddings_file):
    print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
    compute_embeddings(
        SPEAKER_ENCODER_CHECKPOINT_PATH,
        SPEAKER_ENCODER_CONFIG_PATH,
        embeddings_file,
        formatter=nemo,
        dataset_name=dataset_conf.dataset_name,
        dataset_path=dataset_conf.path,
        meta_file_train=dataset_conf.meta_file_train,
        meta_file_val=dataset_conf.meta_file_val,
    )
D_VECTOR_FILES.append(embeddings_file)
|
|
|
|
|
audio_config = VitsAudioConfig(
    sample_rate=SAMPLE_RATE,
    hop_length=256,
    win_length=1024,
    fft_size=1024,
    mel_fmin=0.0,
    mel_fmax=None,
    num_mels=80,
)
|
|
|
|
|
model_args = VitsArgs(
    spec_segment_size=62,
    hidden_channels=192,
    hidden_channels_ffn_text_encoder=768,
    num_heads_text_encoder=2,
    num_layers_text_encoder=10,
    kernel_size_text_encoder=3,
    dropout_p_text_encoder=0.1,
    d_vector_file=D_VECTOR_FILES,
    use_d_vector_file=True,
    d_vector_dim=512,
    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
    resblock_type_decoder="2",
    # The speaker consistency loss is disabled for this run.
    use_speaker_encoder_as_loss=False,
    # Train a language embedding so the multilingual checkpoint can adapt to Hausa.
    use_language_embedding=True,
    embedded_language_dim=4,
)
|
|
|
# Character set covering the Hausa training transcripts.
CHARS = ["'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'r', 's', 't', 'u', 'w', 'y', 'z', 'ā', 'ă', 'ū', 'ƙ', 'ɓ', 'ɗ', '’']
PUNCT = [' ', '!', ',', '.', ':', ';', '?']

# Each test sentence is [text, speaker_name, style_wav, language_name].
TEST_SENTENCES = [
    ["umarnai don zaman tsarki.", "two", None, "ha"],
    ["wanda kuma ya faɗa mana ƙaunar da kuke yi cikin ruhu.", "one", None, "ha"],
    ["gama mun ji labarin bangaskiyarku a cikin yesu kiristi da kuma ƙaunar da kuke yi saboda dukan tsarkaka.", "two", None, "ha"],
]
|
|
|
|
|
config = VitsConfig(
    output_path=OUT_PATH,
    model_args=model_args,
    run_name=RUN_NAME,
    project_name="YourTTS",
    run_description=f"""
        - YourTTS trained using the {LANG_NAME.capitalize()} OpenBible dataset.
    """,
    dashboard_logger="tensorboard",
    logger_uri=None,
    audio=audio_config,
    batch_size=BATCH_SIZE,
    batch_group_size=4,
    eval_batch_size=BATCH_SIZE,
    num_loader_workers=8,
    print_step=50,
    plot_step=100,
    save_step=1000,
    save_n_checkpoints=2,
    save_checkpoints=True,
    target_loss="loss_1",
    print_eval=True,
    compute_input_seq_cache=True,
    add_blank=True,
    text_cleaner="no_cleaners",
    characters=CharactersConfig(
        characters_class="TTS.tts.models.vits.VitsCharacters",
        pad="_",
        eos="&",
        bos="*",
        blank=None,
        characters="".join(CHARS),
        punctuations="".join(PUNCT),
    ),
    phoneme_cache_path=None,
    precompute_num_workers=12,
    start_by_longest=True,
    datasets=[dataset_conf],
    cudnn_benchmark=False,
    min_audio_len=int(SAMPLE_RATE * MIN_AUDIO_LEN_IN_SECONDS),
    max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
    mixed_precision=True,
    test_sentences=TEST_SENTENCES,
    # Weight of the speaker consistency loss (only used when it is enabled).
    speaker_encoder_loss_alpha=9.0,
)
|
|
|
|
|
# Load train/eval samples using the NeMo manifest formatter defined above.
train_samples, eval_samples = load_tts_samples(
    config.datasets,
    eval_split=True,
    formatter=nemo,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)
print(f"Loaded {len(train_samples)} train samples")
print(f"Loaded {len(eval_samples)} eval samples")
|
|
|
|
|
# Initialize the model from the config.
model = Vits.init_from_config(config)

# Initialize the trainer, restore the CML-TTS weights, and start fine-tuning.
trainer = Trainer(
    TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH),
    config,
    output_path=OUT_PATH,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)
trainer.fit()
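
# After training, synthesis can be tried with the standard `tts` CLI; the run
# directory name below is an assumption, substitute the one the Trainer creates:
#   tts --model_path yourtts_hausa/<run_dir>/best_model.pth \
#       --config_path yourtts_hausa/<run_dir>/config.json \
#       --speaker_idx "one" --language_idx ha \
#       --text "umarnai don zaman tsarki." --out_path sample.wav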
|
|