Spaces:
Running
Running
| from dataclasses import dataclass, field | |
| from typing import List | |
| from TTS.tts.configs.shared_configs import BaseTTSConfig | |
| from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig | |
# `@dataclass` is required here: without it the annotated attributes below are
# not registered as fields and every `field(default_factory=...)` would remain
# a raw `dataclasses.Field` object instead of producing a fresh default value.
@dataclass
class XttsConfig(BaseTTSConfig):
    """Defines parameters for XTTS TTS model.

    Args:
        model (str):
            Model name. Do not change unless you know what you are doing.
            Defaults to `"xtts"`.

        model_args (XttsArgs):
            Model architecture arguments. Defaults to `XttsArgs()`.

        audio (XttsAudioConfig):
            Audio processing configuration. Defaults to `XttsAudioConfig()`.

        model_dir (str):
            Path to the folder that has all the XTTS models. Defaults to None.

        languages (List[str]):
            Language codes for this configuration. Defaults to the 16 codes
            listed in the field default below (`"en"`, `"es"`, ..., `"ja"`).

        temperature (float):
            Temperature for the autoregressive model inference. Larger values
            make predictions more creative at the cost of stability.
            Defaults to `0.85`.

        length_penalty (float):
            Exponential penalty to the length that is used with beam-based
            generation. It is applied as an exponent to the sequence length,
            which in turn is used to divide the score of the sequence. Since
            the score is the log likelihood of the sequence (i.e. negative),
            length_penalty > 0.0 promotes longer sequences, while
            length_penalty < 0.0 encourages shorter sequences.
            Defaults to `1.0`.

        repetition_penalty (float):
            The parameter for repetition penalty. 1.0 means no penalty.
            Defaults to `2.0`.

        top_k (int):
            Top-k value for sampling during inference; presumably the number
            of highest-probability tokens kept at each step (confirm against
            the model's inference code). Defaults to `50`.

        top_p (float):
            If set to float < 1, only the smallest set of most probable tokens
            with probabilities that add up to top_p or higher are kept for
            generation. Defaults to `0.85`.

        num_gpt_outputs (int):
            Number of samples taken from the autoregressive model, all of
            which are filtered using CLVP. As XTTS is a probabilistic model,
            more samples means a higher probability of creating something
            "great". Defaults to `1`.

        gpt_cond_len (int):
            Seconds of audio to be used as conditioning for the autoregressive
            model. Defaults to `12`.

        gpt_cond_chunk_len (int):
            Audio chunk size in seconds. Audio is split into chunks and latents
            are extracted for each chunk. Then the latents are averaged.
            Chunking improves the stability. It must be <= gpt_cond_len.
            If gpt_cond_len == gpt_cond_chunk_len, no chunking.
            Defaults to `4`.

        max_ref_len (int):
            Maximum number of seconds of audio to be used as conditioning for
            the decoder. Defaults to `10`.

        sound_norm_refs (bool):
            Whether to normalize the conditioning audio. Defaults to `False`.

    Note:
        Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the
        inherited parameters.

    Example:

        >>> from TTS.tts.configs.xtts_config import XttsConfig
        >>> config = XttsConfig()
    """

    model: str = "xtts"

    # model specific params
    # Mutable/object defaults go through `default_factory` so that every
    # config instance gets its own copy instead of sharing one object.
    model_args: XttsArgs = field(default_factory=XttsArgs)
    audio: XttsAudioConfig = field(default_factory=XttsAudioConfig)
    model_dir: str = None
    languages: List[str] = field(
        default_factory=lambda: [
            "en",
            "es",
            "fr",
            "de",
            "it",
            "pt",
            "pl",
            "tr",
            "ru",
            "nl",
            "cs",
            "ar",
            "zh-cn",
            "hu",
            "ko",
            "ja",
        ]
    )

    # inference params
    temperature: float = 0.85
    length_penalty: float = 1.0
    repetition_penalty: float = 2.0
    top_k: int = 50
    top_p: float = 0.85
    num_gpt_outputs: int = 1

    # cloning
    gpt_cond_len: int = 12
    gpt_cond_chunk_len: int = 4
    max_ref_len: int = 10
    sound_norm_refs: bool = False