Voice cloning with a single audio sample as input
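This commit adds the Coqui TTS package with zero-shot voice cloning: XTTS synthesizes speech in a target voice from one reference recording. A minimal sketch of that workflow through the package's Python API, using the `xtts_v2` entry from the registry added below (the two `.wav` paths are placeholders, not repo files):

```python
# Sketch: clone a voice from one reference clip with XTTS-v2.
# Assumes the TTS package from this commit is installed; "sample.wav"
# and "output.wav" are placeholder paths.
from TTS.api import TTS

# The model name is resolved against TTS/.models.json and the
# checkpoint is downloaded on first use.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")

tts.tts_to_file(
    text="This voice was cloned from a single audio sample.",
    speaker_wav="sample.wav",  # the single reference recording
    language="en",
    file_path="output.wav",
)
```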
This view is limited to the first 50 changed files because the commit contains too many changes.
- TTS/.models.json +937 -0
- TTS/VERSION +1 -0
- TTS/__init__.py +6 -0
- TTS/__pycache__/__init__.cpython-39.pyc +0 -0
- TTS/__pycache__/api.cpython-39.pyc +0 -0
- TTS/__pycache__/cs_api.cpython-39.pyc +0 -0
- TTS/__pycache__/model.cpython-39.pyc +0 -0
- TTS/api.py +489 -0
- TTS/bin/__init__.py +0 -0
- TTS/bin/collect_env_info.py +48 -0
- TTS/bin/compute_attention_masks.py +165 -0
- TTS/bin/compute_embeddings.py +197 -0
- TTS/bin/compute_statistics.py +96 -0
- TTS/bin/eval_encoder.py +88 -0
- TTS/bin/extract_tts_spectrograms.py +287 -0
- TTS/bin/find_unique_chars.py +45 -0
- TTS/bin/find_unique_phonemes.py +74 -0
- TTS/bin/remove_silence_using_vad.py +124 -0
- TTS/bin/resample.py +90 -0
- TTS/bin/synthesize.py +541 -0
- TTS/bin/train_encoder.py +319 -0
- TTS/bin/train_tts.py +71 -0
- TTS/bin/train_vocoder.py +77 -0
- TTS/bin/tune_wavegrad.py +103 -0
- TTS/config/__init__.py +138 -0
- TTS/config/__pycache__/__init__.cpython-39.pyc +0 -0
- TTS/config/__pycache__/shared_configs.cpython-39.pyc +0 -0
- TTS/config/shared_configs.py +268 -0
- TTS/cs_api.py +317 -0
- TTS/encoder/README.md +18 -0
- TTS/encoder/__init__.py +0 -0
- TTS/encoder/__pycache__/__init__.cpython-39.pyc +0 -0
- TTS/encoder/__pycache__/losses.cpython-39.pyc +0 -0
- TTS/encoder/configs/base_encoder_config.py +61 -0
- TTS/encoder/configs/emotion_encoder_config.py +12 -0
- TTS/encoder/configs/speaker_encoder_config.py +11 -0
- TTS/encoder/dataset.py +147 -0
- TTS/encoder/losses.py +226 -0
- TTS/encoder/models/__pycache__/base_encoder.cpython-39.pyc +0 -0
- TTS/encoder/models/__pycache__/lstm.cpython-39.pyc +0 -0
- TTS/encoder/models/__pycache__/resnet.cpython-39.pyc +0 -0
- TTS/encoder/models/base_encoder.py +161 -0
- TTS/encoder/models/lstm.py +99 -0
- TTS/encoder/models/resnet.py +198 -0
- TTS/encoder/requirements.txt +2 -0
- TTS/encoder/utils/__init__.py +0 -0
- TTS/encoder/utils/__pycache__/__init__.cpython-39.pyc +0 -0
- TTS/encoder/utils/__pycache__/generic_utils.cpython-39.pyc +0 -0
- TTS/encoder/utils/generic_utils.py +182 -0
- TTS/encoder/utils/io.py +38 -0
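The 50-file cutoff hides the rest of the package, including `TTS/utils/manage.py`, which consumes the `.models.json` registry shown below. Assuming the stock Coqui `ModelManager` API ships in those unlisted files, resolving and fetching a registry entry looks roughly like this:

```python
# Sketch: fetch a registry entry programmatically. Assumes the package
# ships Coqui's ModelManager (in TTS/utils/manage.py, beyond the
# 50-file cutoff above) with its usual interface.
from TTS.utils.manage import ModelManager

manager = ModelManager()  # reads the bundled TTS/.models.json

# Names follow the registry nesting: <type>/<language>/<dataset>/<model>.
model_path, config_path, model_item = manager.download_model(
    "tts_models/en/ljspeech/tacotron2-DDC"
)
print(model_path, model_item["default_vocoder"])
```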
    	
TTS/.models.json (ADDED)

@@ -0,0 +1,937 @@
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "tts_models": {
         | 
| 3 | 
            +
                    "multilingual": {
         | 
| 4 | 
            +
                        "multi-dataset": {
         | 
| 5 | 
            +
                            "xtts_v2": {
         | 
| 6 | 
            +
                                "description": "XTTS-v2.0.2 by Coqui with 16 languages.",
         | 
| 7 | 
            +
                                "hf_url": [
         | 
| 8 | 
            +
                                    "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
         | 
| 9 | 
            +
                                    "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
         | 
| 10 | 
            +
                                    "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
         | 
| 11 | 
            +
                                    "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5"
         | 
| 12 | 
            +
                                ],
         | 
| 13 | 
            +
                                "model_hash": "5ce0502bfe3bc88dc8d9312b12a7558c",
         | 
| 14 | 
            +
                                "default_vocoder": null,
         | 
| 15 | 
            +
                                "commit": "480a6cdf7",
         | 
| 16 | 
            +
                                "license": "CPML",
         | 
| 17 | 
            +
                                "contact": "[email protected]",
         | 
| 18 | 
            +
                                "tos_required": true
         | 
| 19 | 
            +
                            },
         | 
| 20 | 
            +
                            "xtts_v1.1": {
         | 
| 21 | 
            +
                                "description": "XTTS-v1.1 by Coqui with 14 languages, cross-language voice cloning and reference leak fixed.",
         | 
| 22 | 
            +
                                "hf_url": [
         | 
| 23 | 
            +
                                    "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/model.pth",
         | 
| 24 | 
            +
                                    "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/config.json",
         | 
| 25 | 
            +
                                    "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/vocab.json",
         | 
| 26 | 
            +
                                    "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/hash.md5"
         | 
| 27 | 
            +
                                ],
         | 
| 28 | 
            +
                                "model_hash": "7c62beaf58d39b729de287330dc254e7b515677416839b649a50e7cf74c3df59",
         | 
| 29 | 
            +
                                "default_vocoder": null,
         | 
| 30 | 
            +
                                "commit": "82910a63",
         | 
| 31 | 
            +
                                "license": "CPML",
         | 
| 32 | 
            +
                                "contact": "[email protected]",
         | 
| 33 | 
            +
                                "tos_required": true
         | 
| 34 | 
            +
                            },
         | 
| 35 | 
            +
                            "your_tts": {
         | 
| 36 | 
            +
                                "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
         | 
| 37 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
         | 
| 38 | 
            +
                                "default_vocoder": null,
         | 
| 39 | 
            +
                                "commit": "e9a1953e",
         | 
| 40 | 
            +
                                "license": "CC BY-NC-ND 4.0",
         | 
| 41 | 
            +
                                "contact": "[email protected]"
         | 
| 42 | 
            +
                            },
         | 
| 43 | 
            +
                            "bark": {
         | 
| 44 | 
            +
                                "description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.",
         | 
| 45 | 
            +
                                "hf_url": [
         | 
| 46 | 
            +
                                    "https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt",
         | 
| 47 | 
            +
                                    "https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt",
         | 
| 48 | 
            +
                                    "https://app.coqui.ai/tts_model/text_2.pt",
         | 
| 49 | 
            +
                                    "https://coqui.gateway.scarf.sh/hf/bark/config.json",
         | 
| 50 | 
            +
                                    "https://coqui.gateway.scarf.sh/hf/bark/hubert.pt",
         | 
| 51 | 
            +
                                    "https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth"
         | 
| 52 | 
            +
                                ],
         | 
| 53 | 
            +
                                "default_vocoder": null,
         | 
| 54 | 
            +
                                "commit": "e9a1953e",
         | 
| 55 | 
            +
                                "license": "MIT",
         | 
| 56 | 
            +
                                "contact": "https://www.suno.ai/"
         | 
| 57 | 
            +
                            }
         | 
| 58 | 
            +
                        }
         | 
| 59 | 
            +
                    },
         | 
| 60 | 
            +
                    "bg": {
         | 
| 61 | 
            +
                        "cv": {
         | 
| 62 | 
            +
                            "vits": {
         | 
| 63 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
         | 
| 64 | 
            +
                                "default_vocoder": null,
         | 
| 65 | 
            +
                                "commit": null,
         | 
| 66 | 
            +
                                "author": "@NeonGeckoCom",
         | 
| 67 | 
            +
                                "license": "bsd-3-clause"
         | 
| 68 | 
            +
                            }
         | 
| 69 | 
            +
                        }
         | 
| 70 | 
            +
                    },
         | 
| 71 | 
            +
                    "cs": {
         | 
| 72 | 
            +
                        "cv": {
         | 
| 73 | 
            +
                            "vits": {
         | 
| 74 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
         | 
| 75 | 
            +
                                "default_vocoder": null,
         | 
| 76 | 
            +
                                "commit": null,
         | 
| 77 | 
            +
                                "author": "@NeonGeckoCom",
         | 
| 78 | 
            +
                                "license": "bsd-3-clause"
         | 
| 79 | 
            +
                            }
         | 
| 80 | 
            +
                        }
         | 
| 81 | 
            +
                    },
         | 
| 82 | 
            +
                    "da": {
         | 
| 83 | 
            +
                        "cv": {
         | 
| 84 | 
            +
                            "vits": {
         | 
| 85 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
         | 
| 86 | 
            +
                                "default_vocoder": null,
         | 
| 87 | 
            +
                                "commit": null,
         | 
| 88 | 
            +
                                "author": "@NeonGeckoCom",
         | 
| 89 | 
            +
                                "license": "bsd-3-clause"
         | 
| 90 | 
            +
                            }
         | 
| 91 | 
            +
                        }
         | 
| 92 | 
            +
                    },
         | 
| 93 | 
            +
                    "et": {
         | 
| 94 | 
            +
                        "cv": {
         | 
| 95 | 
            +
                            "vits": {
         | 
| 96 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
         | 
| 97 | 
            +
                                "default_vocoder": null,
         | 
| 98 | 
            +
                                "commit": null,
         | 
| 99 | 
            +
                                "author": "@NeonGeckoCom",
         | 
| 100 | 
            +
                                "license": "bsd-3-clause"
         | 
| 101 | 
            +
                            }
         | 
| 102 | 
            +
                        }
         | 
| 103 | 
            +
                    },
         | 
| 104 | 
            +
                    "ga": {
         | 
| 105 | 
            +
                        "cv": {
         | 
| 106 | 
            +
                            "vits": {
         | 
| 107 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
         | 
| 108 | 
            +
                                "default_vocoder": null,
         | 
| 109 | 
            +
                                "commit": null,
         | 
| 110 | 
            +
                                "author": "@NeonGeckoCom",
         | 
| 111 | 
            +
                                "license": "bsd-3-clause"
         | 
| 112 | 
            +
                            }
         | 
| 113 | 
            +
                        }
         | 
| 114 | 
            +
                    },
         | 
| 115 | 
            +
                    "en": {
         | 
| 116 | 
            +
                        "ek1": {
         | 
| 117 | 
            +
                            "tacotron2": {
         | 
| 118 | 
            +
                                "description": "EK1 en-rp tacotron2 by NMStoker",
         | 
| 119 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip",
         | 
| 120 | 
            +
                                "default_vocoder": "vocoder_models/en/ek1/wavegrad",
         | 
| 121 | 
            +
                                "commit": "c802255",
         | 
| 122 | 
            +
                                "license": "apache 2.0"
         | 
| 123 | 
            +
                            }
         | 
| 124 | 
            +
                        },
         | 
| 125 | 
            +
                        "ljspeech": {
         | 
| 126 | 
            +
                            "tacotron2-DDC": {
         | 
| 127 | 
            +
                                "description": "Tacotron2 with Double Decoder Consistency.",
         | 
| 128 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip",
         | 
| 129 | 
            +
                                "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
         | 
| 130 | 
            +
                                "commit": "bae2ad0f",
         | 
| 131 | 
            +
                                "author": "Eren Gölge @erogol",
         | 
| 132 | 
            +
                                "license": "apache 2.0",
         | 
| 133 | 
            +
                                "contact": "[email protected]"
         | 
| 134 | 
            +
                            },
         | 
| 135 | 
            +
                            "tacotron2-DDC_ph": {
         | 
| 136 | 
            +
                                "description": "Tacotron2 with Double Decoder Consistency with phonemes.",
         | 
| 137 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
         | 
| 138 | 
            +
                                "default_vocoder": "vocoder_models/en/ljspeech/univnet",
         | 
| 139 | 
            +
                                "commit": "3900448",
         | 
| 140 | 
            +
                                "author": "Eren Gölge @erogol",
         | 
| 141 | 
            +
                                "license": "apache 2.0",
         | 
| 142 | 
            +
                                "contact": "[email protected]"
         | 
| 143 | 
            +
                            },
         | 
| 144 | 
            +
                            "glow-tts": {
         | 
| 145 | 
            +
                                "description": "",
         | 
| 146 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip",
         | 
| 147 | 
            +
                                "stats_file": null,
         | 
| 148 | 
            +
                                "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
         | 
| 149 | 
            +
                                "commit": "",
         | 
| 150 | 
            +
                                "author": "Eren Gölge @erogol",
         | 
| 151 | 
            +
                                "license": "MPL",
         | 
| 152 | 
            +
                                "contact": "[email protected]"
         | 
| 153 | 
            +
                            },
         | 
| 154 | 
            +
                            "speedy-speech": {
         | 
| 155 | 
            +
                                "description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
         | 
| 156 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip",
         | 
| 157 | 
            +
                                "stats_file": null,
         | 
| 158 | 
            +
                                "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
         | 
| 159 | 
            +
                                "commit": "4581e3d",
         | 
| 160 | 
            +
                                "author": "Eren Gölge @erogol",
         | 
| 161 | 
            +
                                "license": "apache 2.0",
         | 
| 162 | 
            +
                                "contact": "[email protected]"
         | 
| 163 | 
            +
                            },
         | 
| 164 | 
            +
                            "tacotron2-DCA": {
         | 
| 165 | 
            +
                                "description": "",
         | 
| 166 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip",
         | 
| 167 | 
            +
                                "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
         | 
| 168 | 
            +
                                "commit": "",
         | 
| 169 | 
            +
                                "author": "Eren Gölge @erogol",
         | 
| 170 | 
            +
                                "license": "MPL",
         | 
| 171 | 
            +
                                "contact": "[email protected]"
         | 
| 172 | 
            +
                            },
         | 
| 173 | 
            +
                            "vits": {
         | 
| 174 | 
            +
                                "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
         | 
| 175 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--vits.zip",
         | 
| 176 | 
            +
                                "default_vocoder": null,
         | 
| 177 | 
            +
                                "commit": "3900448",
         | 
| 178 | 
            +
                                "author": "Eren Gölge @erogol",
         | 
| 179 | 
            +
                                "license": "apache 2.0",
         | 
| 180 | 
            +
                                "contact": "[email protected]"
         | 
| 181 | 
            +
                            },
         | 
| 182 | 
            +
                            "vits--neon": {
         | 
| 183 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--en--ljspeech--vits.zip",
         | 
| 184 | 
            +
                                "default_vocoder": null,
         | 
| 185 | 
            +
                                "author": "@NeonGeckoCom",
         | 
| 186 | 
            +
                                "license": "bsd-3-clause",
         | 
| 187 | 
            +
                                "contact": null,
         | 
| 188 | 
            +
                                "commit": null
         | 
| 189 | 
            +
                            },
         | 
| 190 | 
            +
                            "fast_pitch": {
         | 
| 191 | 
            +
                                "description": "FastPitch model trained on LJSpeech using the Aligner Network",
         | 
| 192 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
         | 
| 193 | 
            +
                                "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
         | 
| 194 | 
            +
                                "commit": "b27b3ba",
         | 
| 195 | 
            +
                                "author": "Eren Gölge @erogol",
         | 
| 196 | 
            +
                                "license": "apache 2.0",
         | 
| 197 | 
            +
                                "contact": "[email protected]"
         | 
| 198 | 
            +
                            },
         | 
| 199 | 
            +
                            "overflow": {
         | 
| 200 | 
            +
                                "description": "Overflow model trained on LJSpeech",
         | 
| 201 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.0_models/tts_models--en--ljspeech--overflow.zip",
         | 
| 202 | 
            +
                                "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
         | 
| 203 | 
            +
                                "commit": "3b1a28f",
         | 
| 204 | 
            +
                                "author": "Eren Gölge @erogol",
         | 
| 205 | 
            +
                                "license": "apache 2.0",
         | 
| 206 | 
            +
                                "contact": "[email protected]"
         | 
| 207 | 
            +
                            },
         | 
| 208 | 
            +
                            "neural_hmm": {
         | 
| 209 | 
            +
                                "description": "Neural HMM model trained on LJSpeech",
         | 
| 210 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.11.0_models/tts_models--en--ljspeech--neural_hmm.zip",
         | 
| 211 | 
            +
                                "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
         | 
| 212 | 
            +
                                "commit": "3b1a28f",
         | 
| 213 | 
            +
                                "author": "Shivam Metha @shivammehta25",
         | 
| 214 | 
            +
                                "license": "apache 2.0",
         | 
| 215 | 
            +
                                "contact": "d83ee8fe45e3c0d776d4a865aca21d7c2ac324c4"
         | 
| 216 | 
            +
                            }
         | 
| 217 | 
            +
                        },
         | 
| 218 | 
            +
                        "vctk": {
         | 
| 219 | 
            +
                            "vits": {
         | 
| 220 | 
            +
                                "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
         | 
| 221 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--vits.zip",
         | 
| 222 | 
            +
                                "default_vocoder": null,
         | 
| 223 | 
            +
                                "commit": "3900448",
         | 
| 224 | 
            +
                                "author": "Eren @erogol",
         | 
| 225 | 
            +
                                "license": "apache 2.0",
         | 
| 226 | 
            +
                                "contact": "[email protected]"
         | 
| 227 | 
            +
                            },
         | 
| 228 | 
            +
                            "fast_pitch": {
         | 
| 229 | 
            +
                                "description": "FastPitch model trained on VCTK dataseset.",
         | 
| 230 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
         | 
| 231 | 
            +
                                "default_vocoder": null,
         | 
| 232 | 
            +
                                "commit": "bdab788d",
         | 
| 233 | 
            +
                                "author": "Eren @erogol",
         | 
| 234 | 
            +
                                "license": "CC BY-NC-ND 4.0",
         | 
| 235 | 
            +
                                "contact": "[email protected]"
         | 
| 236 | 
            +
                            }
         | 
| 237 | 
            +
                        },
         | 
| 238 | 
            +
                        "sam": {
         | 
| 239 | 
            +
                            "tacotron-DDC": {
         | 
| 240 | 
            +
                                "description": "Tacotron2 with Double Decoder Consistency trained with Aceenture's Sam dataset.",
         | 
| 241 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip",
         | 
| 242 | 
            +
                                "default_vocoder": "vocoder_models/en/sam/hifigan_v2",
         | 
| 243 | 
            +
                                "commit": "bae2ad0f",
         | 
| 244 | 
            +
                                "author": "Eren Gölge @erogol",
         | 
| 245 | 
            +
                                "license": "apache 2.0",
         | 
| 246 | 
            +
                                "contact": "[email protected]"
         | 
| 247 | 
            +
                            }
         | 
| 248 | 
            +
                        },
         | 
| 249 | 
            +
                        "blizzard2013": {
         | 
| 250 | 
            +
                            "capacitron-t2-c50": {
         | 
| 251 | 
            +
                                "description": "Capacitron additions to Tacotron 2 with Capacity at 50 as in https://arxiv.org/pdf/1906.03402.pdf",
         | 
| 252 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c50.zip",
         | 
| 253 | 
            +
                                "commit": "d6284e7",
         | 
| 254 | 
            +
                                "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
         | 
| 255 | 
            +
                                "author": "Adam Froghyar @a-froghyar",
         | 
| 256 | 
            +
                                "license": "apache 2.0",
         | 
| 257 | 
            +
                                "contact": "[email protected]"
         | 
| 258 | 
            +
                            },
         | 
| 259 | 
            +
                            "capacitron-t2-c150_v2": {
         | 
| 260 | 
            +
                                "description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
         | 
| 261 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.1_models/tts_models--en--blizzard2013--capacitron-t2-c150_v2.zip",
         | 
| 262 | 
            +
                                "commit": "a67039d",
         | 
| 263 | 
            +
                                "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
         | 
| 264 | 
            +
                                "author": "Adam Froghyar @a-froghyar",
         | 
| 265 | 
            +
                                "license": "apache 2.0",
         | 
| 266 | 
            +
                                "contact": "[email protected]"
         | 
| 267 | 
            +
                            }
         | 
| 268 | 
            +
                        },
         | 
| 269 | 
            +
                        "multi-dataset": {
         | 
| 270 | 
            +
                            "tortoise-v2": {
         | 
| 271 | 
            +
                                "description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
         | 
| 272 | 
            +
                                "github_rls_url": [
         | 
| 273 | 
            +
                                    "https://app.coqui.ai/tts_model/autoregressive.pth",
         | 
| 274 | 
            +
                                    "https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
         | 
| 275 | 
            +
                                    "https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
         | 
| 276 | 
            +
                                    "https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
         | 
| 277 | 
            +
                                    "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
         | 
| 278 | 
            +
                                    "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
         | 
| 279 | 
            +
                                    "https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
         | 
| 280 | 
            +
                                    "https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
         | 
| 281 | 
            +
                                    "https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
         | 
| 282 | 
            +
                                ],
         | 
| 283 | 
            +
                                "commit": "c1875f6",
         | 
| 284 | 
            +
                                "default_vocoder": null,
         | 
| 285 | 
            +
                                "author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi",
         | 
| 286 | 
            +
                                "license": "apache 2.0"
         | 
| 287 | 
            +
                            }
         | 
| 288 | 
            +
                        },
         | 
| 289 | 
            +
                        "jenny": {
         | 
| 290 | 
            +
                            "jenny": {
         | 
| 291 | 
            +
                                "description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits",
         | 
| 292 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.14.0_models/tts_models--en--jenny--jenny.zip",
         | 
| 293 | 
            +
                                "default_vocoder": null,
         | 
| 294 | 
            +
                                "commit": "ba40a1c",
         | 
| 295 | 
            +
                                "license": "custom - see https://github.com/dioco-group/jenny-tts-dataset#important",
         | 
| 296 | 
            +
                                "author": "@noml4u"
         | 
| 297 | 
            +
                            }
         | 
| 298 | 
            +
                        }
         | 
| 299 | 
            +
                    },
         | 
| 300 | 
            +
                    "es": {
         | 
| 301 | 
            +
                        "mai": {
         | 
| 302 | 
            +
                            "tacotron2-DDC": {
         | 
| 303 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip",
         | 
| 304 | 
            +
                                "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
         | 
| 305 | 
            +
                                "commit": "",
         | 
| 306 | 
            +
                                "author": "Eren Gölge @erogol",
         | 
| 307 | 
            +
                                "license": "MPL",
         | 
| 308 | 
            +
                                "contact": "[email protected]"
         | 
| 309 | 
            +
                            }
         | 
| 310 | 
            +
                        },
         | 
| 311 | 
            +
                        "css10": {
         | 
| 312 | 
            +
                            "vits": {
         | 
| 313 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
         | 
| 314 | 
            +
                                "default_vocoder": null,
         | 
| 315 | 
            +
                                "commit": null,
         | 
| 316 | 
            +
                                "author": "@NeonGeckoCom",
         | 
| 317 | 
            +
                                "license": "bsd-3-clause"
         | 
| 318 | 
            +
                            }
         | 
| 319 | 
            +
                        }
         | 
| 320 | 
            +
                    },
         | 
| 321 | 
            +
                    "fr": {
         | 
| 322 | 
            +
                        "mai": {
         | 
| 323 | 
            +
                            "tacotron2-DDC": {
         | 
| 324 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
         | 
| 325 | 
            +
                                "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
         | 
| 326 | 
            +
                                "commit": null,
         | 
| 327 | 
            +
                                "author": "Eren Gölge @erogol",
         | 
| 328 | 
            +
                                "license": "MPL",
         | 
| 329 | 
            +
                                "contact": "[email protected]"
         | 
| 330 | 
            +
                            }
         | 
| 331 | 
            +
                        },
         | 
| 332 | 
            +
                        "css10": {
         | 
| 333 | 
            +
                            "vits": {
         | 
| 334 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
         | 
| 335 | 
            +
                                "default_vocoder": null,
         | 
| 336 | 
            +
                                "commit": null,
         | 
| 337 | 
            +
                                "author": "@NeonGeckoCom",
         | 
| 338 | 
            +
                                "license": "bsd-3-clause"
         | 
| 339 | 
            +
                            }
         | 
| 340 | 
            +
                        }
         | 
| 341 | 
            +
                    },
         | 
| 342 | 
            +
                    "uk": {
         | 
| 343 | 
            +
                        "mai": {
         | 
| 344 | 
            +
                            "glow-tts": {
         | 
| 345 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
         | 
| 346 | 
            +
                                "author": "@robinhad",
         | 
| 347 | 
            +
                                "commit": "bdab788d",
         | 
| 348 | 
            +
                                "license": "MIT",
         | 
| 349 | 
            +
                                "contact": "",
         | 
| 350 | 
            +
                                "default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
         | 
| 351 | 
            +
                            },
         | 
| 352 | 
            +
                            "vits": {
         | 
| 353 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
         | 
| 354 | 
            +
                                "default_vocoder": null,
         | 
| 355 | 
            +
                                "commit": null,
         | 
| 356 | 
            +
                                "author": "@NeonGeckoCom",
         | 
| 357 | 
            +
                                "license": "bsd-3-clause"
         | 
| 358 | 
            +
                            }
         | 
| 359 | 
            +
                        }
         | 
| 360 | 
            +
                    },
         | 
| 361 | 
            +
                    "zh-CN": {
         | 
| 362 | 
            +
                        "baker": {
         | 
| 363 | 
            +
                            "tacotron2-DDC-GST": {
         | 
| 364 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
         | 
| 365 | 
            +
                                "commit": "unknown",
         | 
| 366 | 
            +
                                "author": "@kirianguiller",
         | 
| 367 | 
            +
                                "license": "apache 2.0",
         | 
| 368 | 
            +
                                "default_vocoder": null
         | 
| 369 | 
            +
                            }
         | 
| 370 | 
            +
                        }
         | 
| 371 | 
            +
                    },
         | 
| 372 | 
            +
                    "nl": {
         | 
| 373 | 
            +
                        "mai": {
         | 
| 374 | 
            +
                            "tacotron2-DDC": {
         | 
| 375 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip",
         | 
| 376 | 
            +
                                "author": "@r-dh",
         | 
| 377 | 
            +
                                "license": "apache 2.0",
         | 
| 378 | 
            +
                                "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan",
         | 
| 379 | 
            +
                                "stats_file": null,
         | 
| 380 | 
            +
                                "commit": "540d811"
         | 
| 381 | 
            +
                            }
         | 
| 382 | 
            +
                        },
         | 
| 383 | 
            +
                        "css10": {
         | 
| 384 | 
            +
                            "vits": {
         | 
| 385 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
         | 
| 386 | 
            +
                                "default_vocoder": null,
         | 
| 387 | 
            +
                                "commit": null,
         | 
| 388 | 
            +
                                "author": "@NeonGeckoCom",
         | 
| 389 | 
            +
                                "license": "bsd-3-clause"
         | 
| 390 | 
            +
                            }
         | 
| 391 | 
            +
                        }
         | 
| 392 | 
            +
                    },
         | 
| 393 | 
            +
                    "de": {
         | 
| 394 | 
            +
                        "thorsten": {
         | 
| 395 | 
            +
                            "tacotron2-DCA": {
         | 
| 396 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip",
         | 
| 397 | 
            +
                                "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
         | 
| 398 | 
            +
                                "author": "@thorstenMueller",
         | 
| 399 | 
            +
                                "license": "apache 2.0",
         | 
| 400 | 
            +
                                "commit": "unknown"
         | 
| 401 | 
            +
                            },
         | 
| 402 | 
            +
                            "vits": {
         | 
| 403 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--de--thorsten--vits.zip",
         | 
| 404 | 
            +
                                "default_vocoder": null,
         | 
| 405 | 
            +
                                "author": "@thorstenMueller",
         | 
| 406 | 
            +
                                "license": "apache 2.0",
         | 
| 407 | 
            +
                                "commit": "unknown"
         | 
| 408 | 
            +
                            },
         | 
| 409 | 
            +
                            "tacotron2-DDC": {
         | 
| 410 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--thorsten--tacotron2-DDC.zip",
         | 
| 411 | 
            +
                                "default_vocoder": "vocoder_models/de/thorsten/hifigan_v1",
         | 
| 412 | 
            +
                                "description": "Thorsten-Dec2021-22k-DDC",
         | 
| 413 | 
            +
                                "author": "@thorstenMueller",
         | 
| 414 | 
            +
                                "license": "apache 2.0",
         | 
| 415 | 
            +
                                "commit": "unknown"
         | 
| 416 | 
            +
                            }
         | 
| 417 | 
            +
                        },
         | 
| 418 | 
            +
                        "css10": {
         | 
| 419 | 
            +
                            "vits-neon": {
         | 
| 420 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
         | 
| 421 | 
            +
                                "default_vocoder": null,
         | 
| 422 | 
            +
                                "author": "@NeonGeckoCom",
         | 
| 423 | 
            +
                                "license": "bsd-3-clause",
         | 
| 424 | 
            +
                                "commit": null
         | 
| 425 | 
            +
                            }
         | 
| 426 | 
            +
                        }
         | 
| 427 | 
            +
                    },
         | 
| 428 | 
            +
                    "ja": {
         | 
| 429 | 
            +
                        "kokoro": {
         | 
| 430 | 
            +
                            "tacotron2-DDC": {
         | 
| 431 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip",
         | 
| 432 | 
            +
                                "default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1",
         | 
| 433 | 
            +
                                "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
         | 
| 434 | 
            +
                                "author": "@kaiidams",
         | 
| 435 | 
            +
                                "license": "apache 2.0",
         | 
| 436 | 
            +
                                "commit": "401fbd89"
         | 
| 437 | 
            +
                            }
         | 
| 438 | 
            +
                        }
         | 
| 439 | 
            +
                    },
         | 
| 440 | 
            +
                    "tr": {
         | 
| 441 | 
            +
                        "common-voice": {
         | 
| 442 | 
            +
                            "glow-tts": {
         | 
| 443 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
         | 
| 444 | 
            +
                                "default_vocoder": "vocoder_models/tr/common-voice/hifigan",
         | 
| 445 | 
            +
                                "license": "MIT",
         | 
| 446 | 
            +
                                "description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.",
         | 
| 447 | 
            +
                                "author": "Fatih Akademi",
         | 
| 448 | 
            +
                                "commit": null
         | 
| 449 | 
            +
                            }
         | 
| 450 | 
            +
                        }
         | 
| 451 | 
            +
                    },
         | 
| 452 | 
            +
                    "it": {
         | 
| 453 | 
            +
                        "mai_female": {
         | 
| 454 | 
            +
                            "glow-tts": {
         | 
| 455 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
         | 
| 456 | 
            +
                                "default_vocoder": null,
         | 
| 457 | 
            +
                                "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
         | 
| 458 | 
            +
                                "author": "@nicolalandro",
         | 
| 459 | 
            +
                                "license": "apache 2.0",
         | 
| 460 | 
            +
                                "commit": null
         | 
| 461 | 
            +
                            },
         | 
| 462 | 
            +
                            "vits": {
         | 
| 463 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
         | 
| 464 | 
            +
                                "default_vocoder": null,
         | 
| 465 | 
            +
                                "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
         | 
| 466 | 
            +
                                "author": "@nicolalandro",
         | 
| 467 | 
            +
                                "license": "apache 2.0",
         | 
| 468 | 
            +
                                "commit": null
         | 
| 469 | 
            +
                            }
         | 
| 470 | 
            +
                        },
         | 
| 471 | 
            +
                        "mai_male": {
         | 
| 472 | 
            +
                            "glow-tts": {
         | 
| 473 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
         | 
| 474 | 
            +
                                "default_vocoder": null,
         | 
| 475 | 
            +
                                "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
         | 
| 476 | 
            +
                                "author": "@nicolalandro",
         | 
| 477 | 
            +
                                "license": "apache 2.0",
         | 
| 478 | 
            +
                                "commit": null
         | 
| 479 | 
            +
                            },
         | 
| 480 | 
            +
                            "vits": {
         | 
| 481 | 
            +
                                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
         | 
| 482 | 
            +
                                "default_vocoder": null,
         | 
| 483 | 
            +
                                "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
         | 
+                    "author": "@nicolalandro",
+                    "license": "apache 2.0",
+                    "commit": null
+                }
+            }
+        },
+        "ewe": {
+            "openbible": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
+                    "default_vocoder": null,
+                    "license": "CC-BY-SA 4.0",
+                    "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
+                    "author": "@coqui_ai",
+                    "commit": "1b22f03"
+                }
+            }
+        },
+        "hau": {
+            "openbible": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip",
+                    "default_vocoder": null,
+                    "license": "CC-BY-SA 4.0",
+                    "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
+                    "author": "@coqui_ai",
+                    "commit": "1b22f03"
+                }
+            }
+        },
+        "lin": {
+            "openbible": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip",
+                    "default_vocoder": null,
+                    "license": "CC-BY-SA 4.0",
+                    "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
+                    "author": "@coqui_ai",
+                    "commit": "1b22f03"
+                }
+            }
+        },
+        "tw_akuapem": {
+            "openbible": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
+                    "default_vocoder": null,
+                    "license": "CC-BY-SA 4.0",
+                    "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
+                    "author": "@coqui_ai",
+                    "commit": "1b22f03"
+                }
+            }
+        },
+        "tw_asante": {
+            "openbible": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
+                    "default_vocoder": null,
+                    "license": "CC-BY-SA 4.0",
+                    "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
+                    "author": "@coqui_ai",
+                    "commit": "1b22f03"
+                }
+            }
+        },
+        "yor": {
+            "openbible": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip",
+                    "default_vocoder": null,
+                    "license": "CC-BY-SA 4.0",
+                    "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
+                    "author": "@coqui_ai",
+                    "commit": "1b22f03"
+                }
+            }
+        },
+        "hu": {
+            "css10": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hu--css10--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "el": {
+            "cv": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--el--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "fi": {
+            "css10": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "hr": {
+            "cv": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "lt": {
+            "cv": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "lv": {
+            "cv": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "mt": {
+            "cv": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "pl": {
+            "mai_female": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "pt": {
+            "cv": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "ro": {
+            "cv": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "sk": {
+            "cv": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "sl": {
+            "cv": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "sv": {
+            "cv": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "ca": {
+            "custom": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--ca--custom--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "description": " It is trained from zero with 101460 utterances consisting of 257 speakers, approx 138 hours of speech. We used three datasets;\nFestcat and Google Catalan TTS (both TTS datasets) and also a part of Common Voice 8. It is trained with TTS v0.8.0.\nhttps://github.com/coqui-ai/TTS/discussions/930#discussioncomment-4466345",
+                    "author": "@gullabi",
+                    "license": "CC-BY-4.0"
+                }
+            }
+        },
+        "fa": {
+            "custom": {
+                "glow-tts": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--fa--custom--glow-tts.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "description": "persian-tts-female-glow_tts model for text to speech purposes. Single-speaker female voice Trained on persian-tts-dataset-famale. \nThis model has no compatible vocoder thus the output quality is not very good. \nDataset: https://www.kaggle.com/datasets/magnoliasis/persian-tts-dataset-famale.",
+                    "author": "@karim23657",
+                    "license": "CC-BY-4.0"
+                }
+            }
+        },
+        "bn": {
+            "custom": {
+                "vits-male": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_male.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "description": "Single speaker Bangla male model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
+                    "author": "@mobassir94",
+                    "license": "Apache 2.0"
+                },
+                "vits-female": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "description": "Single speaker Bangla female model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
+                    "author": "@mobassir94",
+                    "license": "Apache 2.0"
+                }
+            }
+        },
+        "be": {
+            "common-voice": {
+                "glow-tts": {
+                    "description": "Belarusian GlowTTS model created by @alex73 (Github).",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.16.6/tts_models--be--common-voice--glow-tts.zip",
+                    "default_vocoder": "vocoder_models/be/common-voice/hifigan",
+                    "commit": "c0aabb85",
+                    "license": "CC-BY-SA 4.0",
+                    "contact": "[email protected]"
+                }
+            }
+        }
+    },
+    "vocoder_models": {
+        "universal": {
+            "libri-tts": {
+                "wavegrad": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip",
+                    "commit": "ea976b0",
+                    "author": "Eren Gölge @erogol",
+                    "license": "MPL",
+                    "contact": "[email protected]"
+                },
+                "fullband-melgan": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip",
+                    "commit": "4132240",
+                    "author": "Eren Gölge @erogol",
+                    "license": "MPL",
+                    "contact": "[email protected]"
+                }
+            }
+        },
+        "en": {
+            "ek1": {
+                "wavegrad": {
+                    "description": "EK1 en-rp wavegrad by NMStoker",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip",
+                    "commit": "c802255",
+                    "license": "apache 2.0"
+                }
+            },
+            "ljspeech": {
+                "multiband-melgan": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
+                    "commit": "ea976b0",
+                    "author": "Eren Gölge @erogol",
+                    "license": "MPL",
+                    "contact": "[email protected]"
+                },
+                "hifigan_v2": {
+                    "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip",
+                    "commit": "bae2ad0f",
+                    "author": "@erogol",
+                    "license": "apache 2.0",
+                    "contact": "[email protected]"
+                },
+                "univnet": {
+                    "description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip",
+                    "commit": "4581e3d",
+                    "author": "Eren @erogol",
+                    "license": "apache 2.0",
+                    "contact": "[email protected]"
+                }
+            },
+            "blizzard2013": {
+                "hifigan_v2": {
+                    "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip",
+                    "commit": "d6284e7",
+                    "author": "Adam Froghyar @a-froghyar",
+                    "license": "apache 2.0",
+                    "contact": "[email protected]"
+                }
+            },
+            "vctk": {
+                "hifigan_v2": {
+                    "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip",
+                    "commit": "2f07160",
+                    "author": "Edresson Casanova",
+                    "license": "apache 2.0",
+                    "contact": ""
+                }
+            },
+            "sam": {
+                "hifigan_v2": {
+                    "description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip",
+                    "commit": "2f07160",
+                    "author": "Eren Gölge @erogol",
+                    "license": "apache 2.0",
+                    "contact": "[email protected]"
+                }
+            }
+        },
+        "nl": {
+            "mai": {
+                "parallel-wavegan": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip",
+                    "author": "@r-dh",
+                    "license": "apache 2.0",
+                    "commit": "unknown"
+                }
+            }
+        },
+        "de": {
+            "thorsten": {
+                "wavegrad": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip",
+                    "author": "@thorstenMueller",
+                    "license": "apache 2.0",
+                    "commit": "unknown"
+                },
+                "fullband-melgan": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip",
+                    "author": "@thorstenMueller",
+                    "license": "apache 2.0",
+                    "commit": "unknown"
+                },
+                "hifigan_v1": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/vocoder_models--de--thorsten--hifigan_v1.zip",
+                    "description": "HifiGAN vocoder model for Thorsten Neutral Dec2021 22k Samplerate Tacotron2 DDC model",
+                    "author": "@thorstenMueller",
+                    "license": "apache 2.0",
+                    "commit": "unknown"
+                }
+            }
+        },
+        "ja": {
+            "kokoro": {
+                "hifigan_v1": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip",
+                    "description": "HifiGAN model trained for kokoro dataset by @kaiidams",
+                    "author": "@kaiidams",
+                    "license": "apache 2.0",
+                    "commit": "3900448"
+                }
+            }
+        },
+        "uk": {
+            "mai": {
+                "multiband-melgan": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
+                    "author": "@robinhad",
+                    "commit": "bdab788d",
+                    "license": "MIT",
+                    "contact": ""
+                }
+            }
+        },
+        "tr": {
+            "common-voice": {
+                "hifigan": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
+                    "description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
+                    "author": "Fatih Akademi",
+                    "license": "MIT",
+                    "commit": null
+                }
+            }
+        },
+        "be": {
+            "common-voice": {
+                "hifigan": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.16.6/vocoder_models--be--common-voice--hifigan.zip",
+                    "description": "Belarusian HiFiGAN model created by @alex73 (Github).",
+                    "author": "@alex73",
+                    "license": "CC-BY-SA 4.0",
+                    "commit": "c0aabb85"
+                }
+            }
+        }
+    },
+    "voice_conversion_models": {
+        "multilingual": {
+            "vctk": {
+                "freevc24": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
+                    "description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
+                    "author": "Jing-Yi Li @OlaWod",
+                    "license": "MIT",
+                    "commit": null
+                }
+            }
+        }
+    }
+}
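
For orientation: the registry above is keyed as model type → language → dataset → model name, and every leaf entry carries a release URL plus license/author metadata. The actual lookup is handled by `TTS.utils.manage.ModelManager`, which `TTS/api.py` below instantiates with this file. A minimal sketch of how a slash-separated model name resolves against the registry (the `resolve_model_url` helper is hypothetical, written only to illustrate the layout):

    import json

    def resolve_model_url(models_file: str, model_name: str) -> str:
        """Illustrative only: map 'type/language/dataset/model' to its release URL."""
        model_type, lang, dataset, model = model_name.split("/")
        with open(models_file, "r", encoding="utf-8") as f:
            registry = json.load(f)
        entry = registry[model_type][lang][dataset][model]  # e.g. the "vits" leaf above
        return entry["github_rls_url"]

    # e.g. resolves to .../v0.6.2_models/tts_models--ewe--openbible--vits.zip
    print(resolve_model_url("TTS/.models.json", "tts_models/ewe/openbible/vits"))
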
    	
TTS/VERSION ADDED
@@ -0,0 +1 @@
+0.20.6
    	
TTS/__init__.py ADDED
@@ -0,0 +1,6 @@
+import os
+
+with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f:
+    version = f.read().strip()
+
+__version__ = version
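
The two files above implement the usual single-source-of-truth versioning pattern: the version string lives only in the plain-text `TTS/VERSION` file and is read into `TTS.__version__` at import time, so packaging metadata and runtime checks cannot drift apart. A quick sanity check, assuming the package is installed or on `sys.path`:

    import TTS

    print(TTS.__version__)  # -> "0.20.6", read from TTS/VERSION when the package is imported
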
    	
TTS/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (358 Bytes)

TTS/__pycache__/api.cpython-39.pyc ADDED
Binary file (18.5 kB)

TTS/__pycache__/cs_api.cpython-39.pyc ADDED
Binary file (12 kB)

TTS/__pycache__/model.cpython-39.pyc ADDED
Binary file (2.58 kB)
TTS/api.py ADDED
@@ -0,0 +1,489 @@
+import tempfile
+import warnings
+from pathlib import Path
+from typing import Union
+
+import numpy as np
+from torch import nn
+
+from TTS.cs_api import CS_API
+from TTS.utils.audio.numpy_transforms import save_wav
+from TTS.utils.manage import ModelManager
+from TTS.utils.synthesizer import Synthesizer
+
+
+class TTS(nn.Module):
+    """TODO: Add voice conversion and Capacitron support."""
+
+    def __init__(
+        self,
+        model_name: str = "",
+        model_path: str = None,
+        config_path: str = None,
+        vocoder_path: str = None,
+        vocoder_config_path: str = None,
+        progress_bar: bool = True,
+        cs_api_model: str = "XTTS",
+        gpu=False,
+    ):
+        """🐸TTS python interface that allows you to load and use the released models.
+
+        Example with a multi-speaker model:
+            >>> from TTS.api import TTS
+            >>> tts = TTS(TTS.list_models()[0])
+            >>> wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
+            >>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
+
+        Example with a single-speaker model:
+            >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
+            >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
+
+        Example loading a model from a path:
+            >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False)
+            >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
+
+        Example voice cloning with YourTTS in English, French and Portuguese:
+            >>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
+            >>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav")
+            >>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav")
+            >>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav")
+
+        Example Fairseq TTS models (uses ISO language codes in https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html):
+            >>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False, gpu=True)
+            >>> tts.tts_to_file("This is a test.", file_path="output.wav")
+
+        Args:
+            model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None.
+            model_path (str, optional): Path to the model checkpoint. Defaults to None.
+            config_path (str, optional): Path to the model config. Defaults to None.
+            vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
+            vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
+            progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
+            cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
+                "XTTS", "V1". You can also use `TTS.cs_api.CS_API` for more control.
| 64 | 
            +
                            Defaults to "XTTS".
         | 
| 65 | 
            +
                        gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
         | 
| 66 | 
            +
                    """
         | 
| 67 | 
            +
                    super().__init__()
         | 
| 68 | 
            +
                    self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)
         | 
| 69 | 
            +
             | 
| 70 | 
            +
                    self.synthesizer = None
         | 
| 71 | 
            +
                    self.voice_converter = None
         | 
| 72 | 
            +
                    self.csapi = None
         | 
| 73 | 
            +
                    self.cs_api_model = cs_api_model
         | 
| 74 | 
            +
                    self.model_name = ""
         | 
| 75 | 
            +
             | 
| 76 | 
            +
                    if gpu:
         | 
| 77 | 
            +
                        warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")
         | 
| 78 | 
            +
             | 
| 79 | 
            +
                    if model_name is not None:
         | 
| 80 | 
            +
                        if "tts_models" in model_name or "coqui_studio" in model_name:
         | 
| 81 | 
            +
                            self.load_tts_model_by_name(model_name, gpu)
         | 
| 82 | 
            +
                        elif "voice_conversion_models" in model_name:
         | 
| 83 | 
            +
                            self.load_vc_model_by_name(model_name, gpu)
         | 
| 84 | 
            +
             | 
| 85 | 
            +
                    if model_path:
         | 
| 86 | 
            +
                        self.load_tts_model_by_path(
         | 
| 87 | 
            +
                            model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu
         | 
| 88 | 
            +
                        )
         | 
| 89 | 
            +
             | 
| 90 | 
            +
                @property
         | 
| 91 | 
            +
                def models(self):
         | 
| 92 | 
            +
                    return self.manager.list_tts_models()
         | 
| 93 | 
            +
             | 
| 94 | 
            +
                @property
         | 
| 95 | 
            +
                def is_multi_speaker(self):
         | 
| 96 | 
            +
        if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager:
            return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
        return False

    @property
    def is_coqui_studio(self):
        if self.model_name is None:
            return False
        return "coqui_studio" in self.model_name

    @property
    def is_multi_lingual(self):
        # Not sure what sets this to None, but applied a fix to prevent crashing.
        if isinstance(self.model_name, str) and "xtts" in self.model_name:
            return True
        if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
            return self.synthesizer.tts_model.language_manager.num_languages > 1
        return False

    @property
    def speakers(self):
        if not self.is_multi_speaker:
            return None
        return self.synthesizer.tts_model.speaker_manager.speaker_names

    @property
    def languages(self):
        if not self.is_multi_lingual:
            return None
        return self.synthesizer.tts_model.language_manager.language_names

    @staticmethod
    def get_models_file_path():
        return Path(__file__).parent / ".models.json"

    def list_models(self):
        try:
            csapi = CS_API(model=self.cs_api_model)
            models = csapi.list_speakers_as_tts_models()
        except ValueError as e:
            print(e)
            models = []
        manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
        return manager.list_tts_models() + models
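These properties are what the argument checks further down rely on, so the quickest way to see what a loaded model accepts is to query them directly. A minimal sketch, assuming the example model name resolves to a multi-speaker VITS model:

    from TTS.api import TTS

    tts = TTS(model_name="tts_models/en/vctk/vits")  # example multi-speaker model
    print(tts.is_multi_speaker, tts.is_multi_lingual)
    if tts.is_multi_speaker:
        print(tts.speakers[:5])  # names come from the model's speaker_manager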
    def download_model_by_name(self, model_name: str):
        model_path, config_path, model_item = self.manager.download_model(model_name)
        if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)):
            # return model directory if there are multiple files
            # we assume that the model knows how to load itself
            return None, None, None, None, model_path
        if model_item.get("default_vocoder") is None:
            return model_path, config_path, None, None, None
        vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
        return model_path, config_path, vocoder_path, vocoder_config_path, None

    def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
        """Load one of the voice conversion models by name.

        Args:
            model_name (str): Model name to load. You can list models by `tts.models`.
            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
        """
        self.model_name = model_name
        model_path, config_path, _, _, _ = self.download_model_by_name(model_name)
        self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu)
    def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
        """Load one of 🐸TTS models by name.

        Args:
            model_name (str): Model name to load. You can list models by `tts.models`.
            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.

        TODO: Add tests
        """
        self.synthesizer = None
        self.csapi = None
        self.model_name = model_name

        if "coqui_studio" in model_name:
            self.csapi = CS_API()
        else:
            model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
                model_name
            )

            # init synthesizer
            # None values are fetched from the model
            self.synthesizer = Synthesizer(
                tts_checkpoint=model_path,
                tts_config_path=config_path,
                tts_speakers_file=None,
                tts_languages_file=None,
                vocoder_checkpoint=vocoder_path,
                vocoder_config=vocoder_config_path,
                encoder_checkpoint=None,
                encoder_config=None,
                model_dir=model_dir,
                use_cuda=gpu,
            )

    def load_tts_model_by_path(
        self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
    ):
        """Load a model from a path.

        Args:
            model_path (str): Path to the model checkpoint.
            config_path (str): Path to the model config.
            vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
            vocoder_config (str, optional): Path to the vocoder config. Defaults to None.
            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
        """

        self.synthesizer = Synthesizer(
            tts_checkpoint=model_path,
            tts_config_path=config_path,
            tts_speakers_file=None,
            tts_languages_file=None,
            vocoder_checkpoint=vocoder_path,
            vocoder_config=vocoder_config,
            encoder_checkpoint=None,
            encoder_config=None,
            use_cuda=gpu,
        )
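Both loaders end up constructing the same `Synthesizer`; they differ only in where the checkpoint and config come from. A hedged sketch (model name and paths are placeholders):

    from TTS.api import TTS

    tts = TTS()
    # by name: the checkpoint (and its default vocoder, if any) is downloaded first
    tts.load_tts_model_by_name("tts_models/en/ljspeech/tacotron2-DDC")
    # by path: reuse files already on disk
    tts.load_tts_model_by_path("/path/to/model.pth", "/path/to/config.json")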
    def _check_arguments(
        self,
        speaker: str = None,
        language: str = None,
        speaker_wav: str = None,
        emotion: str = None,
        speed: float = None,
        **kwargs,
    ) -> None:
        """Check if the arguments are valid for the model."""
        if not self.is_coqui_studio:
            # check for the coqui tts models
            if self.is_multi_speaker and (speaker is None and speaker_wav is None):
                raise ValueError("Model is multi-speaker but no `speaker` is provided.")
            if self.is_multi_lingual and language is None:
                raise ValueError("Model is multi-lingual but no `language` is provided.")
            if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
                raise ValueError("Model is not multi-speaker but `speaker` is provided.")
            if not self.is_multi_lingual and language is not None:
                raise ValueError("Model is not multi-lingual but `language` is provided.")
            if emotion is not None and speed is not None:
                raise ValueError("Emotion and speed can only be used with Coqui Studio models.")
        else:
            if emotion is None:
                emotion = "Neutral"
            if speed is None:
                speed = 1.0
            # check for the studio models
            if speaker_wav is not None:
                raise ValueError("Coqui Studio models do not support `speaker_wav` argument.")
            if speaker is not None:
                raise ValueError("Coqui Studio models do not support `speaker` argument.")
            if language is not None and language != "en":
                raise ValueError("Coqui Studio models currently support only `language=en` argument.")
            if emotion not in ["Neutral", "Happy", "Sad", "Angry", "Dull"]:
                raise ValueError(f"Emotion - `{emotion}` - must be one of `Neutral`, `Happy`, `Sad`, `Angry`, `Dull`.")
    def tts_coqui_studio(
        self,
        text: str,
        speaker_name: str = None,
        language: str = None,
        emotion: str = None,
        speed: float = 1.0,
        pipe_out=None,
        file_path: str = None,
    ) -> Union[np.ndarray, str]:
        """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.

        Args:
            text (str):
                Input text to synthesize.
            speaker_name (str, optional):
                Speaker name from Coqui Studio. Defaults to None.
            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
                supported by `XTTS` model.
            emotion (str, optional):
                Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available
                with "V1" model. Defaults to None.
            speed (float, optional):
                Speed of the speech. Defaults to 1.0.
            pipe_out (BytesIO, optional):
                Flag to stdout the generated TTS wav file for shell pipe.
            file_path (str, optional):
                Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.

        Returns:
            Union[np.ndarray, str]: Waveform of the synthesized speech or path to the output file.
        """
        speaker_name = self.model_name.split("/")[2]
        if file_path is not None:
            return self.csapi.tts_to_file(
                text=text,
                speaker_name=speaker_name,
                language=language,
                speed=speed,
                pipe_out=pipe_out,
                emotion=emotion,
                file_path=file_path,
            )[0]
        return self.csapi.tts(text=text, speaker_name=speaker_name, language=language, speed=speed, emotion=emotion)[0]
    def tts(
        self,
        text: str,
        speaker: str = None,
        language: str = None,
        speaker_wav: str = None,
        emotion: str = None,
        speed: float = None,
        **kwargs,
    ):
        """Convert text to speech.

        Args:
            text (str):
                Input text to synthesize.
            speaker (str, optional):
                Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
                `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
                supported by `XTTS` model.
            speaker_wav (str, optional):
                Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                Defaults to None.
            emotion (str, optional):
                Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None.
            speed (float, optional):
                Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
                Defaults to None.
        """
        self._check_arguments(
            speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
        )
        if self.csapi is not None:
            return self.tts_coqui_studio(
                text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed
            )
        wav = self.synthesizer.tts(
            text=text,
            speaker_name=speaker,
            language_name=language,
            speaker_wav=speaker_wav,
            reference_wav=None,
            style_wav=None,
            style_text=None,
            reference_speaker_name=None,
            **kwargs,
        )
        return wav
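Note that `tts` returns the raw waveform instead of writing a file, which is useful for in-memory post-processing. A sketch, assuming a cloning-capable multi-lingual model is already loaded as `tts`:

    wav = tts.tts(
        text="Hello world!",
        speaker_wav="my/cloning/audio.wav",  # single reference sample
        language="en",
    )
    print(len(wav))  # samples at the model's output sample rate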
    def tts_to_file(
        self,
        text: str,
        speaker: str = None,
        language: str = None,
        speaker_wav: str = None,
        emotion: str = None,
        speed: float = 1.0,
        pipe_out=None,
        file_path: str = "output.wav",
        **kwargs,
    ):
        """Convert text to speech.

        Args:
            text (str):
                Input text to synthesize.
            speaker (str, optional):
                Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
                `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
            language (str, optional):
                Language code for multi-lingual models. You can check whether loaded model is multi-lingual by
                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
            speaker_wav (str, optional):
                Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                Defaults to None.
            emotion (str, optional):
                Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None.
            speed (float, optional):
                Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to 1.0.
            pipe_out (BytesIO, optional):
                Flag to stdout the generated TTS wav file for shell pipe.
            file_path (str, optional):
                Output file path. Defaults to "output.wav".
            kwargs (dict, optional):
                Additional arguments for the model.
        """
        self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)

        if self.csapi is not None:
            return self.tts_coqui_studio(
                text=text,
                speaker_name=speaker,
                language=language,
                emotion=emotion,
                speed=speed,
                file_path=file_path,
                pipe_out=pipe_out,
            )
        wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
        self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
        return file_path
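This method is the entry point for the commit's single-sample voice cloning: one reference clip via `speaker_wav`, plus the text to speak. A minimal sketch (the XTTS model name is an example; other cloning-capable models such as YourTTS take the same arguments):

    from TTS.api import TTS

    tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v1", gpu=True)
    tts.tts_to_file(
        text="It took me quite a long time to develop a voice.",
        speaker_wav="speaker_sample.wav",  # one short reference clip
        language="en",
        file_path="cloned_output.wav",
    )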
    def voice_conversion(
        self,
        source_wav: str,
        target_wav: str,
    ):
        """Voice conversion with FreeVC. Convert source wav to target speaker.

        Args:
            source_wav (str):
                Path to the source wav file.
            target_wav (str):
                Path to the target wav file.
        """
        wav = self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav)
        return wav
    def voice_conversion_to_file(
        self,
        source_wav: str,
        target_wav: str,
        file_path: str = "output.wav",
    ):
        """Voice conversion with FreeVC. Convert source wav to target speaker.

        Args:
            source_wav (str):
                Path to the source wav file.
            target_wav (str):
                Path to the target wav file.
            file_path (str, optional):
                Output file path. Defaults to "output.wav".
        """
        wav = self.voice_conversion(source_wav=source_wav, target_wav=target_wav)
        save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
        return file_path
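For pure voice conversion, no text is involved: FreeVC re-renders the speech in `source_wav` with the speaker identity of `target_wav`. A sketch with placeholder paths:

    from TTS.api import TTS

    tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24")
    tts.voice_conversion_to_file(
        source_wav="what_was_said.wav",
        target_wav="who_should_say_it.wav",
        file_path="converted.wav",
    )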
    def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None):
        """Convert text to speech with voice conversion.

        It combines tts with voice conversion to approximate voice cloning.

        - Convert text to speech with tts.
        - Convert the output wav to the target speaker with voice conversion.

        Args:
            text (str):
                Input text to synthesize.
            language (str, optional):
                Language code for multi-lingual models. You can check whether loaded model is multi-lingual by
                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
            speaker_wav (str, optional):
                Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                Defaults to None.
        """
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            # Lazy code... save it to a temp file to resample it while reading it for VC
            self.tts_to_file(text=text, speaker=None, language=language, file_path=fp.name, speaker_wav=speaker_wav)
        if self.voice_converter is None:
            self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
        wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav)
        return wav
    def tts_with_vc_to_file(
        self, text: str, language: str = None, speaker_wav: str = None, file_path: str = "output.wav"
    ):
        """Convert text to speech with voice conversion and save to file.

        Check `tts_with_vc` for more details.

        Args:
            text (str):
                Input text to synthesize.
            language (str, optional):
                Language code for multi-lingual models. You can check whether loaded model is multi-lingual by
                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
            speaker_wav (str, optional):
                Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                Defaults to None.
            file_path (str, optional):
                Output file path. Defaults to "output.wav".
        """
        wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav)
        save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
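`tts_with_vc_to_file` is the fallback cloning path for models without native `speaker_wav` support: it first synthesizes with the loaded TTS model, then FreeVC shifts the result toward the reference voice. A sketch, assuming a single-speaker English model:

    from TTS.api import TTS

    tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
    tts.tts_with_vc_to_file(
        text="This voice is borrowed from the reference clip.",
        speaker_wav="reference_voice.wav",
        file_path="cloned_via_vc.wav",
    )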
    	
TTS/bin/__init__.py
ADDED

File without changes
    	
TTS/bin/collect_env_info.py
ADDED

@@ -0,0 +1,48 @@
"""Get detailed info about the working environment."""
import os
import platform
import sys

import numpy
import torch

sys.path += [os.path.abspath(".."), os.path.abspath(".")]
import json

import TTS


def system_info():
    return {
        "OS": platform.system(),
        "architecture": platform.architecture(),
        "version": platform.version(),
        "processor": platform.processor(),
        "python": platform.python_version(),
    }


def cuda_info():
    return {
        "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())],
        "available": torch.cuda.is_available(),
        "version": torch.version.cuda,
    }


def package_info():
    return {
        "numpy": numpy.__version__,
        "PyTorch_version": torch.__version__,
        "PyTorch_debug": torch.version.debug,
        "TTS": TTS.__version__,
    }


def main():
    details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()}
    print(json.dumps(details, indent=4, sort_keys=True))


if __name__ == "__main__":
    main()
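The script is also importable, so the same report can be assembled programmatically. A sketch (assumes the repository root is on `sys.path`, as the module itself arranges):

    from TTS.bin.collect_env_info import cuda_info, package_info, system_info

    details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()}
    print(sorted(details))  # ['CUDA', 'Packages', 'System']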
    	
TTS/bin/compute_attention_masks.py
ADDED

@@ -0,0 +1,165 @@
import argparse
import importlib
import os
from argparse import RawTextHelpFormatter

import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from TTS.config import load_config
from TTS.tts.datasets.TTSDataset import TTSDataset
from TTS.tts.models import setup_model
from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_checkpoint

if __name__ == "__main__":
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Extract attention masks from trained Tacotron/Tacotron2 models.
These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n"""
        """Each attention mask is written to the same path as the input wav file with "_attn.npy" suffix.
(e.g. path/bla.wav (wav file) --> path/bla_attn.npy (attention mask))\n"""
        """
Example run:
    CUDA_VISIBLE_DEVICES="0" python TTS/bin/compute_attention_masks.py
        --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth
        --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
        --dataset_metafile metadata.csv
        --data_path /root/LJSpeech-1.1/
        --batch_size 32
        --dataset ljspeech
        --use_cuda True
""",
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--model_path", type=str, required=True, help="Path to Tacotron/Tacotron2 model file.")
    parser.add_argument(
        "--config_path",
        type=str,
        required=True,
        help="Path to Tacotron/Tacotron2 config file.",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default="",
        required=True,
        help="Target dataset processor name from TTS.tts.dataset.preprocess.",
    )

    parser.add_argument(
        "--dataset_metafile",
        type=str,
        default="",
        required=True,
        help="Dataset metafile including file paths with transcripts.",
    )
    parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
    parser.add_argument("--use_cuda", type=bool, default=False, help="enable/disable cuda.")

    parser.add_argument(
        "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
    )
    args = parser.parse_args()

    C = load_config(args.config_path)
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if "characters" in C.keys():
        symbols, phonemes = make_symbols(**C.characters)

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    # TODO: handle multi-speaker
    model = setup_model(C)
    model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True)

    # data loader
    preprocessor = importlib.import_module("TTS.tts.datasets.formatters")
    preprocessor = getattr(preprocessor, args.dataset)
    meta_data = preprocessor(args.data_path, args.dataset_metafile)
    dataset = TTSDataset(
        model.decoder.r,
        C.text_cleaner,
        compute_linear_spec=False,
        ap=ap,
        meta_data=meta_data,
        characters=C.characters if "characters" in C.keys() else None,
        add_blank=C["add_blank"] if "add_blank" in C.keys() else False,
        use_phonemes=C.use_phonemes,
        phoneme_cache_path=C.phoneme_cache_path,
        phoneme_language=C.phoneme_language,
        enable_eos_bos=C.enable_eos_bos_chars,
    )

    dataset.sort_and_filter_items(C.get("sort_by_audio_len", default=False))
    loader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        num_workers=4,
        collate_fn=dataset.collate_fn,
        shuffle=False,
        drop_last=False,
    )

    # compute attentions
    file_paths = []
    with torch.no_grad():
        for data in tqdm(loader):
            # setup input data
            text_input = data[0]
            text_lengths = data[1]
            linear_input = data[3]
            mel_input = data[4]
            mel_lengths = data[5]
            stop_targets = data[6]
            item_idxs = data[7]

            # dispatch data to GPU
            if args.use_cuda:
                text_input = text_input.cuda()
                text_lengths = text_lengths.cuda()
                mel_input = mel_input.cuda()
                mel_lengths = mel_lengths.cuda()

            model_outputs = model.forward(text_input, text_lengths, mel_input)

            alignments = model_outputs["alignments"].detach()
            for idx, alignment in enumerate(alignments):
                item_idx = item_idxs[idx]
                # interpolate if r > 1
                alignment = (
                    torch.nn.functional.interpolate(
                        alignment.transpose(0, 1).unsqueeze(0),
                        size=None,
                        scale_factor=model.decoder.r,
                        mode="nearest",
                        align_corners=None,
                        recompute_scale_factor=None,
                    )
                    .squeeze(0)
                    .transpose(0, 1)
                )
                # remove paddings
                alignment = alignment[: mel_lengths[idx], : text_lengths[idx]].cpu().numpy()
                # set file paths
                wav_file_name = os.path.basename(item_idx)
                align_file_name = os.path.splitext(wav_file_name)[0] + "_attn.npy"
                file_path = item_idx.replace(wav_file_name, align_file_name)
                # save output
                wav_file_abs_path = os.path.abspath(item_idx)
                file_abs_path = os.path.abspath(file_path)
                file_paths.append([wav_file_abs_path, file_abs_path])
                np.save(file_path, alignment)

        # output metafile
        metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")

        with open(metafile, "w", encoding="utf-8") as f:
            for p in file_paths:
                f.write(f"{p[0]}|{p[1]}\n")
        print(f" >> Metafile created: {metafile}")
    	
TTS/bin/compute_embeddings.py
ADDED

@@ -0,0 +1,197 @@
import argparse
import os
from argparse import RawTextHelpFormatter

import torch
from tqdm import tqdm

from TTS.config import load_config
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.managers import save_file
from TTS.tts.utils.speakers import SpeakerManager


def compute_embeddings(
    model_path,
    config_path,
    output_path,
    old_speakers_file=None,
    old_append=False,
    config_dataset_path=None,
    formatter_name=None,
    dataset_name=None,
    dataset_path=None,
    meta_file_train=None,
    meta_file_val=None,
    disable_cuda=False,
    no_eval=False,
):
    use_cuda = torch.cuda.is_available() and not disable_cuda

    if config_dataset_path is not None:
        c_dataset = load_config(config_dataset_path)
        meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not no_eval)
    else:
        c_dataset = BaseDatasetConfig()
        c_dataset.formatter = formatter_name
        c_dataset.dataset_name = dataset_name
        c_dataset.path = dataset_path
        if meta_file_train is not None:
            c_dataset.meta_file_train = meta_file_train
        if meta_file_val is not None:
            c_dataset.meta_file_val = meta_file_val
        meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not no_eval)

    if meta_data_eval is None:
        samples = meta_data_train
    else:
        samples = meta_data_train + meta_data_eval

    encoder_manager = SpeakerManager(
        encoder_model_path=model_path,
        encoder_config_path=config_path,
        d_vectors_file_path=old_speakers_file,
        use_cuda=use_cuda,
    )

    class_name_key = encoder_manager.encoder_config.class_name_key

    # compute speaker embeddings
    if old_speakers_file is not None and old_append:
        speaker_mapping = encoder_manager.embeddings
    else:
        speaker_mapping = {}

    for fields in tqdm(samples):
        class_name = fields[class_name_key]
        audio_file = fields["audio_file"]
        embedding_key = fields["audio_unique_name"]

        # Only update the speaker name when the embedding is already in the old file.
        if embedding_key in speaker_mapping:
            speaker_mapping[embedding_key]["name"] = class_name
            continue

        if old_speakers_file is not None and embedding_key in encoder_manager.clip_ids:
            # get the embedding from the old file
            embedd = encoder_manager.get_embedding_by_clip(embedding_key)
        else:
            # extract the embedding
            embedd = encoder_manager.compute_embedding_from_clip(audio_file)

        # create speaker_mapping if target dataset is defined
        speaker_mapping[embedding_key] = {}
        speaker_mapping[embedding_key]["name"] = class_name
        speaker_mapping[embedding_key]["embedding"] = embedd

    if speaker_mapping:
        # save speaker_mapping if target dataset is defined
        if os.path.isdir(output_path):
            mapping_file_path = os.path.join(output_path, "speakers.pth")
        else:
            mapping_file_path = output_path

        if os.path.dirname(mapping_file_path) != "":
            os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)

        save_file(speaker_mapping, mapping_file_path)
        print("Speaker embeddings saved at:", mapping_file_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
        """
        Example runs:
        python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json  --config_dataset_path dataset_config.json

        python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json  --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv
         | 
| 110 | 
            +
                    """,
         | 
| 111 | 
            +
                    formatter_class=RawTextHelpFormatter,
         | 
| 112 | 
            +
                )
         | 
| 113 | 
            +
                parser.add_argument(
         | 
| 114 | 
            +
                    "--model_path",
         | 
| 115 | 
            +
                    type=str,
         | 
| 116 | 
            +
                    help="Path to model checkpoint file. It defaults to the released speaker encoder.",
         | 
| 117 | 
            +
                    default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
         | 
| 118 | 
            +
                )
         | 
| 119 | 
            +
                parser.add_argument(
         | 
| 120 | 
            +
                    "--config_path",
         | 
| 121 | 
            +
                    type=str,
         | 
| 122 | 
            +
                    help="Path to model config file. It defaults to the released speaker encoder config.",
         | 
| 123 | 
            +
                    default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
         | 
| 124 | 
            +
                )
         | 
| 125 | 
            +
                parser.add_argument(
         | 
| 126 | 
            +
                    "--config_dataset_path",
         | 
| 127 | 
            +
                    type=str,
         | 
| 128 | 
            +
                    help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.",
         | 
| 129 | 
            +
                    default=None,
         | 
| 130 | 
            +
                )
         | 
| 131 | 
            +
                parser.add_argument(
         | 
| 132 | 
            +
                    "--output_path",
         | 
| 133 | 
            +
                    type=str,
         | 
| 134 | 
            +
                    help="Path for output `pth` or `json` file.",
         | 
| 135 | 
            +
                    default="speakers.pth",
         | 
| 136 | 
            +
                )
         | 
| 137 | 
            +
                parser.add_argument(
         | 
| 138 | 
            +
                    "--old_file",
         | 
| 139 | 
            +
                    type=str,
         | 
| 140 | 
            +
                    help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.",
         | 
| 141 | 
            +
                    default=None,
         | 
| 142 | 
            +
                )
         | 
| 143 | 
            +
                parser.add_argument(
         | 
| 144 | 
            +
                    "--old_append",
         | 
| 145 | 
            +
                    help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False",
         | 
| 146 | 
            +
                    default=False,
         | 
| 147 | 
            +
                    action="store_true",
         | 
| 148 | 
            +
                )
         | 
| 149 | 
            +
                parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
         | 
| 150 | 
            +
                parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true")
         | 
| 151 | 
            +
                parser.add_argument(
         | 
| 152 | 
            +
                    "--formatter_name",
         | 
| 153 | 
            +
                    type=str,
         | 
| 154 | 
            +
                    help="Name of the formatter to use. You either need to provide this or `config_dataset_path`",
         | 
| 155 | 
            +
                    default=None,
         | 
| 156 | 
            +
                )
         | 
| 157 | 
            +
                parser.add_argument(
         | 
| 158 | 
            +
                    "--dataset_name",
         | 
| 159 | 
            +
                    type=str,
         | 
| 160 | 
            +
                    help="Name of the dataset to use. You either need to provide this or `config_dataset_path`",
         | 
| 161 | 
            +
                    default=None,
         | 
| 162 | 
            +
                )
         | 
| 163 | 
            +
                parser.add_argument(
         | 
| 164 | 
            +
                    "--dataset_path",
         | 
| 165 | 
            +
                    type=str,
         | 
| 166 | 
            +
                    help="Path to the dataset. You either need to provide this or `config_dataset_path`",
         | 
| 167 | 
            +
                    default=None,
         | 
| 168 | 
            +
                )
         | 
| 169 | 
            +
                parser.add_argument(
         | 
| 170 | 
            +
                    "--meta_file_train",
         | 
| 171 | 
            +
                    type=str,
         | 
| 172 | 
            +
                    help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
         | 
| 173 | 
            +
                    default=None,
         | 
| 174 | 
            +
                )
         | 
| 175 | 
            +
                parser.add_argument(
         | 
| 176 | 
            +
                    "--meta_file_val",
         | 
| 177 | 
            +
                    type=str,
         | 
| 178 | 
            +
                    help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
         | 
| 179 | 
            +
                    default=None,
         | 
| 180 | 
            +
                )
         | 
| 181 | 
            +
                args = parser.parse_args()
         | 
| 182 | 
            +
             | 
| 183 | 
            +
                compute_embeddings(
         | 
| 184 | 
            +
                    args.model_path,
         | 
| 185 | 
            +
                    args.config_path,
         | 
| 186 | 
            +
                    args.output_path,
         | 
| 187 | 
            +
                    old_speakers_file=args.old_file,
         | 
| 188 | 
            +
                    old_append=args.old_append,
         | 
| 189 | 
            +
                    config_dataset_path=args.config_dataset_path,
         | 
| 190 | 
            +
                    formatter_name=args.formatter_name,
         | 
| 191 | 
            +
                    dataset_name=args.dataset_name,
         | 
| 192 | 
            +
                    dataset_path=args.dataset_path,
         | 
| 193 | 
            +
                    meta_file_train=args.meta_file_train,
         | 
| 194 | 
            +
                    meta_file_val=args.meta_file_val,
         | 
| 195 | 
            +
                    disable_cuda=args.disable_cuda,
         | 
| 196 | 
            +
                    no_eval=args.no_eval,
         | 
| 197 | 
            +
                )
         | 
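
For reference, the `speakers.pth` written by `save_file` above is a plain dict keyed by `audio_unique_name`. A minimal inspection sketch (assuming a local `speakers.pth` produced by this script; files of this kind are readable with `torch.load`):

    import torch

    # {audio_unique_name: {"name": <speaker>, "embedding": <d-vector values>}}
    speaker_mapping = torch.load("speakers.pth")  # assumed output of compute_embeddings.py
    key = next(iter(speaker_mapping))
    print(key, speaker_mapping[key]["name"], len(speaker_mapping[key]["embedding"]))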
    	
        TTS/bin/compute_statistics.py
    ADDED
    
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import argparse
+import glob
+import os
+
+import numpy as np
+from tqdm import tqdm
+
+from TTS.config import load_config
+from TTS.tts.datasets import load_tts_samples
+from TTS.utils.audio import AudioProcessor
+
+
+def main():
+    """Run preprocessing process."""
+    parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogram features.")
+    parser.add_argument("config_path", type=str, help="TTS config file path to define audio processing parameters.")
+    parser.add_argument("out_path", type=str, help="save path (directory and filename).")
+    parser.add_argument(
+        "--data_path",
+        type=str,
+        required=False,
+        help="folder including the target set of wavs, overriding the dataset config.",
+    )
+    args, overrides = parser.parse_known_args()
+
+    CONFIG = load_config(args.config_path)
+    CONFIG.parse_known_args(overrides, relaxed_parser=True)
+
+    # load config
+    CONFIG.audio.signal_norm = False  # do not apply earlier normalization
+    CONFIG.audio.stats_path = None  # discard pre-defined stats
+
+    # load audio processor
+    ap = AudioProcessor(**CONFIG.audio.to_dict())
+
+    # load the meta data of target dataset
+    if args.data_path:
+        dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
+    else:
+        dataset_items = load_tts_samples(CONFIG.datasets)[0]  # take only train data
+    print(f" > There are {len(dataset_items)} files.")
+
+    mel_sum = 0
+    mel_square_sum = 0
+    linear_sum = 0
+    linear_square_sum = 0
+    N = 0
+    for item in tqdm(dataset_items):
+        # compute features
+        wav = ap.load_wav(item if isinstance(item, str) else item["audio_file"])
+        linear = ap.spectrogram(wav)
+        mel = ap.melspectrogram(wav)
+
+        # compute stats
+        N += mel.shape[1]
+        mel_sum += mel.sum(1)
+        linear_sum += linear.sum(1)
+        mel_square_sum += (mel**2).sum(axis=1)
+        linear_square_sum += (linear**2).sum(axis=1)
+
+    mel_mean = mel_sum / N
+    mel_scale = np.sqrt(mel_square_sum / N - mel_mean**2)
+    linear_mean = linear_sum / N
+    linear_scale = np.sqrt(linear_square_sum / N - linear_mean**2)
+
+    output_file_path = args.out_path
+    stats = {}
+    stats["mel_mean"] = mel_mean
+    stats["mel_std"] = mel_scale
+    stats["linear_mean"] = linear_mean
+    stats["linear_std"] = linear_scale
+
+    print(f" > Avg mel spec mean: {mel_mean.mean()}")
+    print(f" > Avg mel spec scale: {mel_scale.mean()}")
+    print(f" > Avg linear spec mean: {linear_mean.mean()}")
+    print(f" > Avg linear spec scale: {linear_scale.mean()}")
+
+    # set default config values for mean-var scaling
+    CONFIG.audio.stats_path = output_file_path
+    CONFIG.audio.signal_norm = True
+    # remove redundant values
+    del CONFIG.audio.max_norm
+    del CONFIG.audio.min_level_db
+    del CONFIG.audio.symmetric_norm
+    del CONFIG.audio.clip_norm
+    stats["audio_config"] = CONFIG.audio.to_dict()
+    np.save(output_file_path, stats, allow_pickle=True)
+    print(f" > stats saved to {output_file_path}")
+
+
+if __name__ == "__main__":
+    main()
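
Note that the stats loop above accumulates only per-bin sums and squared sums, then recovers the standard deviation through the identity Var[x] = E[x²] − E[x]², so it never needs to hold all spectrograms in memory. A quick NumPy check of that identity (illustrative only, independent of TTS):

    import numpy as np

    x = np.random.randn(80, 1000)  # e.g. 80 mel bins x 1000 frames
    n = x.shape[1]
    mean = x.sum(axis=1) / n
    std = np.sqrt((x**2).sum(axis=1) / n - mean**2)  # E[x^2] - (E[x])^2
    assert np.allclose(std, x.std(axis=1))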
    	
        TTS/bin/eval_encoder.py
    ADDED
    
@@ -0,0 +1,88 @@
+import argparse
+from argparse import RawTextHelpFormatter
+
+import torch
+from tqdm import tqdm
+
+from TTS.config import load_config
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.utils.speakers import SpeakerManager
+
+
+def compute_encoder_accuracy(dataset_items, encoder_manager):
+    class_name_key = encoder_manager.encoder_config.class_name_key
+    map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)
+
+    class_acc_dict = {}
+
+    # compute embeddings for all wav files
+    for item in tqdm(dataset_items):
+        class_name = item[class_name_key]
+        wav_file = item["audio_file"]
+
+        # extract the embedding
+        embedd = encoder_manager.compute_embedding_from_clip(wav_file)
+        if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None:
+            embedding = torch.FloatTensor(embedd).unsqueeze(0)
+            if encoder_manager.use_cuda:
+                embedding = embedding.cuda()
+
+            class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item()
+            predicted_label = map_classid_to_classname[str(class_id)]
+        else:
+            predicted_label = None
+
+        if class_name is not None and predicted_label is not None:
+            is_equal = int(class_name == predicted_label)
+            if class_name not in class_acc_dict:
+                class_acc_dict[class_name] = [is_equal]
+            else:
+                class_acc_dict[class_name].append(is_equal)
+        else:
+            raise RuntimeError("Error: class_name and/or predicted_label are None")
+
+    acc_avg = 0
+    for key, values in class_acc_dict.items():
+        acc = sum(values) / len(values)
+        print("Class", key, "Accuracy:", acc)
+        acc_avg += acc
+
+    print("Average Accuracy:", acc_avg / len(class_acc_dict))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="""Compute the accuracy of the encoder.\n\n"""
+        """
+        Example runs:
+        python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json
+        """,
+        formatter_class=RawTextHelpFormatter,
+    )
+    parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
+    parser.add_argument(
+        "config_path",
+        type=str,
+        help="Path to model config file.",
+    )
+    parser.add_argument(
+        "config_dataset_path",
+        type=str,
+        help="Path to dataset config file.",
+    )
+    parser.add_argument("--use_cuda", type=bool, help="Flag to enable CUDA.", default=True)
+    parser.add_argument("--eval", type=bool, help="Evaluate on the eval split as well.", default=True)
+
+    args = parser.parse_args()
+
+    c_dataset = load_config(args.config_dataset_path)
+
+    meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
+    items = meta_data_train + meta_data_eval
+
+    enc_manager = SpeakerManager(
+        encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
+    )
+
+    compute_encoder_accuracy(items, enc_manager)
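
Note that `compute_encoder_accuracy` reports a macro average, i.e. the mean of per-class accuracies, so a two-clip speaker weighs as much as a thousand-clip one. A small illustration with hypothetical per-class hit lists (mirroring the `class_acc_dict` structure above):

    # 1 = correct prediction, 0 = wrong; two classes of very different size
    class_acc_dict = {"spk_a": [1, 1, 1, 1, 1, 1, 1, 1, 0, 0], "spk_b": [0, 1]}
    macro = sum(sum(v) / len(v) for v in class_acc_dict.values()) / len(class_acc_dict)
    micro = sum(sum(v) for v in class_acc_dict.values()) / sum(len(v) for v in class_acc_dict.values())
    print(macro, micro)  # 0.65 vs. 0.75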
    	
        TTS/bin/extract_tts_spectrograms.py
    ADDED
    
@@ -0,0 +1,287 @@
+#!/usr/bin/env python3
+"""Extract Mel spectrograms with teacher forcing."""
+
+import argparse
+import os
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+from TTS.config import load_config
+from TTS.tts.datasets import TTSDataset, load_tts_samples
+from TTS.tts.models import setup_model
+from TTS.tts.utils.speakers import SpeakerManager
+from TTS.tts.utils.text.tokenizer import TTSTokenizer
+from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.numpy_transforms import quantize
+from TTS.utils.generic_utils import count_parameters
+
+use_cuda = torch.cuda.is_available()
+
+
+def setup_loader(ap, r, verbose=False):
+    tokenizer, _ = TTSTokenizer.init_from_config(c)
+    dataset = TTSDataset(
+        outputs_per_step=r,
+        compute_linear_spec=False,
+        samples=meta_data,
+        tokenizer=tokenizer,
+        ap=ap,
+        batch_group_size=0,
+        min_text_len=c.min_text_len,
+        max_text_len=c.max_text_len,
+        min_audio_len=c.min_audio_len,
+        max_audio_len=c.max_audio_len,
+        phoneme_cache_path=c.phoneme_cache_path,
+        precompute_num_workers=0,
+        use_noise_augment=False,
+        verbose=verbose,
+        speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None,
+        d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
+    )
+
+    if c.use_phonemes and c.compute_input_seq_cache:
+        # precompute phonemes to have a better estimate of sequence lengths.
+        dataset.compute_input_seq(c.num_loader_workers)
+    dataset.preprocess_samples()
+
+    loader = DataLoader(
+        dataset,
+        batch_size=c.batch_size,
+        shuffle=False,
+        collate_fn=dataset.collate_fn,
+        drop_last=False,
+        sampler=None,
+        num_workers=c.num_loader_workers,
+        pin_memory=False,
+    )
+    return loader
+
+
+def set_filename(wav_path, out_path):
+    wav_file = os.path.basename(wav_path)
+    file_name = wav_file.split(".")[0]
+    os.makedirs(os.path.join(out_path, "quant"), exist_ok=True)
+    os.makedirs(os.path.join(out_path, "mel"), exist_ok=True)
+    os.makedirs(os.path.join(out_path, "wav_gl"), exist_ok=True)
+    os.makedirs(os.path.join(out_path, "wav"), exist_ok=True)
+    wavq_path = os.path.join(out_path, "quant", file_name)
+    mel_path = os.path.join(out_path, "mel", file_name)
+    wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav")
+    wav_path = os.path.join(out_path, "wav", file_name + ".wav")
+    return file_name, wavq_path, mel_path, wav_gl_path, wav_path
+
+
+def format_data(data):
+    # setup input data
+    text_input = data["token_id"]
+    text_lengths = data["token_id_lengths"]
+    mel_input = data["mel"]
+    mel_lengths = data["mel_lengths"]
+    item_idx = data["item_idxs"]
+    d_vectors = data["d_vectors"]
+    speaker_ids = data["speaker_ids"]
+    attn_mask = data["attns"]
+    avg_text_length = torch.mean(text_lengths.float())
+    avg_spec_length = torch.mean(mel_lengths.float())
+
+    # dispatch data to GPU
+    if use_cuda:
+        text_input = text_input.cuda(non_blocking=True)
+        text_lengths = text_lengths.cuda(non_blocking=True)
+        mel_input = mel_input.cuda(non_blocking=True)
+        mel_lengths = mel_lengths.cuda(non_blocking=True)
+        if speaker_ids is not None:
+            speaker_ids = speaker_ids.cuda(non_blocking=True)
+        if d_vectors is not None:
+            d_vectors = d_vectors.cuda(non_blocking=True)
+        if attn_mask is not None:
+            attn_mask = attn_mask.cuda(non_blocking=True)
+    return (
+        text_input,
+        text_lengths,
+        mel_input,
+        mel_lengths,
+        speaker_ids,
+        d_vectors,
+        avg_text_length,
+        avg_spec_length,
+        attn_mask,
+        item_idx,
+    )
+
+
+@torch.no_grad()
+def inference(
+    model_name,
+    model,
+    ap,
+    text_input,
+    text_lengths,
+    mel_input,
+    mel_lengths,
+    speaker_ids=None,
+    d_vectors=None,
+):
+    if model_name == "glow_tts":
+        speaker_c = None
+        if speaker_ids is not None:
+            speaker_c = speaker_ids
+        elif d_vectors is not None:
+            speaker_c = d_vectors
+        outputs = model.inference_with_MAS(
+            text_input,
+            text_lengths,
+            mel_input,
+            mel_lengths,
+            aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids},
+        )
+        model_output = outputs["model_outputs"]
+        model_output = model_output.detach().cpu().numpy()
+
+    elif "tacotron" in model_name:
+        aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
+        outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input)
+        postnet_outputs = outputs["model_outputs"]
+        # normalize tacotron output
+        if model_name == "tacotron":
+            mel_specs = []
+            postnet_outputs = postnet_outputs.data.cpu().numpy()
+            for b in range(postnet_outputs.shape[0]):
+                postnet_output = postnet_outputs[b]
+                mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T))
+            model_output = torch.stack(mel_specs).cpu().numpy()
+
+        elif model_name == "tacotron2":
+            model_output = postnet_outputs.detach().cpu().numpy()
+    return model_output
+
+
+def extract_spectrograms(
+    data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt"
+):
+    model.eval()
+    export_metadata = []
+    for _, data in tqdm(enumerate(data_loader), total=len(data_loader)):
+        # format data
+        (
+            text_input,
+            text_lengths,
+            mel_input,
+            mel_lengths,
+            speaker_ids,
+            d_vectors,
+            _,
+            _,
+            _,
+            item_idx,
+        ) = format_data(data)
+
+        model_output = inference(
+            c.model.lower(),
+            model,
+            ap,
+            text_input,
+            text_lengths,
+            mel_input,
+            mel_lengths,
+            speaker_ids,
+            d_vectors,
+        )
+
+        for idx in range(text_input.shape[0]):
+            wav_file_path = item_idx[idx]
+            wav = ap.load_wav(wav_file_path)
+            _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)
+
+            # quantize and save wav
+            if quantize_bits > 0:
+                wavq = quantize(wav, quantize_bits)
+                np.save(wavq_path, wavq)
+
+            # save TTS mel
+            mel = model_output[idx]
+            mel_length = mel_lengths[idx]
+            mel = mel[:mel_length, :].T
+            np.save(mel_path, mel)
+
+            export_metadata.append([wav_file_path, mel_path])
+            if save_audio:
+                ap.save_wav(wav, wav_path)
+
+            if debug:
+                print("Audio for debug saved at:", wav_gl_path)
+                wav = ap.inv_melspectrogram(mel)
+                ap.save_wav(wav, wav_gl_path)
+
+    with open(os.path.join(output_path, metada_name), "w", encoding="utf-8") as f:
+        for data in export_metadata:
+            f.write(f"{data[0]}|{data[1] + '.npy'}\n")
+
+
+def main(args):  # pylint: disable=redefined-outer-name
+    # pylint: disable=global-variable-undefined
+    global meta_data, speaker_manager
+
+    # Audio processor
+    ap = AudioProcessor(**c.audio)
+
+    # load data instances
+    meta_data_train, meta_data_eval = load_tts_samples(
+        c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
+    )
+
+    # use eval and training partitions
+    meta_data = meta_data_train + meta_data_eval
+
+    # init speaker manager
+    if c.use_speaker_embedding:
+        speaker_manager = SpeakerManager(data_items=meta_data)
+    elif c.use_d_vector_file:
+        speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file)
+    else:
+        speaker_manager = None
+
+    # setup model
+    model = setup_model(c)
+
+    # restore model
+    model.load_checkpoint(c, args.checkpoint_path, eval=True)
+
+    if use_cuda:
+        model.cuda()
+
+    num_params = count_parameters(model)
+    print("\n > Model has {} parameters".format(num_params), flush=True)
+    # set r
+    r = 1 if c.model.lower() == "glow_tts" else model.decoder.r
+    own_loader = setup_loader(ap, r, verbose=True)
+
+    extract_spectrograms(
+        own_loader,
+        model,
+        ap,
+        args.output_path,
+        quantize_bits=args.quantize_bits,
+        save_audio=args.save_audio,
+        debug=args.debug,
+        metada_name="metada.txt",
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True)
+    parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True)
+    parser.add_argument("--output_path", type=str, help="Path to save mel specs.", required=True)
+    parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debugging.")
+    parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files.")
+    parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero.")
+    parser.add_argument("--eval", type=bool, help="Compute on the eval split too.", default=True)
+    args = parser.parse_args()
+
+    c = load_config(args.config_path)
+    c.audio.trim_silence = False
+    main(args)
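
The `metada.txt` written by `extract_spectrograms` pairs each source wav with its extracted mel, one `wav_path|mel_path.npy` line per clip. A minimal sketch reading it back, e.g. to feed a vocoder fine-tuning step (the `output/` directory is a hypothetical `--output_path`):

    import numpy as np

    with open("output/metada.txt", encoding="utf-8") as f:
        pairs = [line.strip().split("|") for line in f if line.strip()]

    wav_path, mel_path = pairs[0]
    mel = np.load(mel_path)  # saved transposed above: (num_mel_bins, num_frames)
    print(wav_path, mel.shape)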
    	
        TTS/bin/find_unique_chars.py
    ADDED
    
@@ -0,0 +1,45 @@
+"""Find all the unique characters in a dataset"""
+import argparse
+from argparse import RawTextHelpFormatter
+
+from TTS.config import load_config
+from TTS.tts.datasets import load_tts_samples
+
+
+def main():
+    # pylint: disable=bad-option-value
+    parser = argparse.ArgumentParser(
+        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
+        """
+    Example runs:
+
+    python TTS/bin/find_unique_chars.py --config_path config.json
+    """,
+        formatter_class=RawTextHelpFormatter,
+    )
+    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
+    args = parser.parse_args()
+
+    c = load_config(args.config_path)
+
+    # load all datasets
+    train_items, eval_items = load_tts_samples(
+        c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
+    )
+
+    items = train_items + eval_items
+
+    texts = "".join(item["text"] for item in items)
+    chars = set(texts)
+    lower_chars = filter(lambda c: c.islower(), chars)
+    chars_force_lower = [c.lower() for c in chars]
+    chars_force_lower = set(chars_force_lower)
+
+    print(f" > Number of unique characters: {len(chars)}")
+    print(f" > Unique characters: {''.join(sorted(chars))}")
+    print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
+    print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
+
+
+if __name__ == "__main__":
+    main()
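
The printed character sets are what you would typically carry over into the model config's `characters` section so the tokenizer covers the whole dataset. The core of the script is plain set arithmetic, e.g.:

    texts = ["Hello world.", "Voice cloning!"]
    chars = set("".join(texts))
    print(f"{len(chars)} unique characters:", "".join(sorted(chars)))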
    	
        TTS/bin/find_unique_phonemes.py
    ADDED
    
    | @@ -0,0 +1,74 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
"""Find all the unique phonemes in a dataset"""
import argparse
import multiprocessing
from argparse import RawTextHelpFormatter

from tqdm.contrib.concurrent import process_map

from TTS.config import load_config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.text.phonemizers import Gruut


def compute_phonemes(item):
    text = item["text"]
    ph = phonemizer.phonemize(text).replace("|", "")
    return set(ph)


def main():
    # pylint: disable=W0601
    global c, phonemizer
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
        """
    Example runs:

    python TTS/bin/find_unique_phonemes.py --config_path config.json
    """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
    args = parser.parse_args()

    c = load_config(args.config_path)

    # load all datasets
    train_items, eval_items = load_tts_samples(
        c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
    )
    items = train_items + eval_items
    print("Num items:", len(items))

    language_list = [item["language"] for item in items]
    is_lang_def = all(language_list)

    if not c.phoneme_language or not is_lang_def:
        raise ValueError("Phoneme language must be defined in the config, and every sample needs a language.")

    if language_list.count(language_list[0]) != len(language_list):
        raise ValueError(
            "Currently, only one phoneme language per config file is supported. Please split the dataset config "
            "into one config per language and run each individually."
        )

    phonemizer = Gruut(language=language_list[0], keep_puncs=True)

    phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
    phones = []
    for ph in phonemes:
        phones.extend(ph)

    phones = set(phones)
    lower_phones = filter(lambda c: c.islower(), phones)
    phones_force_lower = [c.lower() for c in phones]
    phones_force_lower = set(phones_force_lower)

    print(f" > Number of unique phonemes: {len(phones)}")
    print(f" > Unique phonemes: {''.join(sorted(phones))}")
    print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
    print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")


if __name__ == "__main__":
    main()
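Note that `compute_phonemes` reads `phonemizer` from module scope instead of taking it as an argument: `process_map` only pickles the function reference and the items, and on fork-based start methods (the Linux default) each worker inherits the global that `main` assigned before the pool started. A minimal sketch of the same pattern (the `tag` task is hypothetical, not from the diff):

```python
import multiprocessing

from tqdm.contrib.concurrent import process_map

prefix = None  # assigned in main(); fork-based workers inherit the value


def tag(text):
    # reads the module-level "prefix" rather than receiving it per call
    return f"{prefix}{text}"


def main():
    global prefix
    prefix = "> "
    out = process_map(tag, ["a", "b", "c"], max_workers=multiprocessing.cpu_count(), chunksize=1)
    print(out)  # ['> a', '> b', '> c'] with the fork start method


if __name__ == "__main__":
    main()
```

On spawn-based platforms (Windows, recent macOS defaults) the workers re-import the module and would see the unassigned global, which is worth keeping in mind before reusing this script there.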
    	
        TTS/bin/remove_silence_using_vad.py
    ADDED
    
@@ -0,0 +1,124 @@
import argparse
import glob
import multiprocessing
import os
import pathlib

import torch
from tqdm import tqdm

from TTS.utils.vad import get_vad_model_and_utils, remove_silence

torch.set_num_threads(1)


def adjust_path_and_remove_silence(audio_path):
    output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
    # ignore if the file exists
    if os.path.exists(output_path) and not args.force:
        return output_path, False

    # create all directory structure
    pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    # remove the silence and save the audio
    output_path, is_speech = remove_silence(
        model_and_utils,
        audio_path,
        output_path,
        trim_just_beginning_and_end=args.trim_just_beginning_and_end,
        use_cuda=args.use_cuda,
    )
    return output_path, is_speech


def preprocess_audios():
    files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
    print("> Number of files: ", len(files))
    if not args.force:
        print("> Ignoring files that already exist in the output directory.")

    if args.trim_just_beginning_and_end:
        print("> Trimming only the nonspeech parts at the beginning and the end.")
    else:
        print("> Trimming all nonspeech parts.")

    filtered_files = []
    if files:
        # create threads
        # num_threads = multiprocessing.cpu_count()
        # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)

        if args.num_processes > 1:
            with multiprocessing.Pool(processes=args.num_processes) as pool:
                results = list(
                    tqdm(
                        pool.imap_unordered(adjust_path_and_remove_silence, files),
                        total=len(files),
                        desc="Processing audio files",
                    )
                )
            for output_path, is_speech in results:
                if not is_speech:
                    filtered_files.append(output_path)
        else:
            for f in tqdm(files):
                output_path, is_speech = adjust_path_and_remove_silence(f)
                if not is_speech:
                    filtered_files.append(output_path)

        # write out the files in which no speech was detected
        with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
            for file in filtered_files:
                f.write(str(file) + "\n")
    else:
        print("> No files found!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
    )
    parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
    parser.add_argument("-o", "--output_dir", type=str, help="Output dataset dir", default="")
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Force overwriting existing files")
    parser.add_argument(
        "-g",
        "--glob",
        type=str,
        default="**/*.wav",
        help="Glob pattern, relative to input_dir, for selecting the wavs to process. Ex: wav48/*/*.wav",
    )
    parser.add_argument(
        "-t",
        "--trim_just_beginning_and_end",
        type=bool,
        default=True,
        help="If True, trim only the nonspeech parts at the beginning and the end. If False, trim all nonspeech parts. Defaults to True.",
    )
    parser.add_argument(
        "-c",
        "--use_cuda",
        type=bool,
        default=False,
        help="If True, use CUDA.",
    )
    parser.add_argument(
        "--use_onnx",
        type=bool,
        default=False,
        help="If True, use ONNX.",
    )
    parser.add_argument(
        "--num_processes",
        type=int,
        default=1,
        help="Number of processes to use.",
    )
    args = parser.parse_args()

    if args.output_dir == "":
        args.output_dir = args.input_dir

    # load the model and utils
    model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda, use_onnx=args.use_onnx)
    preprocess_audios()
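Two details above are easy to miss. The output path comes from a plain string replacement of the `input_dir` prefix (`os.path.join(x, "")` just appends the trailing separator), so the directory tree is mirrored under `output_dir`. And because argparse's `type=bool` calls `bool()` on the raw string, `--trim_just_beginning_and_end False` still parses as `True`, since any non-empty string is truthy. A quick sketch of both (the paths are hypothetical):

```python
import os

input_dir, output_dir = "VCTK-Corpus", "VCTK-Corpus-removed-silence"
audio_path = os.path.join(input_dir, "wav48", "p225", "p225_001.wav")

# swap the "VCTK-Corpus/" prefix for "VCTK-Corpus-removed-silence/"
output_path = audio_path.replace(os.path.join(input_dir, ""), os.path.join(output_dir, ""))
print(output_path)  # VCTK-Corpus-removed-silence/wav48/p225/p225_001.wav

# argparse type=bool pitfall: every non-empty string is truthy
print(bool("False"))  # True
```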
    	
        TTS/bin/resample.py
    ADDED
    
@@ -0,0 +1,90 @@
import argparse
import glob
import os
from argparse import RawTextHelpFormatter
from multiprocessing import Pool
from shutil import copytree

import librosa
import soundfile as sf
from tqdm import tqdm


def resample_file(func_args):
    filename, output_sr = func_args
    y, sr = librosa.load(filename, sr=output_sr)
    sf.write(filename, y, sr)


def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10):
    if output_dir:
        print("Recursively copying the input folder...")
        copytree(input_dir, output_dir)
        input_dir = output_dir

    print("Resampling the audio files...")
    audio_files = glob.glob(os.path.join(input_dir, f"**/*.{file_ext}"), recursive=True)
    print(f"Found {len(audio_files)} files...")
    audio_files = list(zip(audio_files, len(audio_files) * [output_sr]))
    with Pool(processes=n_jobs) as p:
        with tqdm(total=len(audio_files)) as pbar:
            for _ in p.imap_unordered(resample_file, audio_files):
                pbar.update()

    print("Done!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""Resample a folder recursively with librosa.
                       Can be used in place or to create a copy of the folder as an output.\n\n
                       Example run:
                            python TTS/bin/resample.py
                                --input_dir /root/LJSpeech-1.1/
                                --output_sr 22050
                                --output_dir /root/resampled_LJSpeech-1.1/
                                --file_ext wav
                                --n_jobs 24
                    """,
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--input_dir",
        type=str,
        default=None,
        required=True,
        help="Path of the folder containing the audio files to resample",
    )

    parser.add_argument(
        "--output_sr",
        type=int,
        default=22050,
        required=False,
        help="Sample rate to which the audio files should be resampled",
    )

    parser.add_argument(
        "--output_dir",
        type=str,
        default=None,
        required=False,
        help="Path of the destination folder. If not defined, the operation is done in place",
    )

    parser.add_argument(
        "--file_ext",
        type=str,
        default="wav",
        required=False,
        help="Extension of the audio files to resample",
    )

    parser.add_argument(
        "--n_jobs", type=int, default=None, help="Number of processes to use; by default all cores are used"
    )

    args = parser.parse_args()

    resample_files(args.input_dir, args.output_sr, args.output_dir, args.file_ext, args.n_jobs)
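The per-file worker leans on a librosa convenience: `librosa.load(path, sr=target)` resamples while decoding, so writing the returned array back with `soundfile` overwrites the file at the new rate. A minimal sketch of that round trip (assumes a local `in.wav` exists):

```python
import librosa
import soundfile as sf

# librosa resamples to the requested rate while loading
y, sr = librosa.load("in.wav", sr=22050)
assert sr == 22050

# write the audio back in place at the new sample rate
sf.write("in.wav", y, sr)
```

Note that `librosa.load` also downmixes to mono by default, so multi-channel inputs come out single-channel after this pass.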
    	
        TTS/bin/synthesize.py
    ADDED
    
@@ -0,0 +1,541 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import argparse
import contextlib
import sys
from argparse import RawTextHelpFormatter

# pylint: disable=redefined-outer-name, unused-argument
from pathlib import Path

description = """
Synthesize speech on the command line.

You can either use your trained model or choose a model from the provided list.

If you don't specify any models, the LJSpeech-based English model is used.

#### Single Speaker Models

- List provided models:

  ```
  $ tts --list_models
  ```

- Get model info (for both tts_models and vocoder_models):

  - Query by type/name:
    model_info_by_name uses the name as it appears in the --list_models output.
    ```
    $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
    ```
    For example:
    ```
    $ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
    $ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
    ```
  - Query by type/idx:
    The model_query_idx uses the corresponding idx from --list_models.

    ```
    $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
    ```

    For example:

    ```
    $ tts --model_info_by_idx tts_models/3
    ```

  - Query model info by full name:
    ```
    $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
    ```

- Run TTS with the default models:

  ```
  $ tts --text "Text for TTS" --out_path output/path/speech.wav
  ```

- Run TTS and pipe out the generated TTS wav file data:

  ```
  $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
  ```

- Run TTS and set the speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0:

  ```
  $ tts --text "Text for TTS" --model_name "coqui_studio/<language>/<dataset>/<model_name>" --speed 1.2 --out_path output/path/speech.wav
  ```

- Run a TTS model with its default vocoder model:

  ```
  $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
  ```

  For example:

  ```
  $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
  ```

- Run with specific TTS and vocoder models from the list:

  ```
  $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
  ```

  For example:

  ```
  $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
  ```

- Run your own TTS model (using the Griffin-Lim vocoder):

  ```
  $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
  ```

- Run your own TTS and vocoder models:

  ```
  $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
      --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
  ```

#### Multi-speaker Models

- List the available speakers and choose a <speaker_id> among them:

  ```
  $ tts --model_name "<language>/<dataset>/<model_name>"  --list_speaker_idxs
  ```

- Run the multi-speaker TTS model with the target speaker ID:

  ```
  $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>"  --speaker_idx <speaker_id>
  ```

- Run your own multi-speaker TTS model:

  ```
  $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
  ```

#### Voice Conversion Models

```
$ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
```
"""


def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "y", "1"):
        return True
    if v.lower() in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")

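# str2bool pairs with argparse's nargs="?" / const=True pattern used below, so a
# bare flag enables an option while an explicit value can still disable it:
#   tts --list_models        -> args.list_models is True   (const)
#   tts --list_models false  -> args.list_models is False  (str2bool("false"))
#   (flag omitted)           -> args.list_models is False  (default)
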
def main():
    parser = argparse.ArgumentParser(
        description=description.replace("    ```\n", ""),
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--list_models",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="list available pre-trained TTS and vocoder models.",
    )

    parser.add_argument(
        "--model_info_by_idx",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<model_query_idx>",
    )

    parser.add_argument(
        "--model_info_by_name",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
    )

    parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")

    # Args for running pre-trained TTS models.
    parser.add_argument(
        "--model_name",
        type=str,
        default="tts_models/en/ljspeech/tacotron2-DDC",
        help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
    )
    parser.add_argument(
        "--vocoder_name",
        type=str,
        default=None,
        help="Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>",
    )

    # Args for running custom models
    parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
    parser.add_argument(
        "--model_path",
        type=str,
        default=None,
        help="Path to model file.",
    )
    parser.add_argument(
        "--out_path",
        type=str,
        default="tts_output.wav",
        help="Output wav file path.",
    )
    parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
    parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
    parser.add_argument(
        "--vocoder_path",
        type=str,
        help="Path to vocoder model file. If it is not defined, the model uses GL as the vocoder. Please make sure you have installed the vocoder library (WaveRNN) beforehand.",
        default=None,
    )
    parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
    parser.add_argument(
        "--encoder_path",
        type=str,
        help="Path to speaker encoder model file.",
        default=None,
    )
    parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)

    # args for coqui studio
    parser.add_argument(
        "--cs_model",
        type=str,
        help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `V1`.",
    )
    parser.add_argument(
        "--emotion",
        type=str,
        help="Emotion to condition the model with. Only available for the 🐸Coqui Studio `V1` model.",
        default=None,
    )
    parser.add_argument(
        "--language",
        type=str,
        help="Language to condition the model with. Only available for the 🐸Coqui Studio `XTTS` model.",
        default=None,
    )
    parser.add_argument(
        "--pipe_out",
        help="stdout the generated TTS wav file for shell pipe.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    parser.add_argument(
        "--speed",
        type=float,
        help="Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0.",
        default=None,
    )

    # args for multi-speaker synthesis
    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
    parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
    parser.add_argument(
        "--speaker_idx",
        type=str,
        help="Target speaker ID for a multi-speaker TTS model.",
        default=None,
    )
    parser.add_argument(
        "--language_idx",
        type=str,
        help="Target language ID for a multi-lingual TTS model.",
        default=None,
    )
    parser.add_argument(
        "--speaker_wav",
        nargs="+",
        help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vector is computed as their average.",
        default=None,
    )
    parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
    parser.add_argument(
        "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
    )
    parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
    parser.add_argument(
        "--list_speaker_idxs",
        help="List available speaker ids for the defined multi-speaker model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    parser.add_argument(
        "--list_language_idxs",
        help="List available language ids for the defined multi-lingual model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    # aux args
    parser.add_argument(
        "--save_spectogram",
        type=bool,
        help="If true, save the raw spectrogram for further (vocoder) processing in out_path.",
        default=False,
    )
    parser.add_argument(
        "--reference_wav",
        type=str,
        help="Reference wav file to convert into the voice of the speaker_idx or speaker_wav",
        default=None,
    )
    parser.add_argument(
        "--reference_speaker_idx",
        type=str,
        help="Speaker ID of the reference_wav speaker (if not provided, the embedding is computed with the Speaker Encoder).",
        default=None,
    )
    parser.add_argument(
        "--progress_bar",
        type=str2bool,
        help="If true, shows a progress bar for the model download. Defaults to True.",
        default=True,
    )

    # voice conversion args
    parser.add_argument(
        "--source_wav",
        type=str,
        default=None,
        help="Original audio file to convert into the voice of the target_wav",
    )
    parser.add_argument(
        "--target_wav",
        type=str,
        default=None,
        help="Target audio file whose voice the source_wav is converted into",
    )

    parser.add_argument(
        "--voice_dir",
        type=str,
        default=None,
        help="Voice dir for the Tortoise model",
    )

    args = parser.parse_args()

    # print the help text if none of the action-triggering arguments are set
    check_args = [
        args.text,
        args.list_models,
        args.list_speaker_idxs,
        args.list_language_idxs,
        args.reference_wav,
        args.model_info_by_idx,
        args.model_info_by_name,
        args.source_wav,
        args.target_wav,
    ]
    if not any(check_args):
        parser.parse_args(["-h"])

    pipe_out = sys.stdout if args.pipe_out else None

    with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
        # Late-import to make things load faster
        from TTS.api import TTS
        from TTS.utils.manage import ModelManager
        from TTS.utils.synthesizer import Synthesizer

        # load model manager
        path = Path(__file__).parent / "../.models.json"
        manager = ModelManager(path, progress_bar=args.progress_bar)
        api = TTS()

        tts_path = None
        tts_config_path = None
        speakers_file_path = None
        language_ids_file_path = None
        vocoder_path = None
        vocoder_config_path = None
        encoder_path = None
        encoder_config_path = None
        vc_path = None
        vc_config_path = None
        model_dir = None

        # CASE1 #list : list pre-trained TTS models
        if args.list_models:
            manager.add_cs_api_models(api.list_models())
            manager.list_models()
            sys.exit()

        # CASE2 #info : model info for pre-trained TTS models
        if args.model_info_by_idx:
            model_query = args.model_info_by_idx
            manager.model_info_by_idx(model_query)
            sys.exit()

        if args.model_info_by_name:
            model_query_full_name = args.model_info_by_name
            manager.model_info_by_full_name(model_query_full_name)
            sys.exit()

        # CASE3: TTS with coqui studio models
        if "coqui_studio" in args.model_name:
            print(" > Using 🐸Coqui Studio model: ", args.model_name)
            api = TTS(model_name=args.model_name, cs_api_model=args.cs_model)
            api.tts_to_file(
                text=args.text,
                emotion=args.emotion,
                file_path=args.out_path,
                language=args.language,
                speed=args.speed,
                pipe_out=pipe_out,
            )
            print(" > Saving output to ", args.out_path)
            return

        # CASE4: load pre-trained model paths
        if args.model_name is not None and not args.model_path:
            model_path, config_path, model_item = manager.download_model(args.model_name)
            # tts model
            if model_item["model_type"] == "tts_models":
                tts_path = model_path
                tts_config_path = config_path
                if "default_vocoder" in model_item:
| 430 | 
            +
                                args.vocoder_name = (
         | 
| 431 | 
            +
                                    model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
         | 
| 432 | 
            +
                                )
         | 
| 433 | 
            +
             | 
| 434 | 
            +
                        # voice conversion model
         | 
| 435 | 
            +
                        if model_item["model_type"] == "voice_conversion_models":
         | 
| 436 | 
            +
                            vc_path = model_path
         | 
| 437 | 
            +
                            vc_config_path = config_path
         | 
| 438 | 
            +
             | 
| 439 | 
            +
                        # tts model with multiple files to be loaded from the directory path
         | 
| 440 | 
            +
                        if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
         | 
| 441 | 
            +
                            model_dir = model_path
         | 
| 442 | 
            +
                            tts_path = None
         | 
| 443 | 
            +
                            tts_config_path = None
         | 
| 444 | 
            +
                            args.vocoder_name = None
         | 
| 445 | 
            +
             | 
| 446 | 
            +
                    # load vocoder
         | 
| 447 | 
            +
                    if args.vocoder_name is not None and not args.vocoder_path:
         | 
| 448 | 
            +
                        vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
         | 
| 449 | 
            +
             | 
| 450 | 
            +
                    # CASE5: set custom model paths
         | 
| 451 | 
            +
                    if args.model_path is not None:
         | 
| 452 | 
            +
                        tts_path = args.model_path
         | 
| 453 | 
            +
                        tts_config_path = args.config_path
         | 
| 454 | 
            +
                        speakers_file_path = args.speakers_file_path
         | 
| 455 | 
            +
                        language_ids_file_path = args.language_ids_file_path
         | 
| 456 | 
            +
             | 
| 457 | 
            +
                    if args.vocoder_path is not None:
         | 
| 458 | 
            +
                        vocoder_path = args.vocoder_path
         | 
| 459 | 
            +
                        vocoder_config_path = args.vocoder_config_path
         | 
| 460 | 
            +
             | 
| 461 | 
            +
                    if args.encoder_path is not None:
         | 
| 462 | 
            +
                        encoder_path = args.encoder_path
         | 
| 463 | 
            +
                        encoder_config_path = args.encoder_config_path
         | 
| 464 | 
            +
             | 
| 465 | 
            +
                    device = args.device
         | 
| 466 | 
            +
                    if args.use_cuda:
         | 
| 467 | 
            +
                        device = "cuda"
         | 
| 468 | 
            +
             | 
| 469 | 
            +
                    # load models
         | 
| 470 | 
            +
                    synthesizer = Synthesizer(
         | 
| 471 | 
            +
                        tts_path,
         | 
| 472 | 
            +
                        tts_config_path,
         | 
| 473 | 
            +
                        speakers_file_path,
         | 
| 474 | 
            +
                        language_ids_file_path,
         | 
| 475 | 
            +
                        vocoder_path,
         | 
| 476 | 
            +
                        vocoder_config_path,
         | 
| 477 | 
            +
                        encoder_path,
         | 
| 478 | 
            +
                        encoder_config_path,
         | 
| 479 | 
            +
                        vc_path,
         | 
| 480 | 
            +
                        vc_config_path,
         | 
| 481 | 
            +
                        model_dir,
         | 
| 482 | 
            +
                        args.voice_dir,
         | 
| 483 | 
            +
                    ).to(device)
         | 
| 484 | 
            +
             | 
| 485 | 
            +
                    # query speaker ids of a multi-speaker model.
         | 
| 486 | 
            +
                    if args.list_speaker_idxs:
         | 
| 487 | 
            +
                        print(
         | 
| 488 | 
            +
                            " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
         | 
| 489 | 
            +
                        )
         | 
| 490 | 
            +
                        print(synthesizer.tts_model.speaker_manager.name_to_id)
         | 
| 491 | 
            +
                        return
         | 
| 492 | 
            +
             | 
| 493 | 
            +
                    # query langauge ids of a multi-lingual model.
         | 
| 494 | 
            +
                    if args.list_language_idxs:
         | 
| 495 | 
            +
                        print(
         | 
| 496 | 
            +
                            " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
         | 
| 497 | 
            +
                        )
         | 
| 498 | 
            +
                        print(synthesizer.tts_model.language_manager.name_to_id)
         | 
| 499 | 
            +
                        return
         | 
| 500 | 
            +
             | 
| 501 | 
            +
                    # check the arguments against a multi-speaker model.
         | 
| 502 | 
            +
                    if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
         | 
| 503 | 
            +
                        print(
         | 
| 504 | 
            +
                            " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
         | 
| 505 | 
            +
                            "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
         | 
| 506 | 
            +
                        )
         | 
| 507 | 
            +
                        return
         | 
| 508 | 
            +
             | 
| 509 | 
            +
                    # RUN THE SYNTHESIS
         | 
| 510 | 
            +
                    if args.text:
         | 
| 511 | 
            +
                        print(" > Text: {}".format(args.text))
         | 
| 512 | 
            +
             | 
| 513 | 
            +
                    # kick it
         | 
| 514 | 
            +
                    if tts_path is not None:
         | 
| 515 | 
            +
                        wav = synthesizer.tts(
         | 
| 516 | 
            +
                            args.text,
         | 
| 517 | 
            +
                            speaker_name=args.speaker_idx,
         | 
| 518 | 
            +
                            language_name=args.language_idx,
         | 
| 519 | 
            +
                            speaker_wav=args.speaker_wav,
         | 
| 520 | 
            +
                            reference_wav=args.reference_wav,
         | 
| 521 | 
            +
                            style_wav=args.capacitron_style_wav,
         | 
| 522 | 
            +
                            style_text=args.capacitron_style_text,
         | 
| 523 | 
            +
                            reference_speaker_name=args.reference_speaker_idx,
         | 
| 524 | 
            +
                        )
         | 
| 525 | 
            +
                    elif vc_path is not None:
         | 
| 526 | 
            +
                        wav = synthesizer.voice_conversion(
         | 
| 527 | 
            +
                            source_wav=args.source_wav,
         | 
| 528 | 
            +
                            target_wav=args.target_wav,
         | 
| 529 | 
            +
                        )
         | 
| 530 | 
            +
                    elif model_dir is not None:
         | 
| 531 | 
            +
                        wav = synthesizer.tts(
         | 
| 532 | 
            +
                            args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
         | 
| 533 | 
            +
                        )
         | 
| 534 | 
            +
             | 
| 535 | 
            +
                    # save the results
         | 
| 536 | 
            +
                    print(" > Saving output to {}".format(args.out_path))
         | 
| 537 | 
            +
                    synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out)
         | 
| 538 | 
            +
             | 
| 539 | 
            +
             | 
| 540 | 
            +
            if __name__ == "__main__":
         | 
| 541 | 
            +
                main()
         | 
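This entry point is what the single-sample voice-clone flow drives: a reference clip passed via `--speaker_wav` is forwarded to `synthesizer.tts(...)` above. A minimal sketch of an invocation follows; the model name, clip, and output path are illustrative placeholders, not values taken from this diff.

# Hypothetical invocation of the CLI above; model name and file paths
# are placeholders, not values from this repository.
import subprocess

subprocess.run(
    [
        "python", "TTS/bin/synthesize.py",
        "--model_name", "tts_models/multilingual/multi-dataset/xtts_v1",
        "--text", "Hello world.",
        "--speaker_wav", "reference.wav",  # single audio sample to clone
        "--language_idx", "en",
        "--out_path", "output.wav",
    ],
    check=True,
)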
    	
TTS/bin/train_encoder.py
ADDED
@@ -0,0 +1,319 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import sys
import time
import traceback

import torch
from torch.utils.data import DataLoader
from trainer.torch import NoamLR
from trainer.trainer_utils import get_optimizer

from TTS.encoder.dataset import EncoderDataset
from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model
from TTS.encoder.utils.training import init_training
from TTS.encoder.utils.visual import plot_embeddings
from TTS.tts.datasets import load_tts_samples
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
from TTS.utils.io import copy_model_files
from TTS.utils.samplers import PerfectBatchSampler
from TTS.utils.training import check_update

torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True
torch.manual_seed(54321)
use_cuda = torch.cuda.is_available()
num_gpus = torch.cuda.device_count()
print(" > Using CUDA: ", use_cuda)
print(" > Number of GPUs: ", num_gpus)


def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False):
    num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class
    num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch

    dataset = EncoderDataset(
        c,
        ap,
        meta_data_eval if is_val else meta_data_train,
        voice_len=c.voice_len,
        num_utter_per_class=num_utter_per_class,
        num_classes_in_batch=num_classes_in_batch,
        verbose=verbose,
        augmentation_config=c.audio_augmentation if not is_val else None,
        use_torch_spec=c.model_params.get("use_torch_spec", False),
    )
    # get the list of classes
    classes = dataset.get_class_list()

    sampler = PerfectBatchSampler(
        dataset.items,
        classes,
        batch_size=num_classes_in_batch * num_utter_per_class,  # total batch size
        num_classes_in_batch=num_classes_in_batch,
        num_gpus=1,
        shuffle=not is_val,
        drop_last=True,
    )

    if len(classes) < num_classes_in_batch:
        if is_val:
            raise RuntimeError(
                f"config.eval_num_classes_in_batch ({num_classes_in_batch}) must be <= {len(classes)} (the total number of classes in the eval dataset)!"
            )
        raise RuntimeError(
            f"config.num_classes_in_batch ({num_classes_in_batch}) must be <= {len(classes)} (the total number of classes in the train dataset)!"
        )

    # set the classes to avoid getting a wrong class_id when the numbers of training and eval classes differ
    if is_val:
        dataset.set_classes(train_classes)

    loader = DataLoader(
        dataset,
        num_workers=c.num_loader_workers,
        batch_sampler=sampler,
        collate_fn=dataset.collate_fn,
    )

    return loader, classes, dataset.get_map_classid_to_classname()


def evaluation(model, criterion, data_loader, global_step):
    eval_loss = 0
    for _, data in enumerate(data_loader):
        with torch.no_grad():
            # setup input data
            inputs, labels = data

            # group the samples of each class in the batch. the perfect sampler produces [3,2,1,3,2,1]; we need [3,3,2,2,1,1]
            labels = torch.transpose(
                labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1
            ).reshape(labels.shape)
            inputs = torch.transpose(
                inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1
            ).reshape(inputs.shape)

            # dispatch data to GPU
            if use_cuda:
                inputs = inputs.cuda(non_blocking=True)
                labels = labels.cuda(non_blocking=True)

            # forward pass model
            outputs = model(inputs)

            # loss computation
            loss = criterion(
                outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels
            )

            eval_loss += loss.item()

    eval_avg_loss = eval_loss / len(data_loader)
    # save stats
    dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
    # plot the last batch of the evaluation
    figures = {
        "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
    }
    dashboard_logger.eval_figures(global_step, figures)
    return eval_avg_loss


def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
    model.train()
    best_loss = float("inf")
    avg_loader_time = 0
    end_time = time.time()
    for epoch in range(c.epochs):
        tot_loss = 0
        epoch_time = 0
        for _, data in enumerate(data_loader):
            start_time = time.time()

            # setup input data
            inputs, labels = data
            # group the samples of each class in the batch. the perfect sampler produces [3,2,1,3,2,1]; we need [3,3,2,2,1,1]
            labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(
                labels.shape
            )
            inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(
                inputs.shape
            )
            # ToDo: move it to a unit test
            # labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
            # inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
            # idx = 0
            # for j in range(0, c.num_classes_in_batch, 1):
            #     for i in range(j, len(labels), c.num_classes_in_batch):
            #         if not torch.all(labels[i].eq(labels_converted[idx])) or not torch.all(inputs[i].eq(inputs_converted[idx])):
            #             print("Invalid")
            #             print(labels)
            #             exit()
            #         idx += 1
            # labels = labels_converted
            # inputs = inputs_converted

            loader_time = time.time() - end_time
            global_step += 1

            # setup lr
            if c.lr_decay:
                scheduler.step()
            optimizer.zero_grad()

            # dispatch data to GPU
            if use_cuda:
                inputs = inputs.cuda(non_blocking=True)
                labels = labels.cuda(non_blocking=True)

            # forward pass model
            outputs = model(inputs)

            # loss computation
            loss = criterion(
                outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels
            )
            loss.backward()
            grad_norm, _ = check_update(model, c.grad_clip)
            optimizer.step()

            step_time = time.time() - start_time
            epoch_time += step_time

            # accumulate the total epoch loss
            tot_loss += loss.item()

            # averaged loader time (running average weighted by the number of loader workers)
            num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1
            avg_loader_time = (
                1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time
                if avg_loader_time != 0
                else loader_time
            )
            current_lr = optimizer.param_groups[0]["lr"]

            if global_step % c.steps_plot_stats == 0:
                # plot training epoch stats
                train_stats = {
                    "loss": loss.item(),
                    "lr": current_lr,
                    "grad_norm": grad_norm,
                    "step_time": step_time,
                    "avg_loader_time": avg_loader_time,
                }
                dashboard_logger.train_epoch_stats(global_step, train_stats)
                figures = {
                    "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
                }
                dashboard_logger.train_figures(global_step, figures)

            if global_step % c.print_step == 0:
                print(
                    "   | > Step:{}  Loss:{:.5f}  GradNorm:{:.5f}  "
                    "StepTime:{:.2f}  LoaderTime:{:.2f}  AvGLoaderTime:{:.2f}  LR:{:.6f}".format(
                        global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr
                    ),
                    flush=True,
                )

            if global_step % c.save_step == 0:
                # save the model
                save_checkpoint(model, optimizer, criterion, loss.item(), OUT_PATH, global_step, epoch)

            end_time = time.time()

        print("")
        print(
            ">>> Epoch:{}  AvgLoss: {:.5f} GradNorm:{:.5f}  "
            "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format(
                epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time
            ),
            flush=True,
        )
        # evaluation
        if c.run_eval:
            model.eval()
            eval_loss = evaluation(model, criterion, eval_data_loader, global_step)
            print("\n\n")
            print("--> EVAL PERFORMANCE")
            print(
                "   | > Epoch:{}  AvgLoss: {:.5f} ".format(epoch, eval_loss),
                flush=True,
            )
            # save the best checkpoint
            best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch)
            model.train()

    return best_loss, global_step


def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train
    global meta_data_eval
    global train_classes

    ap = AudioProcessor(**c.audio)
    model = setup_encoder_model(c)

    optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model)

    # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)

    train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True)
    if c.run_eval:
        eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True)
    else:
        eval_data_loader = None

    num_classes = len(train_classes)
    criterion = model.get_criterion(c, num_classes)

    if c.loss == "softmaxproto" and c.model != "speaker_encoder":
        c.map_classid_to_classname = map_classid_to_classname
        copy_model_files(c, OUT_PATH)

    if args.restore_path:
        criterion, args.restore_step = model.load_checkpoint(
            c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
        )
        print(" > Model restored from step %d" % args.restore_step, flush=True)
    else:
        args.restore_step = 0

    if c.lr_decay:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if use_cuda:
        model = model.cuda()
        criterion.cuda()

    global_step = args.restore_step
    _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step)


if __name__ == "__main__":
    args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()

    try:
        main(args)
    except KeyboardInterrupt:
        remove_experiment_folder(OUT_PATH)
        try:
            sys.exit(0)
        except SystemExit:
            os._exit(0)  # pylint: disable=protected-access
    except Exception:  # pylint: disable=broad-except
        remove_experiment_folder(OUT_PATH)
        traceback.print_exc()
        sys.exit(1)
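The transpose/reshape regrouping used in both `train()` and `evaluation()` above is easy to verify on a toy tensor. A self-contained check follows; the 3-class, 2-utterance shape is an illustrative assumption:

# Toy check of the batch regrouping used above: PerfectBatchSampler yields
# utterances interleaved by class ([3,2,1,3,2,1]); the loss expects them
# grouped per class ([3,3,2,2,1,1]).
import torch

num_utter_per_class, num_classes_in_batch = 2, 3
labels = torch.tensor([3, 2, 1, 3, 2, 1])  # interleaved, as the sampler emits
grouped = torch.transpose(
    labels.view(num_utter_per_class, num_classes_in_batch), 0, 1
).reshape(labels.shape)
print(grouped.tolist())  # [3, 3, 2, 2, 1, 1]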
    	
TTS/bin/train_tts.py
ADDED
@@ -0,0 +1,71 @@
import os
from dataclasses import dataclass, field

from trainer import Trainer, TrainerArgs

from TTS.config import load_config, register_config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models import setup_model


@dataclass
class TrainTTSArgs(TrainerArgs):
    config_path: str = field(default=None, metadata={"help": "Path to the config file."})


def main():
    """Run `tts` model training directly from a `config.json` file."""
    # init trainer args
    train_args = TrainTTSArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # override trainer args from command-line args
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # load config.json and register
    if args.config_path or args.continue_path:
        if args.config_path:
            # init from a file
            config = load_config(args.config_path)
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
        elif args.continue_path:
            # continue from a previous experiment
            config = load_config(os.path.join(args.continue_path, "config.json"))
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
        else:
            # init from console args
            from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

            config_base = BaseTrainingConfig()
            config_base.parse_known_args(config_overrides)
            config = register_config(config_base.model)()

    # load training samples
    train_samples, eval_samples = load_tts_samples(
        config.datasets,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    # init the model from config
    model = setup_model(config, train_samples + eval_samples)

    # init the trainer and 🚀
    trainer = Trainer(
        train_args,
        model.config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        parse_command_line_args=False,
    )
    trainer.fit()


if __name__ == "__main__":
    main()
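A minimal sketch of launching this entry point, assuming a prepared config file; "config.json" is a placeholder path. Flags not consumed by `TrainTTSArgs` fall through `parse_known_args` and are applied to the loaded config as relaxed overrides:

# Hypothetical launch of the trainer above; "config.json" is a placeholder.
import subprocess

subprocess.run(
    ["python", "TTS/bin/train_tts.py", "--config_path", "config.json"],
    check=True,
)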
    	
TTS/bin/train_vocoder.py
ADDED
@@ -0,0 +1,77 @@
import os
from dataclasses import dataclass, field

from trainer import Trainer, TrainerArgs

from TTS.config import load_config, register_config
from TTS.utils.audio import AudioProcessor
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
from TTS.vocoder.models import setup_model


@dataclass
class TrainVocoderArgs(TrainerArgs):
    config_path: str = field(default=None, metadata={"help": "Path to the config file."})


def main():
    """Run vocoder model training directly from a `config.json` file."""
    # init trainer args
    train_args = TrainVocoderArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # override trainer args from command-line args
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # load config.json and register
    if args.config_path or args.continue_path:
        if args.config_path:
            # init from a file
            config = load_config(args.config_path)
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
        elif args.continue_path:
            # continue from a previous experiment
            config = load_config(os.path.join(args.continue_path, "config.json"))
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
        else:
            # init from console args
            from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

            config_base = BaseTrainingConfig()
            config_base.parse_known_args(config_overrides)
            config = register_config(config_base.model)()

    # load training samples
    if "feature_path" in config and config.feature_path:
        # load pre-computed features
        print(f" > Loading features from: {config.feature_path}")
        eval_samples, train_samples = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size)
    else:
        # load raw wav files
        eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)

    # setup audio processor
    ap = AudioProcessor(**config.audio)

    # init the model from config
    model = setup_model(config)

    # init the trainer and 🚀
    trainer = Trainer(
        train_args,
        config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": ap},
        parse_command_line_args=False,
    )
    trainer.fit()


if __name__ == "__main__":
    main()
    	
TTS/bin/tune_wavegrad.py
ADDED
@@ -0,0 +1,103 @@
"""Search a good noise schedule for WaveGrad for a given number of inference iterations"""
import argparse
from itertools import product as cartesian_product

import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from TTS.config import load_config
from TTS.utils.audio import AudioProcessor
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
from TTS.vocoder.models import setup_model

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, help="Path to model checkpoint.")
    parser.add_argument("--config_path", type=str, help="Path to model config file.")
    parser.add_argument("--data_path", type=str, help="Path to data directory.")
    parser.add_argument("--output_path", type=str, help="path for output file including file name and extension.")
    parser.add_argument(
        "--num_iter",
        type=int,
        help="Number of model inference iterations that you like to optimize noise schedule for.",
    )
    parser.add_argument("--use_cuda", action="store_true", help="enable CUDA.")
    parser.add_argument("--num_samples", type=int, default=1, help="Number of datasamples used for inference.")
    parser.add_argument(
        "--search_depth",
        type=int,
        default=3,
        help="Search granularity. Increasing this increases the run-time exponentially.",
    )

    # load config
    args = parser.parse_args()
    config = load_config(args.config_path)

    # setup audio processor
    ap = AudioProcessor(**config.audio)

    # load dataset
    _, train_data = load_wav_data(args.data_path, 0)
    train_data = train_data[: args.num_samples]
    dataset = WaveGradDataset(
        ap=ap,
        items=train_data,
        seq_len=-1,
        hop_len=ap.hop_length,
        pad_short=config.pad_short,
        conv_pad=config.conv_pad,
        is_training=True,
        return_segments=False,
        use_noise_augment=False,
        use_cache=False,
        verbose=True,
    )
    loader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=False,
        collate_fn=dataset.collate_full_clips,
        drop_last=False,
        num_workers=config.num_loader_workers,
        pin_memory=False,
    )

    # setup the model
    model = setup_model(config)
    if args.use_cuda:
        model.cuda()

    # setup optimization parameters
    base_values = sorted(10 * np.random.uniform(size=args.search_depth))
    print(f" > base values: {base_values}")
    exponents = 10 ** np.linspace(-6, -1, num=args.num_iter)
    best_error = float("inf")
    best_schedule = None  # pylint: disable=C0103
    total_search_iter = len(base_values) ** args.num_iter
    for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter):
        beta = exponents * base
        model.compute_noise_level(beta)
        for data in loader:
            mel, audio = data
            y_hat = model.inference(mel.cuda() if args.use_cuda else mel)

            if args.use_cuda:
                y_hat = y_hat.cpu()
            y_hat = y_hat.numpy()

            mel_hat = []
            for i in range(y_hat.shape[0]):
                m = ap.melspectrogram(y_hat[i, 0])[:, :-1]
                mel_hat.append(torch.from_numpy(m))

            mel_hat = torch.stack(mel_hat)
            mse = torch.sum((mel - mel_hat) ** 2).mean()
            if mse.item() < best_error:
                best_error = mse.item()
                best_schedule = {"beta": beta}
                print(f" > Found a better schedule. - MSE: {mse.item()}")
                np.save(args.output_path, best_schedule)
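
The script above brute-forces a WaveGrad noise schedule: it draws `search_depth` random base values, scales them by log-spaced exponents to form a candidate `beta` schedule, and scores each candidate by the MSE between the ground-truth mel spectrogram and the one recomputed from the re-synthesized audio, saving the best schedule found so far via `np.save`. A rough invocation sketch (all paths are placeholders):

    python TTS/bin/tune_wavegrad.py --model_path checkpoint.pth --config_path config.json --data_path wavs/ --output_path noise_schedule.npy --num_iter 6 --search_depth 3 --use_cuda

Since every combination of base values is tried across the iteration slots, the search evaluates search_depth ** num_iter candidate schedules (3 ** 6 = 729 in this sketch), which is why the `--search_depth` help string warns about exponential run-time.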
    	
        TTS/config/__init__.py
    ADDED
    
@@ -0,0 +1,138 @@
import json
import os
import re
from typing import Dict

import fsspec
import yaml
from coqpit import Coqpit

from TTS.config.shared_configs import *
from TTS.utils.generic_utils import find_module


def read_json_with_comments(json_path):
    """for backward compat."""
    # fallback to json
    with fsspec.open(json_path, "r", encoding="utf-8") as f:
        input_str = f.read()
    # handle comments
    input_str = re.sub(r"\\\n", "", input_str)
    input_str = re.sub(r"//.*\n", "\n", input_str)
    data = json.loads(input_str)
    return data


def register_config(model_name: str) -> Coqpit:
    """Find the right config for the given model name.

    Args:
        model_name (str): Model name.

    Raises:
        ModuleNotFoundError: No matching config for the model name.

    Returns:
        Coqpit: config class.
    """
    config_class = None
    config_name = model_name + "_config"

    # TODO: fix this
    if model_name == "xtts":
        from TTS.tts.configs.xtts_config import XttsConfig

        config_class = XttsConfig
    paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs", "TTS.vc.configs"]
    for path in paths:
        try:
            config_class = find_module(path, config_name)
        except ModuleNotFoundError:
            pass
    if config_class is None:
        raise ModuleNotFoundError(f" [!] Config for {model_name} cannot be found.")
    return config_class


def _process_model_name(config_dict: Dict) -> str:
    """Format the model name as expected. It is a band-aid for the old `vocoder` model names.

    Args:
        config_dict (Dict): A dictionary including the config fields.

    Returns:
        str: Formatted model name.
    """
    model_name = config_dict["model"] if "model" in config_dict else config_dict["generator_model"]
    model_name = model_name.replace("_generator", "").replace("_discriminator", "")
    return model_name


def load_config(config_path: str) -> Coqpit:
    """Import `json` or `yaml` files as TTS configs. First, load the input file as a `dict` and check the model name
    to find the corresponding Config class. Then initialize the Config.

    Args:
        config_path (str): path to the config file.

    Raises:
        TypeError: given config file has an unknown type.

    Returns:
        Coqpit: TTS config object.
    """
    config_dict = {}
    ext = os.path.splitext(config_path)[1]
    if ext in (".yml", ".yaml"):
        with fsspec.open(config_path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
    elif ext == ".json":
        try:
            with fsspec.open(config_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except json.decoder.JSONDecodeError:
            # backwards compat.
            data = read_json_with_comments(config_path)
    else:
        raise TypeError(f" [!] Unknown config file type {ext}")
    config_dict.update(data)
    model_name = _process_model_name(config_dict)
    config_class = register_config(model_name.lower())
    config = config_class()
    config.from_dict(config_dict)
    return config
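
A minimal usage sketch of `load_config`; the path below is hypothetical, and any 🐸TTS JSON/YAML config carrying a `model` (or legacy `generator_model`) field works:

    from TTS.config import load_config

    # hypothetical path; load_config reads the "model" field (e.g. "vits")
    # and resolves the matching *_config class through register_config()
    config = load_config("/path/to/config.json")
    print(type(config).__name__, config.model)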
def check_config_and_model_args(config, arg_name, value):
    """Check the given argument in `config.model_args`, if it exists, otherwise in `config`, for
    the given value.

    Return False if the argument does not exist in `config.model_args` or `config`.
    This is to patch up the compatibility between models with and without `model_args`.

    TODO: Remove this in the future with a unified approach.
    """
    if hasattr(config, "model_args"):
        if arg_name in config.model_args:
            return config.model_args[arg_name] == value
    if hasattr(config, arg_name):
        return config[arg_name] == value
    return False


def get_from_config_or_model_args(config, arg_name):
    """Get the given argument from `config.model_args` if it exists, otherwise from `config`."""
    if hasattr(config, "model_args"):
        if arg_name in config.model_args:
            return config.model_args[arg_name]
    return config[arg_name]


def get_from_config_or_model_args_with_default(config, arg_name, def_val):
    """Get the given argument from `config.model_args` if it exists, otherwise from `config`, falling back to `def_val`."""
    if hasattr(config, "model_args"):
        if arg_name in config.model_args:
            return config.model_args[arg_name]
    if hasattr(config, arg_name):
        return config[arg_name]
    return def_val
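
A quick sketch of how these compatibility helpers resolve values; `DummyConfig` is purely hypothetical and only illustrates the `model_args`-first lookup order:

    from dataclasses import dataclass, field

    from coqpit import Coqpit

    from TTS.config import check_config_and_model_args, get_from_config_or_model_args_with_default


    @dataclass
    class DummyConfig(Coqpit):
        # hypothetical config: one field nested under model_args, one flat field
        model_args: dict = field(default_factory=lambda: {"use_speaker_embedding": True})
        num_speakers: int = 0


    cfg = DummyConfig()
    assert check_config_and_model_args(cfg, "use_speaker_embedding", True)  # resolved from model_args first
    assert get_from_config_or_model_args_with_default(cfg, "num_speakers", 1) == 0  # flat attribute beats the default
    assert get_from_config_or_model_args_with_default(cfg, "missing_key", "fallback") == "fallback"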
    	
        TTS/config/__pycache__/__init__.cpython-39.pyc
    ADDED
    
Binary file (4.08 kB).
    	
        TTS/config/__pycache__/shared_configs.cpython-39.pyc
    ADDED
    
Binary file (9.52 kB).
    	
        TTS/config/shared_configs.py
    ADDED
    
@@ -0,0 +1,268 @@
from dataclasses import asdict, dataclass
from typing import List

from coqpit import Coqpit, check_argument
from trainer import TrainerConfig


@dataclass
class BaseAudioConfig(Coqpit):
    """Base config to define audio processing parameters. It is used to initialize
    ```TTS.utils.audio.AudioProcessor.```

    Args:
        fft_size (int):
            Number of STFT frequency levels, aka the size of the linear spectrogram frame. Defaults to 1024.

        win_length (int):
            Each frame of audio is windowed by a window of length ```win_length``` and then padded with zeros to match
            ```fft_size```. Defaults to 1024.

        hop_length (int):
            Number of audio samples between adjacent STFT columns. Defaults to 256.

        frame_shift_ms (int):
            Set ```hop_length``` based on milliseconds and sampling rate.

        frame_length_ms (int):
            Set ```win_length``` based on milliseconds and sampling rate.

        stft_pad_mode (str):
            Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.

        sample_rate (int):
            Audio sampling rate. Defaults to 22050.

        resample (bool):
            Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.

        preemphasis (float):
            Preemphasis coefficient. Defaults to 0.0.

        ref_level_db (int):
            Reference dB level to rebase the audio signal and ignore the level below it. 20 dB is assumed to be the
            sound of air. Defaults to 20.

        do_sound_norm (bool):
            Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.

        log_func (str):
            Numpy log function used for amplitude to dB conversion. Defaults to 'np.log10'.

        do_trim_silence (bool):
            Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.

        do_amp_to_db_linear (bool, optional):
            enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.

        do_amp_to_db_mel (bool, optional):
            enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.

        pitch_fmax (float, optional):
            Maximum frequency of the F0 frames. Defaults to ```640```.

        pitch_fmin (float, optional):
            Minimum frequency of the F0 frames. Defaults to ```1```.

        trim_db (int):
            Silence threshold used for silence trimming. Defaults to 45.

        do_rms_norm (bool, optional):
            enable/disable RMS volume normalization when loading an audio file. Defaults to False.

        db_level (int, optional):
            dB level used for RMS normalization. The range is -99 to 0. Defaults to None.

        power (float):
            Exponent used for expanding spectrogram levels before running Griffin-Lim. It helps to reduce the
            artifacts in the synthesized voice. Defaults to 1.5.

        griffin_lim_iters (int):
            Number of Griffin-Lim iterations. Defaults to 60.

        num_mels (int):
            Number of mel-basis filters that defines the depth of each mel-spectrogram frame. Defaults to 80.

        mel_fmin (float): Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
            It needs to be adjusted for a dataset. Defaults to 0.

        mel_fmax (float):
            Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.

        spec_gain (int):
            Gain applied when converting amplitude to dB. Defaults to 20.

        signal_norm (bool):
            enable/disable signal normalization. Defaults to True.

        min_level_db (int):
            minimum dB threshold for the computed mel-spectrograms. Defaults to -100.

        symmetric_norm (bool):
            enable/disable symmetric normalization. If set True, normalization is performed in the range [-k, k] else
            [0, k]. Defaults to True.

        max_norm (float):
            ```k``` defining the normalization range. Defaults to 4.0.

        clip_norm (bool):
            enable/disable clipping of the out-of-range values in the normalized audio signal. Defaults to True.

        stats_path (str):
            Path to the computed stats file. Defaults to None.
    """

    # stft parameters
    fft_size: int = 1024
    win_length: int = 1024
    hop_length: int = 256
    frame_shift_ms: int = None
    frame_length_ms: int = None
    stft_pad_mode: str = "reflect"
    # audio processing parameters
    sample_rate: int = 22050
    resample: bool = False
    preemphasis: float = 0.0
    ref_level_db: int = 20
    do_sound_norm: bool = False
    log_func: str = "np.log10"
    # silence trimming
    do_trim_silence: bool = True
    trim_db: int = 45
    # rms volume normalization
    do_rms_norm: bool = False
    db_level: float = None
    # griffin-lim params
    power: float = 1.5
    griffin_lim_iters: int = 60
    # mel-spec params
    num_mels: int = 80
    mel_fmin: float = 0.0
    mel_fmax: float = None
    spec_gain: int = 20
    do_amp_to_db_linear: bool = True
    do_amp_to_db_mel: bool = True
    # f0 params
    pitch_fmax: float = 640.0
    pitch_fmin: float = 1.0
    # normalization params
    signal_norm: bool = True
    min_level_db: int = -100
    symmetric_norm: bool = True
    max_norm: float = 4.0
    clip_norm: bool = True
    stats_path: str = None

    def check_values(
        self,
    ):
        """Check config fields"""
        c = asdict(self)
        check_argument("num_mels", c, restricted=True, min_val=10, max_val=2056)
        check_argument("fft_size", c, restricted=True, min_val=128, max_val=4058)
        check_argument("sample_rate", c, restricted=True, min_val=512, max_val=100000)
        check_argument(
            "frame_length_ms",
            c,
            restricted=True,
            min_val=10,
            max_val=1000,
            alternative="win_length",
        )
        check_argument("frame_shift_ms", c, restricted=True, min_val=1, max_val=1000, alternative="hop_length")
        check_argument("preemphasis", c, restricted=True, min_val=0, max_val=1)
        check_argument("min_level_db", c, restricted=True, min_val=-1000, max_val=10)
        check_argument("ref_level_db", c, restricted=True, min_val=0, max_val=1000)
        check_argument("power", c, restricted=True, min_val=1, max_val=5)
        check_argument("griffin_lim_iters", c, restricted=True, min_val=10, max_val=1000)

        # normalization parameters
        check_argument("signal_norm", c, restricted=True)
        check_argument("symmetric_norm", c, restricted=True)
        check_argument("max_norm", c, restricted=True, min_val=0.1, max_val=1000)
        check_argument("clip_norm", c, restricted=True)
        check_argument("mel_fmin", c, restricted=True, min_val=0.0, max_val=1000)
        check_argument("mel_fmax", c, restricted=True, min_val=500.0, allow_none=True)
        check_argument("spec_gain", c, restricted=True, min_val=1, max_val=100)
        check_argument("do_trim_silence", c, restricted=True)
        check_argument("trim_db", c, restricted=True)
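
A minimal sketch of the audio config in use; the overridden values are arbitrary examples:

    from TTS.config.shared_configs import BaseAudioConfig

    # override a few defaults; unspecified fields keep the values listed above
    audio_config = BaseAudioConfig(sample_rate=16000, trim_db=60, mel_fmax=8000.0)
    audio_config.check_values()  # raises if a field falls outside its allowed range
    print(audio_config.hop_length)  # 256, since frame_shift_ms was left unset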
@dataclass
class BaseDatasetConfig(Coqpit):
    """Base config for TTS datasets.

    Args:
        formatter (str):
            Formatter name that defines the formatter used in ```TTS.tts.datasets.formatter```. Defaults to `""`.

        dataset_name (str):
            Unique name for the dataset. Defaults to `""`.

        path (str):
            Root path to the dataset files. Defaults to `""`.

        meta_file_train (str):
            Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
            Defaults to `""`.

        ignored_speakers (List):
            List of speaker IDs that are not used in training. Defaults to None.

        language (str):
            Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.

        phonemizer (str):
            Phonemizer used for that dataset's language. By default it uses `DEF_LANG_TO_PHONEMIZER`. Defaults to `""`.

        meta_file_val (str):
            Name of the dataset meta file that defines the instances used at validation.

        meta_file_attn_mask (str):
            Path to the file that lists the attention mask files used with models that require attention masks to
            train the duration predictor.
    """

    formatter: str = ""
    dataset_name: str = ""
    path: str = ""
    meta_file_train: str = ""
    ignored_speakers: List[str] = None
    language: str = ""
    phonemizer: str = ""
    meta_file_val: str = ""
    meta_file_attn_mask: str = ""

    def check_values(
        self,
    ):
        """Check config fields"""
        c = asdict(self)
        check_argument("formatter", c, restricted=True)
        check_argument("path", c, restricted=True)
        check_argument("meta_file_train", c, restricted=True)
        check_argument("meta_file_val", c, restricted=False)
        check_argument("meta_file_attn_mask", c, restricted=False)


@dataclass
class BaseTrainingConfig(TrainerConfig):
    """Base config to define the basic 🐸TTS training parameters that are shared
    among all the models. It is based on ```Trainer.TrainingConfig```.

    Args:
        model (str):
            Name of the model that is used in the training.

        num_loader_workers (int):
            Number of workers for the training-time dataloader.

        num_eval_loader_workers (int):
            Number of workers for the evaluation-time dataloader.
    """

    model: str = None
    # dataloading
    num_loader_workers: int = 0
    num_eval_loader_workers: int = 0
    use_noise_augment: bool = False
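
And a matching sketch for the dataset config; the formatter name and paths are illustrative (an LJSpeech-style layout is assumed):

    from TTS.config.shared_configs import BaseDatasetConfig

    dataset_config = BaseDatasetConfig(
        formatter="ljspeech",
        meta_file_train="metadata.csv",
        path="/data/LJSpeech-1.1/",  # placeholder path
        language="en",
    )
    dataset_config.check_values()  # validates that formatter, path and meta_file_train are defined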
    	
        TTS/cs_api.py
    ADDED
    
@@ -0,0 +1,317 @@
import http.client
import json
import os
import tempfile
import urllib.request
from typing import Tuple

import numpy as np
import requests
from scipy.io import wavfile

from TTS.utils.audio.numpy_transforms import save_wav


class Speaker(object):
    """Convert dict to object."""

    def __init__(self, d, is_voice=False):
        self.is_voice = is_voice
        # recursively wrap nested dicts (and lists of dicts) in Speaker objects
        for k, v in d.items():
            if isinstance(v, (list, tuple)):
                setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v])
            else:
                setattr(self, k, Speaker(v) if isinstance(v, dict) else v)

    def __repr__(self):
        return str(self.__dict__)

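A small sketch of the dict-to-object wrapper above; the payload fields are hypothetical, as real Coqui Studio responses define their own keys:

    payload = {"id": "abc123", "name": "Ada", "prompt": {"language": "en"}}
    speaker = Speaker(payload)
    print(speaker.name)             # "Ada"
    print(speaker.prompt.language)  # nested dicts become nested Speaker objects
    print(speaker.is_voice)         # False unless constructed with is_voice=True
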
| 30 | 
            +
            class CS_API:
         | 
| 31 | 
            +
                """🐸Coqui Studio API Wrapper.
         | 
| 32 | 
            +
             | 
| 33 | 
            +
                🐸Coqui Studio is the most advanced voice generation platform. You can generate new voices by voice cloning, voice
         | 
| 34 | 
            +
                interpolation, or our unique prompt to voice technology. It also provides a set of built-in voices with different
         | 
| 35 | 
            +
                characteristics. You can use these voices to generate new audio files or use them in your applications.
         | 
| 36 | 
            +
                You can use all the built-in and your own 🐸Coqui Studio speakers with this API with an API token.
         | 
| 37 | 
            +
                You can signup to 🐸Coqui Studio from https://app.coqui.ai/auth/signup and get an API token from
         | 
| 38 | 
            +
                https://app.coqui.ai/account. We can either enter the token as an environment variable as
         | 
| 39 | 
            +
                `export COQUI_STUDIO_TOKEN=<token>` or pass it as `CS_API(api_token=<toke>)`.
         | 
| 40 | 
            +
                Visit https://app.coqui.ai/api for more information.
         | 
| 41 | 
            +
             | 
| 42 | 
            +
             | 
| 43 | 
            +
                Args:
         | 
| 44 | 
            +
                    api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable
         | 
| 45 | 
            +
                        `COQUI_STUDIO_TOKEN`.
         | 
| 46 | 
            +
                    model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`. Default is `XTTS`.
         | 
| 47 | 
            +
             | 
| 48 | 
            +
             | 
| 49 | 
            +
                Example listing all available speakers:
         | 
| 50 | 
            +
                    >>> from TTS.api import CS_API
         | 
| 51 | 
            +
                    >>> tts = CS_API()
         | 
| 52 | 
            +
                    >>> tts.speakers
         | 
| 53 | 
            +
             | 
| 54 | 
            +
                Example listing all emotions:
         | 
| 55 | 
            +
                    >>> # emotions are only available for `V1` model
         | 
| 56 | 
            +
                    >>> from TTS.api import CS_API
         | 
| 57 | 
            +
                    >>> tts = CS_API(model="V1")
         | 
| 58 | 
            +
                    >>> tts.emotions
         | 
| 59 | 
            +
             | 
| 60 | 
            +
                Example with a built-in 🐸 speaker:
         | 
| 61 | 
            +
                    >>> from TTS.api import CS_API
         | 
| 62 | 
            +
                    >>> tts = CS_API()
         | 
| 63 | 
            +
                    >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name)
         | 
| 64 | 
            +
                    >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav")
         | 
| 65 | 
            +
             | 
| 66 | 
            +
                Example with multi-language model:
         | 
| 67 | 
            +
                    >>> from TTS.api import CS_API
         | 
| 68 | 
            +
                    >>> tts = CS_API(model="XTTS")
         | 
| 69 | 
            +
                    >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name, language="en")
         | 
| 70 | 
            +
                """
         | 
| 71 | 
            +
             | 
| 72 | 
            +
                MODEL_ENDPOINTS = {
         | 
| 73 | 
            +
                    "V1": {
         | 
| 74 | 
            +
                        "list_speakers": "https://app.coqui.ai/api/v2/speakers",
         | 
| 75 | 
            +
                        "synthesize": "https://app.coqui.ai/api/v2/samples",
         | 
| 76 | 
            +
                        "list_voices": "https://app.coqui.ai/api/v2/voices",
         | 
| 77 | 
            +
                    },
         | 
| 78 | 
            +
                    "XTTS": {
         | 
| 79 | 
            +
                        "list_speakers": "https://app.coqui.ai/api/v2/speakers",
         | 
| 80 | 
            +
                        "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/",
         | 
| 81 | 
            +
                        "list_voices": "https://app.coqui.ai/api/v2/voices/xtts",
         | 
| 82 | 
            +
                    },
         | 
| 83 | 
            +
                }
         | 
| 84 | 
            +
             | 
| 85 | 
            +
                SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]
         | 
| 86 | 
            +
             | 
| 87 | 
            +
                def __init__(self, api_token=None, model="XTTS"):
         | 
| 88 | 
            +
                    self.api_token = api_token
         | 
| 89 | 
            +
                    self.model = model
         | 
| 90 | 
            +
                    self.headers = None
         | 
| 91 | 
            +
                    self._speakers = None
         | 
| 92 | 
            +
                    self._check_token()
         | 
| 93 | 
            +
             | 
| 94 | 
            +
                @staticmethod
         | 
| 95 | 
            +
                def ping_api():
         | 
| 96 | 
            +
                    URL = "https://coqui.gateway.scarf.sh/tts/api"
         | 
| 97 | 
            +
                    _ = requests.get(URL)
         | 
| 98 | 
            +
             | 
| 99 | 
            +
                @property
         | 
| 100 | 
            +
                def speakers(self):
         | 
| 101 | 
            +
                    if self._speakers is None:
         | 
| 102 | 
            +
                        self._speakers = self.list_all_speakers()
         | 
| 103 | 
            +
                    return self._speakers
         | 
| 104 | 
            +
             | 
| 105 | 
            +
                @property
         | 
| 106 | 
            +
                def emotions(self):
         | 
| 107 | 
            +
                    """Return a list of available emotions.
         | 
| 108 | 
            +
             | 
| 109 | 
            +
                    TODO: Get this from the API endpoint.
         | 
| 110 | 
            +
                    """
         | 
| 111 | 
            +
                    if self.model == "V1":
         | 
| 112 | 
            +
                        return ["Neutral", "Happy", "Sad", "Angry", "Dull"]
         | 
        else:
            raise ValueError(f"❗ Emotions are not available for {self.model}.")

    def _check_token(self):
        if self.api_token is None:
            self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
            self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
        if not self.api_token:
            raise ValueError(
                "No API token found for 🐸Coqui Studio voices - https://coqui.ai \n"
                "Visit 🔗https://app.coqui.ai/account to get one.\n"
                "Set it as an environment variable `export COQUI_STUDIO_TOKEN=<token>`\n"
            )

    def list_all_speakers(self):
        """Return both built-in Coqui Studio speakers and custom voices created by the user."""
        return self.list_speakers() + self.list_voices()

    def list_speakers(self):
        """List built-in Coqui Studio speakers."""
        self._check_token()
        conn = http.client.HTTPSConnection("app.coqui.ai")
        url = self.MODEL_ENDPOINTS[self.model]["list_speakers"]
        conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
        res = conn.getresponse()
        data = res.read()
        return [Speaker(s) for s in json.loads(data)["result"]]

    def list_voices(self):
        """List custom voices created by the user."""
        conn = http.client.HTTPSConnection("app.coqui.ai")
        url = self.MODEL_ENDPOINTS[self.model]["list_voices"]
        conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
        res = conn.getresponse()
        data = res.read()
        return [Speaker(s, True) for s in json.loads(data)["result"]]

    def list_speakers_as_tts_models(self):
        """List speakers in ModelManager format."""
        models = []
        for speaker in self.speakers:
            model = f"coqui_studio/multilingual/{speaker.name}/{self.model}"
            models.append(model)
        return models

    def name_to_speaker(self, name):
        for speaker in self.speakers:
            if speaker.name == name:
                return speaker
        raise ValueError(f"Speaker {name} not found in {self.speakers}")

    def id_to_speaker(self, speaker_id):
        for speaker in self.speakers:
            if speaker.id == speaker_id:
                return speaker
        raise ValueError(f"Speaker {speaker_id} not found.")

    @staticmethod
    def url_to_np(url):
        """Download the wav at `url` and return it as (samples, sample_rate)."""
        tmp_file, _ = urllib.request.urlretrieve(url)
        rate, data = wavfile.read(tmp_file)
        return data, rate

    @staticmethod
    def _create_payload(model, text, speaker, speed, emotion, language):
        payload = {}
        # Both keys are sent; built-in speakers are addressed by `speaker_id`,
        # custom voices by `voice_id`.
        payload["voice_id"] = speaker.id
        payload["speaker_id"] = speaker.id

        if model == "V1":
            payload.update(
                {
                    "emotion": emotion,
                    "name": speaker.name,
                    "text": text,
                    "speed": speed,
                }
            )
        elif model == "XTTS":
            payload.update(
                {
                    "name": speaker.name,
                    "text": text,
                    "speed": speed,
                    "language": language,
                }
            )
        else:
            raise ValueError(f"❗ Unknown model {model}")
        return payload

    def _check_tts_args(self, text, speaker_name, speaker_id, emotion, speed, language):
        assert text is not None, "❗ text is required."
        assert speaker_name is not None, "❗ speaker_name is required."
        if self.model == "V1":
            if emotion is None:
                emotion = "Neutral"
            assert language is None, "❗ language is not supported for V1 model."
        elif self.model == "XTTS":
            assert emotion is None, "❗ Emotions are not supported for XTTS model. Use V1 model."
            assert language is not None, "❗ Language is required for XTTS model."
            assert (
                language in self.SUPPORTED_LANGUAGES
            ), f"❗ Language {language} is not yet supported. Check https://docs.coqui.ai/reference/samples_xtts_create."
        return text, speaker_name, speaker_id, emotion, speed, language

    def tts(
        self,
        text: str,
        speaker_name: str = None,
        speaker_id=None,
        emotion=None,
        speed=1.0,
        language=None,
    ) -> Tuple[np.ndarray, int]:
        """Synthesize speech from text.

        Args:
            text (str): Text to synthesize.
            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
                voices (user-generated speakers) with `list_voices()`.
            speaker_id (str): Speaker ID. If None, the speaker name is used.
            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only
                supported by the `V1` model. Defaults to None.
            speed (float): Speed of the speech. 1.0 is normal speed.
            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
                supported by the `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
        """
        self._check_token()
        self.ping_api()

        if speaker_name is None and speaker_id is None:
            raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
        if speaker_id is None:
            speaker = self.name_to_speaker(speaker_name)
        else:
            speaker = self.id_to_speaker(speaker_id)

        text, speaker_name, speaker_id, emotion, speed, language = self._check_tts_args(
            text, speaker_name, speaker_id, emotion, speed, language
        )

        conn = http.client.HTTPSConnection("app.coqui.ai")
        payload = self._create_payload(self.model, text, speaker, speed, emotion, language)
        url = self.MODEL_ENDPOINTS[self.model]["synthesize"]
        conn.request("POST", url, json.dumps(payload), self.headers)
        res = conn.getresponse()
        data = res.read()
        try:
            wav, sr = self.url_to_np(json.loads(data)["audio_url"])
        except KeyError as e:
            raise ValueError(f" [!] 🐸 API returned error: {data}") from e
        return wav, sr

    def tts_to_file(
        self,
        text: str,
        speaker_name: str,
        speaker_id=None,
        emotion=None,
        speed=1.0,
        pipe_out=None,
        language=None,
        file_path: str = None,
    ) -> str:
        """Synthesize speech from text and save it to a file.

        Args:
            text (str): Text to synthesize.
            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
                voices (user-generated speakers) with `list_voices()`.
            speaker_id (str): Speaker ID. If None, the speaker name is used.
            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
            speed (float): Speed of the speech. 1.0 is normal speed.
            pipe_out (BytesIO, optional): Stream the generated wav is also written to, e.g. stdout for shell pipes.
            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
                supported by the `XTTS` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
            file_path (str): Path to save the file. If None, a temporary file is created.
        """
        if file_path is None:
            file_path = tempfile.mktemp(".wav")
        wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
        save_wav(wav=wav, path=file_path, sample_rate=sr, pipe_out=pipe_out)
        return file_path


if __name__ == "__main__":
    import time

    api = CS_API()
    print(api.speakers)
    print(api.list_speakers_as_tts_models())

    ts = time.time()
    wav, sr = api.tts(
        "It took me quite a long time to develop a voice.", language="en", speaker_name=api.speakers[0].name
    )
    print(f" [i] XTTS took {time.time() - ts:.2f}s")

    filepath = api.tts_to_file(
        text="Hello world!", speaker_name=api.speakers[0].name, language="en", file_path="output.wav"
    )
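The `__main__` block above exercises the XTTS path. For completeness, a minimal sketch of the V1 path with an explicit token and emotion, not part of the diff; it assumes the `CS_API` constructor accepts a `model` argument, as the `MODEL_ENDPOINTS[self.model]` lookups suggest:

```python
# Minimal sketch (assumption: CS_API(model=...) selects between "V1" and "XTTS").
import os

os.environ["COQUI_STUDIO_TOKEN"] = "<your-token>"  # or `export COQUI_STUDIO_TOKEN=...`

api = CS_API(model="V1")  # hypothetical constructor argument
api.tts_to_file(
    text="Hello world!",
    speaker_name=api.speakers[0].name,
    emotion="Happy",  # one of Neutral/Happy/Sad/Angry/Dull; V1 only
    file_path="v1_out.wav",
)
```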
    	
        TTS/encoder/README.md
    ADDED
    
@@ -0,0 +1,18 @@
### Speaker Encoder

This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.

With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.

Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).

*(speaker embedding plot image omitted)*

Download a pretrained model from the [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.

To run the code, follow the same flow as in TTS.

- Define `config.json` for your needs. Note that the audio parameters should match your TTS model.
- Example training call: ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
- Generate embedding vectors: ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path```. This parses all .wav files at the given dataset path and recreates the same folder structure under the output path with the generated embedding files.
- Watch training on Tensorboard as in TTS.
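A hypothetical sketch of exploring the computed embeddings outside the notebook; it assumes `compute_embeddings.py` wrote one `.npy` d-vector per utterance under `output_path`, which is an assumption, not something the README states:

```python
# Hypothetical sketch: project saved d-vectors to 2D and scatter-plot them.
# Assumes one .npy embedding per utterance (an assumption about the output format).
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA

emb_paths = sorted(Path("output_path").rglob("*.npy"))
embeddings = np.stack([np.load(p).squeeze() for p in emb_paths])  # (num_utts, D)

points = PCA(n_components=2).fit_transform(embeddings)  # (num_utts, 2)
plt.scatter(points[:, 0], points[:, 1], s=8)
plt.title("Speaker d-vectors (PCA projection)")
plt.show()
```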
    	
        TTS/encoder/__init__.py
    ADDED
    
File without changes
    	
        TTS/encoder/__pycache__/__init__.cpython-39.pyc
    ADDED
    
Binary file (160 Bytes)
    	
        TTS/encoder/__pycache__/losses.cpython-39.pyc
    ADDED
    
Binary file (7.83 kB)
    	
        TTS/encoder/configs/base_encoder_config.py
    ADDED
    
@@ -0,0 +1,61 @@
from dataclasses import asdict, dataclass, field
from typing import Dict, List

from coqpit import MISSING

from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig


@dataclass
class BaseEncoderConfig(BaseTrainingConfig):
    """Defines parameters for a Generic Encoder model."""

    model: str = None
    audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
    datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
    # model params
    model_params: Dict = field(
        default_factory=lambda: {
            "model_name": "lstm",
            "input_dim": 80,
            "proj_dim": 256,
            "lstm_dim": 768,
            "num_lstm_layers": 3,
            "use_lstm_with_projection": True,
        }
    )

    audio_augmentation: Dict = field(default_factory=lambda: {})

    # training params
    epochs: int = 10000
    loss: str = "angleproto"
    grad_clip: float = 3.0
    lr: float = 0.0001
    optimizer: str = "radam"
    optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
    lr_decay: bool = False
    warmup_steps: int = 4000

    # logging params
    tb_model_param_stats: bool = False
    steps_plot_stats: int = 10
    save_step: int = 1000
    print_step: int = 20
    run_eval: bool = False

    # data loader
    num_classes_in_batch: int = MISSING
    num_utter_per_class: int = MISSING
    eval_num_classes_in_batch: int = None
    eval_num_utter_per_class: int = None

    num_loader_workers: int = MISSING
    voice_len: float = 1.6

    def check_values(self):
        super().check_values()
        c = asdict(self)
        assert (
            c["model_params"]["input_dim"] == self.audio.num_mels
        ), " [!] model input dimension must be equal to melspectrogram dimension."
    	
        TTS/encoder/configs/emotion_encoder_config.py
    ADDED
    
@@ -0,0 +1,12 @@
from dataclasses import asdict, dataclass

from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig


@dataclass
class EmotionEncoderConfig(BaseEncoderConfig):
    """Defines parameters for Emotion Encoder model."""

    model: str = "emotion_encoder"
    map_classid_to_classname: dict = None
    class_name_key: str = "emotion_name"
    	
        TTS/encoder/configs/speaker_encoder_config.py
    ADDED
    
@@ -0,0 +1,11 @@
from dataclasses import asdict, dataclass

from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig


@dataclass
class SpeakerEncoderConfig(BaseEncoderConfig):
    """Defines parameters for Speaker Encoder model."""

    model: str = "speaker_encoder"
    class_name_key: str = "speaker_name"
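A short sketch of how these configs compose, not part of the diff: the `MISSING` data-loader fields must be supplied before `check_values()` passes, and `model_params["input_dim"]` must equal `audio.num_mels`:

```python
# Sketch: instantiate a SpeakerEncoderConfig, filling the MISSING fields.
from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig

config = SpeakerEncoderConfig(
    num_classes_in_batch=64,  # speakers per batch
    num_utter_per_class=10,   # utterances per speaker
    num_loader_workers=4,
)
config.model_params["input_dim"] = config.audio.num_mels  # satisfy the check_values assert
config.check_values()
print(config.loss, config.model_params["model_name"])  # "angleproto" "lstm"
```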
    	
        TTS/encoder/dataset.py
    ADDED
    
@@ -0,0 +1,147 @@
import random

import torch
from torch.utils.data import Dataset

from TTS.encoder.utils.generic_utils import AugmentWAV


class EncoderDataset(Dataset):
    def __init__(
        self,
        config,
        ap,
        meta_data,
        voice_len=1.6,
        num_classes_in_batch=64,
        num_utter_per_class=10,
        verbose=False,
        augmentation_config=None,
        use_torch_spec=None,
    ):
        """
        Args:
            ap (TTS.tts.utils.AudioProcessor): audio processor object.
            meta_data (list): list of dataset instances.
            voice_len (float): voice segment length in seconds.
            verbose (bool): print diagnostic information.
        """
        super().__init__()
        self.config = config
        self.items = meta_data
        self.sample_rate = ap.sample_rate
        self.seq_len = int(voice_len * self.sample_rate)
        self.num_utter_per_class = num_utter_per_class
        self.ap = ap
        self.verbose = verbose
        self.use_torch_spec = use_torch_spec
        self.classes, self.items = self.__parse_items()

        self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}

        # Data Augmentation
        self.augmentator = None
        self.gaussian_augmentation_config = None
        if augmentation_config:
            self.data_augmentation_p = augmentation_config["p"]
            if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config):
                self.augmentator = AugmentWAV(ap, augmentation_config)

            if "gaussian" in augmentation_config.keys():
                self.gaussian_augmentation_config = augmentation_config["gaussian"]

        if self.verbose:
            print("\n > DataLoader initialization")
            print(f" | > Classes per Batch: {num_classes_in_batch}")
            print(f" | > Number of instances : {len(self.items)}")
            print(f" | > Sequence length: {self.seq_len}")
            print(f" | > Num Classes: {len(self.classes)}")
            print(f" | > Classes: {self.classes}")

    def load_wav(self, filename):
        audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
        return audio

    def __parse_items(self):
        class_to_utters = {}
        for item in self.items:
            path_ = item["audio_file"]
            class_name = item[self.config.class_name_key]
            if class_name in class_to_utters.keys():
                class_to_utters[class_name].append(path_)
            else:
                class_to_utters[class_name] = [
                    path_,
                ]

        # keep only classes with at least `num_utter_per_class` samples
        class_to_utters = {k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class}

        classes = list(class_to_utters.keys())
        classes.sort()

        new_items = []
        for item in self.items:
            path_ = item["audio_file"]
            class_name = item["emotion_name"] if self.config.model == "emotion_encoder" else item["speaker_name"]
            # ignore filtered classes
            if class_name not in classes:
                continue
            # skip clips shorter than one training segment
            if self.load_wav(path_).shape[0] - self.seq_len <= 0:
                continue

            new_items.append({"wav_file_path": path_, "class_name": class_name})

        return classes, new_items

    def __len__(self):
        return len(self.items)

    def get_num_classes(self):
        return len(self.classes)

    def get_class_list(self):
        return self.classes

    def set_classes(self, classes):
        self.classes = classes
        self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}

    def get_map_classid_to_classname(self):
        return dict((c_id, c_n) for c_n, c_id in self.classname_to_classid.items())

    def __getitem__(self, idx):
        return self.items[idx]

    def collate_fn(self, batch):
        # get the batch class_ids
        labels = []
        feats = []
        for item in batch:
            utter_path = item["wav_file_path"]
            class_name = item["class_name"]

            # get classid
            class_id = self.classname_to_classid[class_name]
            # load the wav and sample a random fixed-length segment
            wav = self.load_wav(utter_path)
            offset = random.randint(0, wav.shape[0] - self.seq_len)
            wav = wav[offset : offset + self.seq_len]

            if self.augmentator is not None and self.data_augmentation_p:
                if random.random() < self.data_augmentation_p:
                    wav = self.augmentator.apply_one(wav)

            if not self.use_torch_spec:
                mel = self.ap.melspectrogram(wav)
                feats.append(torch.FloatTensor(mel))
            else:
                feats.append(torch.FloatTensor(wav))

            labels.append(class_id)

        feats = torch.stack(feats)
        labels = torch.LongTensor(labels)

        return feats, labels
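A sketch of wiring this dataset to a `DataLoader` via its `collate_fn`, not part of the diff. `config`, `ap` (an `AudioProcessor`) and `meta_data` are assumed to come from the usual TTS loading utilities, and the real trainer additionally uses a class-balanced batch sampler, omitted here:

```python
# Sketch under stated assumptions: plain DataLoader, no balanced sampler.
from torch.utils.data import DataLoader

dataset = EncoderDataset(
    config,
    ap,
    meta_data,
    voice_len=1.6,
    num_classes_in_batch=64,
    num_utter_per_class=10,
    verbose=True,
)
loader = DataLoader(
    dataset,
    batch_size=64 * 10,             # num_classes_in_batch * num_utter_per_class
    collate_fn=dataset.collate_fn,  # yields (feats, labels)
    num_workers=config.num_loader_workers,
)
feats, labels = next(iter(loader))  # feats: (B, n_mels, T), or (B, T) with use_torch_spec
```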
    	
        TTS/encoder/losses.py
    ADDED
    
@@ -0,0 +1,226 @@
import torch
import torch.nn.functional as F
from torch import nn


# adapted from https://github.com/cvqluu/GE2E-Loss
class GE2ELoss(nn.Module):
    def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"):
        """
        Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]
        Accepts an input of size (N, M, D)
            where N is the number of speakers in the batch,
            M is the number of utterances per speaker,
            and D is the dimensionality of the embedding vector (e.g. d-vector)
        Args:
            - init_w (float): defines the initial value of w in Equation (5) of [1]
            - init_b (float): defines the initial value of b in Equation (5) of [1]
            - loss_method (str): one of "softmax" or "contrast"
        """
        super().__init__()
        # pylint: disable=E1102
        self.w = nn.Parameter(torch.tensor(init_w))
        # pylint: disable=E1102
        self.b = nn.Parameter(torch.tensor(init_b))
        self.loss_method = loss_method

        print(" > Initialized Generalized End-to-End loss")

        assert self.loss_method in ["softmax", "contrast"]

        if self.loss_method == "softmax":
            self.embed_loss = self.embed_loss_softmax
        if self.loss_method == "contrast":
            self.embed_loss = self.embed_loss_contrast

    # pylint: disable=R0201
    def calc_new_centroids(self, dvecs, centroids, spkr, utt):
        """
        Calculates the new centroids excluding the reference utterance
        """
        excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :]))
        excl = torch.mean(excl, 0)
        new_centroids = []
        for i, centroid in enumerate(centroids):
            if i == spkr:
                new_centroids.append(excl)
            else:
                new_centroids.append(centroid)
        return torch.stack(new_centroids)

    def calc_cosine_sim(self, dvecs, centroids):
        """
        Make the cosine similarity matrix with dims (N,M,N)
        """
        cos_sim_matrix = []
        for spkr_idx, speaker in enumerate(dvecs):
            cs_row = []
            for utt_idx, utterance in enumerate(speaker):
                new_centroids = self.calc_new_centroids(dvecs, centroids, spkr_idx, utt_idx)
                # vector based cosine similarity for speed
                cs_row.append(
                    torch.clamp(
                        torch.mm(
                            utterance.unsqueeze(1).transpose(0, 1),
                            new_centroids.transpose(0, 1),
                        )
                        / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)),
                        1e-6,
                    )
                )
            cs_row = torch.cat(cs_row, dim=0)
            cos_sim_matrix.append(cs_row)
        return torch.stack(cos_sim_matrix)

    # pylint: disable=R0201
    def embed_loss_softmax(self, dvecs, cos_sim_matrix):
        """
        Calculates the loss on each embedding $L(e_{ji})$ by taking softmax
        """
        N, M, _ = dvecs.shape
        L = []
        for j in range(N):
            L_row = []
            for i in range(M):
                L_row.append(-F.log_softmax(cos_sim_matrix[j, i], 0)[j])
            L_row = torch.stack(L_row)
            L.append(L_row)
        return torch.stack(L)

    # pylint: disable=R0201
    def embed_loss_contrast(self, dvecs, cos_sim_matrix):
        """
        Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid
        """
        N, M, _ = dvecs.shape
        L = []
        for j in range(N):
            L_row = []
            for i in range(M):
                centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i])
                excl_centroids_sigmoids = torch.cat((centroids_sigmoids[:j], centroids_sigmoids[j + 1 :]))
                L_row.append(1.0 - torch.sigmoid(cos_sim_matrix[j, i, j]) + torch.max(excl_centroids_sigmoids))
            L_row = torch.stack(L_row)
            L.append(L_row)
        return torch.stack(L)

    def forward(self, x, _label=None):
        """
        Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
        """

        assert x.size()[1] >= 2

        centroids = torch.mean(x, 1)
        cos_sim_matrix = self.calc_cosine_sim(x, centroids)
        self.w.data.clamp_(min=1e-6)  # keep the learned scale positive; a bare torch.clamp call would be a no-op
        cos_sim_matrix = self.w * cos_sim_matrix + self.b
        L = self.embed_loss(x, cos_sim_matrix)
        return L.mean()


# adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py
class AngleProtoLoss(nn.Module):
    """
    Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982
        Accepts an input of size (N, M, D)
            where N is the number of speakers in the batch,
            M is the number of utterances per speaker,
            and D is the dimensionality of the embedding vector
        Args:
            - init_w (float): defines the initial value of w
            - init_b (float): defines the initial value of b
    """

    def __init__(self, init_w=10.0, init_b=-5.0):
        super().__init__()
        # pylint: disable=E1102
        self.w = nn.Parameter(torch.tensor(init_w))
        # pylint: disable=E1102
        self.b = nn.Parameter(torch.tensor(init_b))
        self.criterion = torch.nn.CrossEntropyLoss()

        print(" > Initialized Angular Prototypical loss")

    def forward(self, x, _label=None):
        """
        Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
        """

        assert x.size()[1] >= 2

        out_anchor = torch.mean(x[:, 1:, :], 1)
        out_positive = x[:, 0, :]
        num_speakers = out_anchor.size()[0]

        cos_sim_matrix = F.cosine_similarity(
            out_positive.unsqueeze(-1).expand(-1, -1, num_speakers),
            out_anchor.unsqueeze(-1).expand(-1, -1, num_speakers).transpose(0, 2),
        )
        self.w.data.clamp_(min=1e-6)  # keep the learned scale positive; a bare torch.clamp call would be a no-op
        cos_sim_matrix = cos_sim_matrix * self.w + self.b
        label = torch.arange(num_speakers).to(cos_sim_matrix.device)
        L = self.criterion(cos_sim_matrix, label)
        return L


class SoftmaxLoss(nn.Module):
    """
    Implementation of the Softmax loss as defined in https://arxiv.org/abs/2003.11982
        Args:
            - embedding_dim (float): speaker embedding dim
            - n_speakers (float): number of speakers
    """

    def __init__(self, embedding_dim, n_speakers):
        super().__init__()

        self.criterion = torch.nn.CrossEntropyLoss()
        self.fc = nn.Linear(embedding_dim, n_speakers)

        print("Initialised Softmax Loss")

    def forward(self, x, label=None):
        # reshape for compatibility
        x = x.reshape(-1, x.size()[-1])
        label = label.reshape(-1)

        x = self.fc(x)
        L = self.criterion(x, label)

        return L

    def inference(self, embedding):
        x = self.fc(embedding)
        activations = torch.nn.functional.softmax(x, dim=1).squeeze(0)
        class_id = torch.argmax(activations)
        return class_id


class SoftmaxAngleProtoLoss(nn.Module):
    """
    Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153
        Args:
            - embedding_dim (float): speaker embedding dim
            - n_speakers (float): number of speakers
            - init_w (float): defines the initial value of w
            - init_b (float): defines the initial value of b
    """

    def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0):
        super().__init__()

        self.softmax = SoftmaxLoss(embedding_dim, n_speakers)
        self.angleproto = AngleProtoLoss(init_w, init_b)

        print("Initialised SoftmaxAnglePrototypical Loss")

    def forward(self, x, label=None):
        """
        Calculates the SoftmaxAnglePrototypical loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
        """

        Lp = self.angleproto(x)

        Ls = self.softmax(x, label)

        return Ls + Lp
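All three losses consume embeddings shaped (N speakers, M utterances per speaker, D dims); GE2E and AngleProto both score utterances against centroids with the learned affine similarity S = w * cos(e, c) + b from Equation (5) of the GE2E paper. A self-contained smoke test with random tensors:

```python
# Smoke test for the losses above on random, L2-normalized embeddings.
import torch

N, M, D = 4, 5, 256  # speakers, utterances per speaker, embedding dim
dvecs = torch.nn.functional.normalize(torch.randn(N, M, D), dim=-1)

ge2e = GE2ELoss(loss_method="softmax")
angle = AngleProtoLoss()
soft_angle = SoftmaxAngleProtoLoss(embedding_dim=D, n_speakers=N)

labels = torch.arange(N).unsqueeze(1).expand(N, M)  # class id per utterance
print(ge2e(dvecs).item())
print(angle(dvecs).item())
print(soft_angle(dvecs, labels).item())
```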
    	
        TTS/encoder/models/__pycache__/base_encoder.cpython-39.pyc
    ADDED
    
Binary file (4.52 kB)
    	
        TTS/encoder/models/__pycache__/lstm.cpython-39.pyc
    ADDED
    
    | Binary file (3.62 kB). View file | 
|  | 
    	
        TTS/encoder/models/__pycache__/resnet.cpython-39.pyc
    ADDED
    
    | Binary file (5.84 kB). View file | 
|  | 
    	
        TTS/encoder/models/base_encoder.py
    ADDED
    
@@ -0,0 +1,161 @@

import numpy as np
import torch
import torchaudio
from coqpit import Coqpit
from torch import nn

from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
from TTS.utils.generic_utils import set_init_dict
from TTS.utils.io import load_fsspec


class PreEmphasis(nn.Module):
    def __init__(self, coefficient=0.97):
        super().__init__()
        self.coefficient = coefficient
        self.register_buffer("filter", torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0))

    def forward(self, x):
        assert len(x.size()) == 2

        x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect")
        return torch.nn.functional.conv1d(x, self.filter).squeeze(1)


class BaseEncoder(nn.Module):
    """Base `encoder` class. Every new `encoder` model must inherit this.

    It defines common `encoder` specific functions.
    """

    # pylint: disable=W0102
    def __init__(self):
        super(BaseEncoder, self).__init__()

    def get_torch_mel_spectrogram_class(self, audio_config):
        return torch.nn.Sequential(
            PreEmphasis(audio_config["preemphasis"]),
            # TorchSTFT(
            #     n_fft=audio_config["fft_size"],
            #     hop_length=audio_config["hop_length"],
            #     win_length=audio_config["win_length"],
            #     sample_rate=audio_config["sample_rate"],
            #     window="hamming_window",
            #     mel_fmin=0.0,
            #     mel_fmax=None,
            #     use_htk=True,
            #     do_amp_to_db=False,
            #     n_mels=audio_config["num_mels"],
            #     power=2.0,
            #     use_mel=True,
            #     mel_norm=None,
            # )
            torchaudio.transforms.MelSpectrogram(
                sample_rate=audio_config["sample_rate"],
                n_fft=audio_config["fft_size"],
                win_length=audio_config["win_length"],
                hop_length=audio_config["hop_length"],
                window_fn=torch.hamming_window,
                n_mels=audio_config["num_mels"],
            ),
        )

    @torch.no_grad()
    def inference(self, x, l2_norm=True):
        return self.forward(x, l2_norm)

    @torch.no_grad()
    def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True):
        """
        Generate embeddings for a batch of utterances
        x: 1xTxD
        """
        # map the frame count to the waveform sample count
        if self.use_torch_spec:
            num_frames = num_frames * self.audio_config["hop_length"]

        max_len = x.shape[1]

        if max_len < num_frames:
            num_frames = max_len

        offsets = np.linspace(0, max_len - num_frames, num=num_eval)

        frames_batch = []
        for offset in offsets:
            offset = int(offset)
            end_offset = int(offset + num_frames)
            frames = x[:, offset:end_offset]
            frames_batch.append(frames)

        frames_batch = torch.cat(frames_batch, dim=0)
        embeddings = self.inference(frames_batch, l2_norm=l2_norm)

        if return_mean:
            embeddings = torch.mean(embeddings, dim=0, keepdim=True)
        return embeddings

    def get_criterion(self, c: Coqpit, num_classes=None):
        if c.loss == "ge2e":
            criterion = GE2ELoss(loss_method="softmax")
        elif c.loss == "angleproto":
            criterion = AngleProtoLoss()
        elif c.loss == "softmaxproto":
            criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_classes)
        else:
            raise Exception("`%s` is not a supported loss" % c.loss)
        return criterion

    def load_checkpoint(
        self,
        config: Coqpit,
        checkpoint_path: str,
        eval: bool = False,
        use_cuda: bool = False,
        criterion=None,
        cache=False,
    ):
        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
        try:
            self.load_state_dict(state["model"])
            print(" > Model fully restored. ")
        except (KeyError, RuntimeError) as error:
            # in eval mode, re-raise the error
            if eval:
                raise error

            print(" > Partial model initialization.")
            model_dict = self.state_dict()
            model_dict = set_init_dict(model_dict, state["model"], config)
            self.load_state_dict(model_dict)
            del model_dict

        # load the criterion for restore_path
        if criterion is not None and "criterion" in state:
            try:
                criterion.load_state_dict(state["criterion"])
            except (KeyError, RuntimeError) as error:
                print(" > Criterion load ignored because of:", error)

        # instantiate and load the criterion for the encoder classifier at inference time
        if (
            eval
            and criterion is None
            and "criterion" in state
            and getattr(config, "map_classid_to_classname", None) is not None
        ):
            criterion = self.get_criterion(config, len(config.map_classid_to_classname))
            criterion.load_state_dict(state["criterion"])

        if use_cuda:
            self.cuda()
            if criterion is not None:
                criterion = criterion.cuda()

        if eval:
            self.eval()
            assert not self.training

        if not eval:
            return criterion, state["step"]
        return criterion
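
A small isolation sketch of get_criterion() (not from the diff): a SimpleNamespace duck-types the Coqpit config, and the 1251-class count is an illustrative VoxCeleb1-sized value.

    from types import SimpleNamespace

    cfg = SimpleNamespace(loss="softmaxproto", model_params={"proj_dim": 512})
    criterion = BaseEncoder().get_criterion(cfg, num_classes=1251)
    # -> SoftmaxAngleProtoLoss(512, 1251); "ge2e" and "angleproto" select the other two losses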
    	
        TTS/encoder/models/lstm.py
    ADDED
    
@@ -0,0 +1,99 @@

import torch
from torch import nn

from TTS.encoder.models.base_encoder import BaseEncoder


class LSTMWithProjection(nn.Module):
    def __init__(self, input_size, hidden_size, proj_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.proj_size = proj_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, proj_size, bias=False)

    def forward(self, x):
        self.lstm.flatten_parameters()
        o, (_, _) = self.lstm(x)
        return self.linear(o)


class LSTMWithoutProjection(nn.Module):
    def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True)
        self.linear = nn.Linear(lstm_dim, proj_dim, bias=True)
        self.relu = nn.ReLU()

    def forward(self, x):
        _, (hidden, _) = self.lstm(x)
        return self.relu(self.linear(hidden[-1]))


class LSTMSpeakerEncoder(BaseEncoder):
    def __init__(
        self,
        input_dim,
        proj_dim=256,
        lstm_dim=768,
        num_lstm_layers=3,
        use_lstm_with_projection=True,
        use_torch_spec=False,
        audio_config=None,
    ):
        super().__init__()
        self.use_lstm_with_projection = use_lstm_with_projection
        self.use_torch_spec = use_torch_spec
        self.audio_config = audio_config
        self.proj_dim = proj_dim

        layers = []
        # choose the LSTM layer type
        if use_lstm_with_projection:
            layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
            for _ in range(num_lstm_layers - 1):
                layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
            self.layers = nn.Sequential(*layers)
        else:
            self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers)

        self.instancenorm = nn.InstanceNorm1d(input_dim)

        if self.use_torch_spec:
            self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
        else:
            self.torch_spec = None

        self._init_layers()

    def _init_layers(self):
        for name, param in self.layers.named_parameters():
            if "bias" in name:
                nn.init.constant_(param, 0.0)
            elif "weight" in name:
                nn.init.xavier_normal_(param)

    def forward(self, x, l2_norm=True):
        """Forward pass of the model.

        Args:
            x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
                to compute the spectrogram on-the-fly.
            l2_norm (bool): Whether to L2-normalize the outputs.

        Shapes:
            - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
        """
        with torch.no_grad():
            with torch.cuda.amp.autocast(enabled=False):
                if self.use_torch_spec:
                    x.squeeze_(1)
                    x = self.torch_spec(x)
                x = self.instancenorm(x).transpose(1, 2)
        d = self.layers(x)
        if self.use_lstm_with_projection:
            d = d[:, -1]
        if l2_norm:
            d = torch.nn.functional.normalize(d, p=2, dim=1)
        return d
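
A quick shape check for the LSTM encoder (a sketch, not part of the diff; 80 mel bands and 200 frames are assumed values):

    import torch

    encoder = LSTMSpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)
    mels = torch.randn(4, 80, 200)  # (N, D_spec, T_in): precomputed mel frames
    d_vectors = encoder(mels)       # L2-normalized by default
    print(d_vectors.shape)          # torch.Size([4, 256])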
    	
        TTS/encoder/models/resnet.py
    ADDED
    
@@ -0,0 +1,198 @@

import torch
from torch import nn

# from TTS.utils.audio.torch_transforms import TorchSTFT
from TTS.encoder.models.base_encoder import BaseEncoder


class SELayer(nn.Module):
    def __init__(self, channel, reduction=8):
        super(SELayer, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel),
            nn.Sigmoid(),
        )

    def forward(self, x):
        b, c, _, _ = x.size()
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1, 1)
        return x * y


class SEBasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
        super(SEBasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.se = SELayer(planes, reduction)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.relu(out)
        out = self.bn1(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.se(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)
        return out


class ResNetSpeakerEncoder(BaseEncoder):
    """Implementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153
    Adapted from: https://github.com/clovaai/voxceleb_trainer
    """

    # pylint: disable=W0102
    def __init__(
        self,
        input_dim=64,
        proj_dim=512,
        layers=[3, 4, 6, 3],
        num_filters=[32, 64, 128, 256],
        encoder_type="ASP",
        log_input=False,
        use_torch_spec=False,
        audio_config=None,
    ):
        super(ResNetSpeakerEncoder, self).__init__()

        self.encoder_type = encoder_type
        self.input_dim = input_dim
        self.log_input = log_input
        self.use_torch_spec = use_torch_spec
        self.audio_config = audio_config
        self.proj_dim = proj_dim

        self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU(inplace=True)
        self.bn1 = nn.BatchNorm2d(num_filters[0])

        self.inplanes = num_filters[0]
        self.layer1 = self.create_layer(SEBasicBlock, num_filters[0], layers[0])
        self.layer2 = self.create_layer(SEBasicBlock, num_filters[1], layers[1], stride=(2, 2))
        self.layer3 = self.create_layer(SEBasicBlock, num_filters[2], layers[2], stride=(2, 2))
        self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2))

        self.instancenorm = nn.InstanceNorm1d(input_dim)

        if self.use_torch_spec:
            self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
        else:
            self.torch_spec = None

        outmap_size = int(self.input_dim / 8)

        self.attention = nn.Sequential(
            nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1),
            nn.Softmax(dim=2),
        )

        if self.encoder_type == "SAP":
            out_dim = num_filters[3] * outmap_size
        elif self.encoder_type == "ASP":
            out_dim = num_filters[3] * outmap_size * 2
        else:
            raise ValueError("Undefined encoder")

        self.fc = nn.Linear(out_dim, proj_dim)

        self._init_layers()

    def _init_layers(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def create_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    # pylint: disable=R0201
    def new_parameter(self, *size):
        out = nn.Parameter(torch.FloatTensor(*size))
        nn.init.xavier_normal_(out)
        return out

    def forward(self, x, l2_norm=False):
        """Forward pass of the model.

        Args:
            x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
                to compute the spectrogram on-the-fly.
            l2_norm (bool): Whether to L2-normalize the outputs.

        Shapes:
            - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
        """
        x.squeeze_(1)
        # if using torch_spec, compute the spectrogram here; otherwise use the mel spec computed by the audio processor
        if self.use_torch_spec:
            x = self.torch_spec(x)

        if self.log_input:
            x = (x + 1e-6).log()
        x = self.instancenorm(x).unsqueeze(1)

        x = self.conv1(x)
        x = self.relu(x)
        x = self.bn1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = x.reshape(x.size()[0], -1, x.size()[-1])

        w = self.attention(x)

        if self.encoder_type == "SAP":
            x = torch.sum(x * w, dim=2)
        elif self.encoder_type == "ASP":
            mu = torch.sum(x * w, dim=2)
            sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-5))
            x = torch.cat((mu, sg), 1)

        x = x.view(x.size()[0], -1)
        x = self.fc(x)

        if l2_norm:
            x = torch.nn.functional.normalize(x, p=2, dim=1)
        return x
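
A sketch of the waveform-in, d-vector-out path through compute_embedding() (not from the diff; the audio_config values are illustrative and cover only the keys get_torch_mel_spectrogram_class() reads):

    import torch

    audio_config = {
        "sample_rate": 16000,
        "preemphasis": 0.97,
        "fft_size": 512,
        "win_length": 400,
        "hop_length": 160,
        "num_mels": 64,
    }
    encoder = ResNetSpeakerEncoder(input_dim=64, proj_dim=512, use_torch_spec=True, audio_config=audio_config)
    encoder.eval()

    wav = torch.randn(1, 3 * 16000)  # 3 s of (fake) speech at 16 kHz
    embedding = encoder.compute_embedding(wav, num_frames=250, num_eval=10)
    print(embedding.shape)           # torch.Size([1, 512]): mean over 10 sliding windows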
    	
        TTS/encoder/requirements.txt
    ADDED
    
@@ -0,0 +1,2 @@

umap-learn
numpy>=1.17.0
    	
        TTS/encoder/utils/__init__.py
    ADDED
    
    File without changes
    	
        TTS/encoder/utils/__pycache__/__init__.cpython-39.pyc
    ADDED

    Binary file (166 Bytes)
    	
        TTS/encoder/utils/__pycache__/generic_utils.cpython-39.pyc
    ADDED

    Binary file (5.01 kB)
    	
        TTS/encoder/utils/generic_utils.py
    ADDED
    
@@ -0,0 +1,182 @@

import datetime
import glob
import os
import random
import re

import numpy as np
from scipy import signal

from TTS.encoder.models.lstm import LSTMSpeakerEncoder
from TTS.encoder.models.resnet import ResNetSpeakerEncoder
from TTS.utils.io import save_fsspec


class AugmentWAV(object):
    def __init__(self, ap, augmentation_config):
        self.ap = ap
        self.use_additive_noise = False

        if "additive" in augmentation_config.keys():
            self.additive_noise_config = augmentation_config["additive"]
            additive_path = self.additive_noise_config["sounds_path"]
            if additive_path:
                self.use_additive_noise = True
                # get noise types
                self.additive_noise_types = []
                for key in self.additive_noise_config.keys():
                    if isinstance(self.additive_noise_config[key], dict):
                        self.additive_noise_types.append(key)

                additive_files = glob.glob(os.path.join(additive_path, "**/*.wav"), recursive=True)

                self.noise_list = {}

                for wav_file in additive_files:
                    noise_dir = wav_file.replace(additive_path, "").split(os.sep)[0]
                    # ignore directories that are not listed as noise types
                    if noise_dir not in self.additive_noise_types:
                        continue
                    if noise_dir not in self.noise_list:
                        self.noise_list[noise_dir] = []
                    self.noise_list[noise_dir].append(wav_file)

                print(
                    f" | > Using Additive Noise Augmentation: with {len(additive_files)} audio instances from {self.additive_noise_types}"
                )

        self.use_rir = False

        if "rir" in augmentation_config.keys():
            self.rir_config = augmentation_config["rir"]
            if self.rir_config["rir_path"]:
                self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True)
                self.use_rir = True

                print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audio instances")

        self.create_augmentation_global_list()

    def create_augmentation_global_list(self):
        if self.use_additive_noise:
            self.global_noise_list = self.additive_noise_types
        else:
            self.global_noise_list = []
        if self.use_rir:
            self.global_noise_list.append("RIR_AUG")

    def additive_noise(self, noise_type, audio):
        clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4)

        noise_list = random.sample(
            self.noise_list[noise_type],
            random.randint(
                self.additive_noise_config[noise_type]["min_num_noises"],
                self.additive_noise_config[noise_type]["max_num_noises"],
            ),
        )

        audio_len = audio.shape[0]
        noises_wav = None
        for noise in noise_list:
            noiseaudio = self.ap.load_wav(noise, sr=self.ap.sample_rate)[:audio_len]

            if noiseaudio.shape[0] < audio_len:
                continue

            noise_snr = random.uniform(
                self.additive_noise_config[noise_type]["min_snr_in_db"],
                self.additive_noise_config[noise_type]["max_snr_in_db"],
            )
            noise_db = 10 * np.log10(np.mean(noiseaudio**2) + 1e-4)
            noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio

            if noises_wav is None:
                noises_wav = noise_wav
            else:
                noises_wav += noise_wav

        # if every sampled noise clip was shorter than the audio, sample again
        if noises_wav is None:
            return self.additive_noise(noise_type, audio)

        return audio + noises_wav

    def reverberate(self, audio):
        audio_len = audio.shape[0]

        rir_file = random.choice(self.rir_files)
        rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate)
        rir = rir / np.sqrt(np.sum(rir**2))
        return signal.convolve(audio, rir, mode=self.rir_config["conv_mode"])[:audio_len]

    def apply_one(self, audio):
        noise_type = random.choice(self.global_noise_list)
        if noise_type == "RIR_AUG":
            return self.reverberate(audio)

        return self.additive_noise(noise_type, audio)


def to_camel(text):
    text = text.capitalize()
    return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)


def setup_encoder_model(config: "Coqpit"):
    if config.model_params["model_name"].lower() == "lstm":
        model = LSTMSpeakerEncoder(
            config.model_params["input_dim"],
            config.model_params["proj_dim"],
            config.model_params["lstm_dim"],
            config.model_params["num_lstm_layers"],
            use_torch_spec=config.model_params.get("use_torch_spec", False),
            audio_config=config.audio,
        )
    elif config.model_params["model_name"].lower() == "resnet":
        model = ResNetSpeakerEncoder(
            input_dim=config.model_params["input_dim"],
            proj_dim=config.model_params["proj_dim"],
            log_input=config.model_params.get("log_input", False),
            use_torch_spec=config.model_params.get("use_torch_spec", False),
            audio_config=config.audio,
        )
    return model


def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch):
    checkpoint_path = "checkpoint_{}.pth".format(current_step)
    checkpoint_path = os.path.join(out_path, checkpoint_path)
    print(" | | > Checkpoint saving : {}".format(checkpoint_path))

    new_state_dict = model.state_dict()
    state = {
        "model": new_state_dict,
        "optimizer": optimizer.state_dict() if optimizer is not None else None,
        "criterion": criterion.state_dict(),
        "step": current_step,
        "epoch": epoch,
        "loss": model_loss,
        "date": datetime.date.today().strftime("%B %d, %Y"),
    }
    save_fsspec(state, checkpoint_path)


def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step, epoch):
    if model_loss < best_loss:
        new_state_dict = model.state_dict()
        state = {
            "model": new_state_dict,
            "optimizer": optimizer.state_dict(),
            "criterion": criterion.state_dict(),
            "step": current_step,
            "epoch": epoch,
            "loss": model_loss,
            "date": datetime.date.today().strftime("%B %d, %Y"),
        }
        best_loss = model_loss
        bestmodel_path = "best_model.pth"
        bestmodel_path = os.path.join(out_path, bestmodel_path)
        print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
        save_fsspec(state, bestmodel_path)
    return best_loss
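
The augmentation_config layout AugmentWAV reads, reconstructed from the accesses above (a sketch: the paths are placeholders, `ap` is assumed to be a TTS AudioProcessor, and `clean_wav` a 1-D float numpy array):

    augmentation_config = {
        "additive": {
            "sounds_path": "/data/musan/",  # placeholder; sub-directories name the noise types
            "noise": {
                "min_snr_in_db": 0,
                "max_snr_in_db": 15,
                "min_num_noises": 1,
                "max_num_noises": 2,
            },
        },
        "rir": {
            "rir_path": "/data/rirs/",  # placeholder
            "conv_mode": "full",
        },
    }

    augmenter = AugmentWAV(ap, augmentation_config)
    noisy_wav = augmenter.apply_one(clean_wav)  # additive noise or reverb, chosen at random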
    	
        TTS/encoder/utils/io.py
    ADDED
    
@@ -0,0 +1,38 @@

import datetime
import os

from TTS.utils.io import save_fsspec


def save_checkpoint(model, optimizer, model_loss, out_path, current_step):
    checkpoint_path = "checkpoint_{}.pth".format(current_step)
    checkpoint_path = os.path.join(out_path, checkpoint_path)
    print(" | | > Checkpoint saving : {}".format(checkpoint_path))

    new_state_dict = model.state_dict()
    state = {
        "model": new_state_dict,
        "optimizer": optimizer.state_dict() if optimizer is not None else None,
        "step": current_step,
        "loss": model_loss,
        "date": datetime.date.today().strftime("%B %d, %Y"),
    }
    save_fsspec(state, checkpoint_path)


def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_step):
    if model_loss < best_loss:
        new_state_dict = model.state_dict()
        state = {
            "model": new_state_dict,
            "optimizer": optimizer.state_dict(),
            "step": current_step,
            "loss": model_loss,
            "date": datetime.date.today().strftime("%B %d, %Y"),
        }
        best_loss = model_loss
        bestmodel_path = "best_model.pth"
        bestmodel_path = os.path.join(out_path, bestmodel_path)
        print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
        save_fsspec(state, bestmodel_path)
    return best_loss
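
How the two helpers are typically chained in a training loop (a sketch under assumed names; `model`, `optimizer`, `out_path`, `train_step`, `total_steps`, and `save_every` all come from the surrounding training code):

    best_loss = float("inf")
    for step in range(1, total_steps + 1):
        loss = train_step(model, optimizer)
        if step % save_every == 0:
            save_checkpoint(model, optimizer, loss, out_path, step)
        best_loss = save_best_model(model, optimizer, loss, best_loss, out_path, step)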
