Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	
		Yurii Paniv
		
	commited on
		
		
					Commit 
							
							·
						
						316ae6b
	
1
								Parent(s):
							
							dc248e1
								
Release 6.0.0 model
Browse files- .gitignore +1 -0
 - README.md +10 -7
 - app.py +6 -17
 - config.yaml +139 -125
 - requirements.txt +1 -1
 - setup.py +2 -2
 - ukrainian_tts/tts.py +7 -10
 
    	
        .gitignore
    CHANGED
    
    | 
         @@ -135,6 +135,7 @@ dmypy.json 
     | 
|
| 135 | 
         
             
            *.pth.tar
         
     | 
| 136 | 
         
             
            *.pth
         
     | 
| 137 | 
         
             
            *.ark
         
     | 
| 
         | 
|
| 138 | 
         | 
| 139 | 
         
             
            # gradio
         
     | 
| 140 | 
         
             
            gradio_queue.db
         
     | 
| 
         | 
|
| 135 | 
         
             
            *.pth.tar
         
     | 
| 136 | 
         
             
            *.pth
         
     | 
| 137 | 
         
             
            *.ark
         
     | 
| 138 | 
         
            +
            *.npz
         
     | 
| 139 | 
         | 
| 140 | 
         
             
            # gradio
         
     | 
| 141 | 
         
             
            gradio_queue.db
         
     | 
    	
        README.md
    CHANGED
    
    | 
         @@ -38,27 +38,30 @@ If you like my work, please support ❤️ -> [https://send.monobank.ua/jar/48iH 
     | 
|
| 38 | 
         
             
            You're welcome to join UA Speech Recognition and Synthesis community: [Telegram https://t.me/speech_recognition_uk](https://t.me/speech_recognition_uk)
         
     | 
| 39 | 
         
             
            # Examples 🤖
         
     | 
| 40 | 
         | 
| 41 | 
         
            -
            ` 
     | 
| 42 | 
         | 
| 43 | 
         
            -
            https:// 
     | 
| 44 | 
         | 
| 45 | 
         | 
| 46 | 
         
             
            <details>
         
     | 
| 47 | 
         
             
              <summary>More voices 📢🤖</summary>
         
     | 
| 48 | 
         | 
| 49 | 
         
            -
            ` 
     | 
| 50 | 
         | 
| 51 | 
         
            -
            https:// 
     | 
| 52 | 
         | 
| 
         | 
|
| 53 | 
         | 
| 54 | 
         
            -
             
     | 
| 55 | 
         | 
| 56 | 
         
            -
             
     | 
| 57 | 
         | 
| 
         | 
|
| 58 | 
         | 
| 59 | 
         
             
            `Mykyta (male)`:
         
     | 
| 60 | 
         | 
| 61 | 
         
            -
            https:// 
     | 
| 
         | 
|
| 62 | 
         | 
| 63 | 
         
             
            </details>
         
     | 
| 64 | 
         | 
| 
         | 
|
| 38 | 
         
             
            You're welcome to join UA Speech Recognition and Synthesis community: [Telegram https://t.me/speech_recognition_uk](https://t.me/speech_recognition_uk)
         
     | 
| 39 | 
         
             
            # Examples 🤖
         
     | 
| 40 | 
         | 
| 41 | 
         
            +
            `Oleksa (male)`:
         
     | 
| 42 | 
         | 
| 43 | 
         
            +
            https://github.com/robinhad/ukrainian-tts/assets/5759207/ace842ef-06d0-4b1f-ad49-5fda92999dbb
         
     | 
| 44 | 
         | 
| 45 | 
         | 
| 46 | 
         
             
            <details>
         
     | 
| 47 | 
         
             
              <summary>More voices 📢🤖</summary>
         
     | 
| 48 | 
         | 
| 49 | 
         
            +
            `Tetiana (female)`:
         
     | 
| 50 | 
         | 
| 51 | 
         
            +
            https://github.com/robinhad/ukrainian-tts/assets/5759207/a6ecacf6-62ae-4fc5-b6d5-41e6cdd3d992
         
     | 
| 52 | 
         | 
| 53 | 
         
            +
            `Dmytro (male)`:
         
     | 
| 54 | 
         | 
| 55 | 
         
            +
            https://github.com/robinhad/ukrainian-tts/assets/5759207/67d3dac9-6626-40ef-98e5-ec194096bbe0
         
     | 
| 56 | 
         | 
| 57 | 
         
            +
            `Lada (female)`:
         
     | 
| 58 | 
         | 
| 59 | 
         
            +
            https://github.com/robinhad/ukrainian-tts/assets/5759207/fcf558b2-3ff9-4539-ad9e-8455b52223a4
         
     | 
| 60 | 
         | 
| 61 | 
         
             
            `Mykyta (male)`:
         
     | 
| 62 | 
         | 
| 63 | 
         
            +
            https://github.com/robinhad/ukrainian-tts/assets/5759207/033f5215-3f09-4021-ba19-1f55158445ca
         
     | 
| 64 | 
         
            +
             
     | 
| 65 | 
         | 
| 66 | 
         
             
            </details>
         
     | 
| 67 | 
         | 
    	
        app.py
    CHANGED
    
    | 
         @@ -43,6 +43,7 @@ class VoiceOption(Enum): 
     | 
|
| 43 | 
         
             
                Mykyta = "Микита (чоловічий) 👨"
         
     | 
| 44 | 
         
             
                Lada = "Лада (жіночий) 👩"
         
     | 
| 45 | 
         
             
                Dmytro = "Дмитро (чоловічий) 👨"
         
     | 
| 
         | 
|
| 46 | 
         | 
| 47 | 
         | 
| 48 | 
         
             
            print(f"CUDA available? {is_available()}")
         
     | 
| 
         @@ -51,7 +52,7 @@ print(f"CUDA available? {is_available()}") 
     | 
|
| 51 | 
         
             
            ukr_tts = TTS(device="cuda" if is_available() else "cpu")
         
     | 
| 52 | 
         | 
| 53 | 
         | 
| 54 | 
         
            -
            def tts(text: str, voice: str 
     | 
| 55 | 
         
             
                print("============================")
         
     | 
| 56 | 
         
             
                print("Original text:", text)
         
     | 
| 57 | 
         
             
                print("Voice", voice)
         
     | 
| 
         @@ -62,6 +63,7 @@ def tts(text: str, voice: str, speed: float): 
     | 
|
| 62 | 
         
             
                    VoiceOption.Mykyta.value: Voices.Mykyta.value,
         
     | 
| 63 | 
         
             
                    VoiceOption.Lada.value: Voices.Lada.value,
         
     | 
| 64 | 
         
             
                    VoiceOption.Dmytro.value: Voices.Dmytro.value,
         
     | 
| 
         | 
|
| 65 | 
         
             
                }
         
     | 
| 66 | 
         | 
| 67 | 
         
             
                speaker_name = voice_mapping[voice]
         
     | 
| 
         @@ -72,11 +74,11 @@ def tts(text: str, voice: str, speed: float): 
     | 
|
| 72 | 
         | 
| 73 | 
         
             
                if getenv("HF_API_TOKEN") is not None:
         
     | 
| 74 | 
         
             
                    log_queue.put(
         
     | 
| 75 | 
         
            -
                        [text, speaker_name, Stress.Dictionary.value,  
     | 
| 76 | 
         
             
                    )
         
     | 
| 77 | 
         | 
| 78 | 
         
             
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
         
     | 
| 79 | 
         
            -
                    _, text = ukr_tts.tts(text, speaker_name, Stress.Dictionary.value, fp 
     | 
| 80 | 
         
             
                    return fp.name, text
         
     | 
| 81 | 
         | 
| 82 | 
         | 
| 
         @@ -97,9 +99,6 @@ iface = gr.Interface( 
     | 
|
| 97 | 
         
             
                        choices=[option.value for option in VoiceOption],
         
     | 
| 98 | 
         
             
                        value=VoiceOption.Tetiana.value,
         
     | 
| 99 | 
         
             
                    ),
         
     | 
| 100 | 
         
            -
                    gr.components.Slider(
         
     | 
| 101 | 
         
            -
                        label="Швидкість", minimum=0.5, maximum=2, value=1, step=0.05
         
     | 
| 102 | 
         
            -
                    ),
         
     | 
| 103 | 
         
             
                ],
         
     | 
| 104 | 
         
             
                outputs=[
         
     | 
| 105 | 
         
             
                    gr.components.Audio(label="Output"),
         
     | 
| 
         @@ -112,32 +111,22 @@ iface = gr.Interface( 
     | 
|
| 112 | 
         
             
                    [
         
     | 
| 113 | 
         
             
                        "Привіт, як тебе звати?",
         
     | 
| 114 | 
         
             
                        VoiceOption.Tetiana.value,
         
     | 
| 115 | 
         
            -
                        1,
         
     | 
| 116 | 
         
             
                    ],
         
     | 
| 117 | 
         
             
                    [
         
     | 
| 118 | 
         
             
                        "Введіть, будь ласка, св+оє реч+ення.",
         
     | 
| 119 | 
         
             
                        VoiceOption.Dmytro.value,
         
     | 
| 120 | 
         
            -
                        1,
         
     | 
| 121 | 
         
            -
                    ],
         
     | 
| 122 | 
         
            -
                    [
         
     | 
| 123 | 
         
            -
                        "Введіть, будь ласка, своє речення.",
         
     | 
| 124 | 
         
            -
                        VoiceOption.Dmytro.value,
         
     | 
| 125 | 
         
            -
                        1.3,
         
     | 
| 126 | 
         
             
                    ],
         
     | 
| 127 | 
         
             
                    [
         
     | 
| 128 | 
         
             
                        "Введіть, будь ласка, своє речення.",
         
     | 
| 129 | 
         
            -
                        VoiceOption. 
     | 
| 130 | 
         
            -
                        1,
         
     | 
| 131 | 
         
             
                    ],
         
     | 
| 132 | 
         
             
                    [
         
     | 
| 133 | 
         
             
                        "Введіть, будь ласка, своє речення.",
         
     | 
| 134 | 
         
             
                        VoiceOption.Mykyta.value,
         
     | 
| 135 | 
         
            -
                        0.7,
         
     | 
| 136 | 
         
             
                    ],
         
     | 
| 137 | 
         
             
                    [
         
     | 
| 138 | 
         
             
                        "Договір підписано 4 квітня 1949 року.",
         
     | 
| 139 | 
         
             
                        VoiceOption.Lada.value,
         
     | 
| 140 | 
         
            -
                        0.9,
         
     | 
| 141 | 
         
             
                    ],
         
     | 
| 142 | 
         
             
                ],
         
     | 
| 143 | 
         
             
            )
         
     | 
| 
         | 
|
| 43 | 
         
             
                Mykyta = "Микита (чоловічий) 👨"
         
     | 
| 44 | 
         
             
                Lada = "Лада (жіночий) 👩"
         
     | 
| 45 | 
         
             
                Dmytro = "Дмитро (чоловічий) 👨"
         
     | 
| 46 | 
         
            +
                Oleksa = "Олекса (чоловічий) 👨"
         
     | 
| 47 | 
         | 
| 48 | 
         | 
| 49 | 
         
             
            print(f"CUDA available? {is_available()}")
         
     | 
| 
         | 
|
| 52 | 
         
             
            ukr_tts = TTS(device="cuda" if is_available() else "cpu")
         
     | 
| 53 | 
         | 
| 54 | 
         | 
| 55 | 
         
            +
            def tts(text: str, voice: str):
         
     | 
| 56 | 
         
             
                print("============================")
         
     | 
| 57 | 
         
             
                print("Original text:", text)
         
     | 
| 58 | 
         
             
                print("Voice", voice)
         
     | 
| 
         | 
|
| 63 | 
         
             
                    VoiceOption.Mykyta.value: Voices.Mykyta.value,
         
     | 
| 64 | 
         
             
                    VoiceOption.Lada.value: Voices.Lada.value,
         
     | 
| 65 | 
         
             
                    VoiceOption.Dmytro.value: Voices.Dmytro.value,
         
     | 
| 66 | 
         
            +
                    VoiceOption.Oleksa.value: Voices.Oleksa.value,
         
     | 
| 67 | 
         
             
                }
         
     | 
| 68 | 
         | 
| 69 | 
         
             
                speaker_name = voice_mapping[voice]
         
     | 
| 
         | 
|
| 74 | 
         | 
| 75 | 
         
             
                if getenv("HF_API_TOKEN") is not None:
         
     | 
| 76 | 
         
             
                    log_queue.put(
         
     | 
| 77 | 
         
            +
                        [text, speaker_name, Stress.Dictionary.value, 1, str(datetime.utcnow())]
         
     | 
| 78 | 
         
             
                    )
         
     | 
| 79 | 
         | 
| 80 | 
         
             
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
         
     | 
| 81 | 
         
            +
                    _, text = ukr_tts.tts(text, speaker_name, Stress.Dictionary.value, fp)
         
     | 
| 82 | 
         
             
                    return fp.name, text
         
     | 
| 83 | 
         | 
| 84 | 
         | 
| 
         | 
|
| 99 | 
         
             
                        choices=[option.value for option in VoiceOption],
         
     | 
| 100 | 
         
             
                        value=VoiceOption.Tetiana.value,
         
     | 
| 101 | 
         
             
                    ),
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 102 | 
         
             
                ],
         
     | 
| 103 | 
         
             
                outputs=[
         
     | 
| 104 | 
         
             
                    gr.components.Audio(label="Output"),
         
     | 
| 
         | 
|
| 111 | 
         
             
                    [
         
     | 
| 112 | 
         
             
                        "Привіт, як тебе звати?",
         
     | 
| 113 | 
         
             
                        VoiceOption.Tetiana.value,
         
     | 
| 
         | 
|
| 114 | 
         
             
                    ],
         
     | 
| 115 | 
         
             
                    [
         
     | 
| 116 | 
         
             
                        "Введіть, будь ласка, св+оє реч+ення.",
         
     | 
| 117 | 
         
             
                        VoiceOption.Dmytro.value,
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 118 | 
         
             
                    ],
         
     | 
| 119 | 
         
             
                    [
         
     | 
| 120 | 
         
             
                        "Введіть, будь ласка, своє речення.",
         
     | 
| 121 | 
         
            +
                        VoiceOption.Oleksa.value,
         
     | 
| 
         | 
|
| 122 | 
         
             
                    ],
         
     | 
| 123 | 
         
             
                    [
         
     | 
| 124 | 
         
             
                        "Введіть, будь ласка, своє речення.",
         
     | 
| 125 | 
         
             
                        VoiceOption.Mykyta.value,
         
     | 
| 
         | 
|
| 126 | 
         
             
                    ],
         
     | 
| 127 | 
         
             
                    [
         
     | 
| 128 | 
         
             
                        "Договір підписано 4 квітня 1949 року.",
         
     | 
| 129 | 
         
             
                        VoiceOption.Lada.value,
         
     | 
| 
         | 
|
| 130 | 
         
             
                    ],
         
     | 
| 131 | 
         
             
                ],
         
     | 
| 132 | 
         
             
            )
         
     | 
    	
        config.yaml
    CHANGED
    
    | 
         @@ -1,11 +1,11 @@ 
     | 
|
| 1 | 
         
            -
            config: ./conf/tuning/ 
     | 
| 2 | 
         
             
            print_config: false
         
     | 
| 3 | 
         
             
            log_level: INFO
         
     | 
| 4 | 
         
             
            dry_run: false
         
     | 
| 5 | 
         
             
            iterator_type: sequence
         
     | 
| 6 | 
         
            -
            output_dir: exp/22k/ 
     | 
| 7 | 
         
             
            ngpu: 1
         
     | 
| 8 | 
         
            -
            seed:  
     | 
| 9 | 
         
             
            num_workers: 4
         
     | 
| 10 | 
         
             
            num_att_plot: 3
         
     | 
| 11 | 
         
             
            dist_backend: nccl
         
     | 
| 
         @@ -24,7 +24,7 @@ cudnn_benchmark: false 
     | 
|
| 24 | 
         
             
            cudnn_deterministic: false
         
     | 
| 25 | 
         
             
            collect_stats: false
         
     | 
| 26 | 
         
             
            write_collected_feats: false
         
     | 
| 27 | 
         
            -
            max_epoch:  
     | 
| 28 | 
         
             
            patience: null
         
     | 
| 29 | 
         
             
            val_scheduler_criterion:
         
     | 
| 30 | 
         
             
            - valid
         
     | 
| 
         @@ -34,10 +34,16 @@ early_stopping_criterion: 
     | 
|
| 34 | 
         
             
            - loss
         
     | 
| 35 | 
         
             
            - min
         
     | 
| 36 | 
         
             
            best_model_criterion:
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 37 | 
         
             
            -   - train
         
     | 
| 38 | 
         
             
                - total_count
         
     | 
| 39 | 
         
             
                - max
         
     | 
| 40 | 
         
            -
            keep_nbest_models:  
     | 
| 41 | 
         
             
            nbest_averaging_interval: 0
         
     | 
| 42 | 
         
             
            grad_clip: -1
         
     | 
| 43 | 
         
             
            grad_clip_type: 2.0
         
     | 
| 
         @@ -59,20 +65,23 @@ wandb_name: null 
     | 
|
| 59 | 
         
             
            wandb_model_log_interval: -1
         
     | 
| 60 | 
         
             
            detect_anomaly: false
         
     | 
| 61 | 
         
             
            pretrain_path: null
         
     | 
| 62 | 
         
            -
            init_param: 
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 63 | 
         
             
            ignore_init_mismatch: false
         
     | 
| 64 | 
         
             
            freeze_param: []
         
     | 
| 65 | 
         
             
            num_iters_per_epoch: null
         
     | 
| 66 | 
         
             
            batch_size: 20
         
     | 
| 67 | 
         
             
            valid_batch_size: null
         
     | 
| 68 | 
         
            -
            batch_bins:  
     | 
| 69 | 
         
             
            valid_batch_bins: null
         
     | 
| 70 | 
         
             
            train_shape_file:
         
     | 
| 71 | 
         
            -
            - exp/22k/ 
     | 
| 72 | 
         
            -
            - exp/22k/ 
     | 
| 73 | 
         
             
            valid_shape_file:
         
     | 
| 74 | 
         
            -
            - exp/22k/ 
     | 
| 75 | 
         
            -
            - exp/22k/ 
     | 
| 76 | 
         
             
            batch_type: numel
         
     | 
| 77 | 
         
             
            valid_batch_type: null
         
     | 
| 78 | 
         
             
            fold_length:
         
     | 
| 
         @@ -110,29 +119,27 @@ max_cache_fd: 32 
     | 
|
| 110 | 
         
             
            valid_max_cache_size: null
         
     | 
| 111 | 
         
             
            exclude_weight_decay: false
         
     | 
| 112 | 
         
             
            exclude_weight_decay_conf: {}
         
     | 
| 113 | 
         
            -
            optim:  
     | 
| 114 | 
         
             
            optim_conf:
         
     | 
| 115 | 
         
            -
                lr:  
     | 
| 116 | 
         
             
                betas:
         
     | 
| 117 | 
         
            -
                - 0. 
     | 
| 118 | 
         
            -
                - 0. 
     | 
| 119 | 
         
            -
                eps: 1.0e-09
         
     | 
| 120 | 
         
             
                weight_decay: 0.0
         
     | 
| 121 | 
         
             
            scheduler: exponentiallr
         
     | 
| 122 | 
         
             
            scheduler_conf:
         
     | 
| 123 | 
         
             
                gamma: 0.999875
         
     | 
| 124 | 
         
            -
            optim2:  
     | 
| 125 | 
         
             
            optim2_conf:
         
     | 
| 126 | 
         
            -
                lr:  
     | 
| 127 | 
         
             
                betas:
         
     | 
| 128 | 
         
            -
                - 0. 
     | 
| 129 | 
         
            -
                - 0. 
     | 
| 130 | 
         
            -
                eps: 1.0e-09
         
     | 
| 131 | 
         
             
                weight_decay: 0.0
         
     | 
| 132 | 
         
             
            scheduler2: exponentiallr
         
     | 
| 133 | 
         
             
            scheduler2_conf:
         
     | 
| 134 | 
         
             
                gamma: 0.999875
         
     | 
| 135 | 
         
            -
            generator_first:  
     | 
| 136 | 
         
             
            token_list:
         
     | 
| 137 | 
         
             
            - <blank>
         
     | 
| 138 | 
         
             
            - <unk>
         
     | 
| 
         @@ -154,14 +161,13 @@ token_list: 
     | 
|
| 154 | 
         
             
            - к
         
     | 
| 155 | 
         
             
            - м
         
     | 
| 156 | 
         
             
            - п
         
     | 
| 157 | 
         
            -
            - .
         
     | 
| 158 | 
         
             
            - я
         
     | 
| 159 | 
         
             
            - з
         
     | 
| 160 | 
         
             
            - ','
         
     | 
| 161 | 
         
             
            - б
         
     | 
| 162 | 
         
             
            - ь
         
     | 
| 163 | 
         
            -
            - ч
         
     | 
| 164 | 
         
             
            - г
         
     | 
| 
         | 
|
| 165 | 
         
             
            - й
         
     | 
| 166 | 
         
             
            - ж
         
     | 
| 167 | 
         
             
            - х
         
     | 
| 
         @@ -176,13 +182,12 @@ token_list: 
     | 
|
| 176 | 
         
             
            - '!'
         
     | 
| 177 | 
         
             
            - ''''
         
     | 
| 178 | 
         
             
            - ф
         
     | 
| 
         | 
|
| 179 | 
         
             
            - '"'
         
     | 
| 180 | 
         
            -
            - ':'
         
     | 
| 181 | 
         
             
            - ґ
         
     | 
| 182 | 
         
            -
            -  
     | 
| 183 | 
         
            -
            - )
         
     | 
| 184 | 
         
            -
            - „
         
     | 
| 185 | 
         
             
            - /
         
     | 
| 
         | 
|
| 186 | 
         
             
            - <sos/eos>
         
     | 
| 187 | 
         
             
            odim: null
         
     | 
| 188 | 
         
             
            model_conf: {}
         
     | 
| 
         @@ -192,54 +197,67 @@ bpemodel: null 
     | 
|
| 192 | 
         
             
            non_linguistic_symbols: null
         
     | 
| 193 | 
         
             
            cleaner: null
         
     | 
| 194 | 
         
             
            g2p: g2p_en
         
     | 
| 195 | 
         
            -
            feats_extract:  
     | 
| 196 | 
         
             
            feats_extract_conf:
         
     | 
| 197 | 
         
             
                n_fft: 1024
         
     | 
| 198 | 
         
             
                hop_length: 256
         
     | 
| 199 | 
         
             
                win_length: null
         
     | 
| 200 | 
         
            -
             
     | 
| 201 | 
         
            -
             
     | 
| 202 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 203 | 
         
             
            tts_conf:
         
     | 
| 204 | 
         
            -
                 
     | 
| 205 | 
         
            -
                 
     | 
| 206 | 
         
            -
                     
     | 
| 207 | 
         
            -
                     
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 208 | 
         
             
                    spk_embed_dim: 192
         
     | 
| 209 | 
         
            -
                     
     | 
| 210 | 
         
            -
                     
     | 
| 211 | 
         
            -
                     
     | 
| 212 | 
         
            -
                     
     | 
| 213 | 
         
            -
                     
     | 
| 214 | 
         
            -
                     
     | 
| 215 | 
         
            -
                     
     | 
| 216 | 
         
            -
                     
     | 
| 217 | 
         
            -
                     
     | 
| 218 | 
         
            -
                     
     | 
| 219 | 
         
            -
                     
     | 
| 220 | 
         
            -
             
     | 
| 221 | 
         
            -
             
     | 
| 222 | 
         
            -
                     
     | 
| 223 | 
         
            -
                     
     | 
| 224 | 
         
            -
                     
     | 
| 225 | 
         
            -
                     
     | 
| 226 | 
         
            -
                     
     | 
| 227 | 
         
            -
                     
     | 
| 228 | 
         
            -
             
     | 
| 229 | 
         
            -
                     
     | 
| 230 | 
         
            -
                     
     | 
| 231 | 
         
            -
                    - 2
         
     | 
| 232 | 
         
            -
                    - 2
         
     | 
| 233 | 
         
            -
                    decoder_upsample_kernel_sizes:
         
     | 
| 234 | 
         
            -
                    - 16
         
     | 
| 235 | 
         
            -
                    - 16
         
     | 
| 236 | 
         
            -
                    - 4
         
     | 
| 237 | 
         
            -
                    - 4
         
     | 
| 238 | 
         
            -
                    decoder_resblock_kernel_sizes:
         
     | 
| 239 | 
         
            -
                    - 3
         
     | 
| 240 | 
         
            -
                    - 7
         
     | 
| 241 | 
         
            -
                    - 11
         
     | 
| 242 | 
         
            -
                    decoder_resblock_dilations:
         
     | 
| 243 | 
         
             
                    -   - 1
         
     | 
| 244 | 
         
             
                        - 3
         
     | 
| 245 | 
         
             
                        - 5
         
     | 
| 
         @@ -249,94 +267,90 @@ tts_conf: 
     | 
|
| 249 | 
         
             
                    -   - 1
         
     | 
| 250 | 
         
             
                        - 3
         
     | 
| 251 | 
         
             
                        - 5
         
     | 
| 252 | 
         
            -
                     
     | 
| 253 | 
         
            -
                     
     | 
| 254 | 
         
            -
                     
     | 
| 255 | 
         
            -
                     
     | 
| 256 | 
         
            -
                     
     | 
| 257 | 
         
            -
                     
     | 
| 258 | 
         
            -
                     
     | 
| 259 | 
         
            -
                     
     | 
| 260 | 
         
            -
                     
     | 
| 261 | 
         
            -
                     
     | 
| 262 | 
         
            -
                     
     | 
| 263 | 
         
            -
                     
     | 
| 264 | 
         
            -
                     
     | 
| 265 | 
         
            -
                     
     | 
| 266 | 
         
            -
                     
     | 
| 267 | 
         
            -
                     
     | 
| 268 | 
         
            -
                    stochastic_duration_predictor_flows: 4
         
     | 
| 269 | 
         
            -
                    stochastic_duration_predictor_dds_conv_layers: 3
         
     | 
| 270 | 
         
            -
                    vocabs: 50
         
     | 
| 271 | 
         
            -
                    aux_channels: 513
         
     | 
| 272 | 
         
             
                discriminator_type: hifigan_multi_scale_multi_period_discriminator
         
     | 
| 273 | 
         
             
                discriminator_params:
         
     | 
| 274 | 
         
            -
                     
     | 
| 275 | 
         
            -
                     
     | 
| 276 | 
         
            -
             
     | 
| 277 | 
         
            -
                         
     | 
| 278 | 
         
            -
                         
     | 
| 279 | 
         
            -
                         
     | 
| 280 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 281 | 
         
             
                        in_channels: 1
         
     | 
| 282 | 
         
            -
                        out_channels: 1
         
     | 
| 283 | 
         
             
                        kernel_sizes:
         
     | 
| 284 | 
         
            -
                        - 15
         
     | 
| 285 | 
         
            -
                        - 41
         
     | 
| 286 | 
         
             
                        - 5
         
     | 
| 287 | 
         
             
                        - 3
         
     | 
| 288 | 
         
            -
                        channels: 128
         
     | 
| 289 | 
         
             
                        max_downsample_channels: 1024
         
     | 
| 290 | 
         
            -
                        max_groups: 16
         
     | 
| 291 | 
         
            -
                        bias: true
         
     | 
| 292 | 
         
            -
                        downsample_scales:
         
     | 
| 293 | 
         
            -
                        - 2
         
     | 
| 294 | 
         
            -
                        - 2
         
     | 
| 295 | 
         
            -
                        - 4
         
     | 
| 296 | 
         
            -
                        - 4
         
     | 
| 297 | 
         
            -
                        - 1
         
     | 
| 298 | 
         
             
                        nonlinear_activation: LeakyReLU
         
     | 
| 299 | 
         
             
                        nonlinear_activation_params:
         
     | 
| 300 | 
         
             
                            negative_slope: 0.1
         
     | 
| 301 | 
         
            -
                         
     | 
| 302 | 
         
             
                        use_spectral_norm: false
         
     | 
| 303 | 
         
            -
             
     | 
| 304 | 
         
             
                    periods:
         
     | 
| 305 | 
         
             
                    - 2
         
     | 
| 306 | 
         
             
                    - 3
         
     | 
| 307 | 
         
             
                    - 5
         
     | 
| 308 | 
         
             
                    - 7
         
     | 
| 309 | 
         
             
                    - 11
         
     | 
| 310 | 
         
            -
                     
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 311 | 
         
             
                        in_channels: 1
         
     | 
| 312 | 
         
            -
                        out_channels: 1
         
     | 
| 313 | 
         
             
                        kernel_sizes:
         
     | 
| 
         | 
|
| 
         | 
|
| 314 | 
         
             
                        - 5
         
     | 
| 315 | 
         
             
                        - 3
         
     | 
| 316 | 
         
            -
                        channels: 32
         
     | 
| 317 | 
         
            -
                        downsample_scales:
         
     | 
| 318 | 
         
            -
                        - 3
         
     | 
| 319 | 
         
            -
                        - 3
         
     | 
| 320 | 
         
            -
                        - 3
         
     | 
| 321 | 
         
            -
                        - 3
         
     | 
| 322 | 
         
            -
                        - 1
         
     | 
| 323 | 
         
             
                        max_downsample_channels: 1024
         
     | 
| 324 | 
         
            -
                         
     | 
| 325 | 
         
             
                        nonlinear_activation: LeakyReLU
         
     | 
| 326 | 
         
             
                        nonlinear_activation_params:
         
     | 
| 327 | 
         
             
                            negative_slope: 0.1
         
     | 
| 328 | 
         
            -
                         
     | 
| 329 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 330 | 
         
             
                generator_adv_loss_params:
         
     | 
| 331 | 
         
             
                    average_by_discriminators: false
         
     | 
| 332 | 
         
             
                    loss_type: mse
         
     | 
| 333 | 
         
             
                discriminator_adv_loss_params:
         
     | 
| 334 | 
         
             
                    average_by_discriminators: false
         
     | 
| 335 | 
         
             
                    loss_type: mse
         
     | 
| 
         | 
|
| 336 | 
         
             
                feat_match_loss_params:
         
     | 
| 337 | 
         
             
                    average_by_discriminators: false
         
     | 
| 338 | 
         
             
                    average_by_layers: false
         
     | 
| 339 | 
         
             
                    include_final_outputs: true
         
     | 
| 
         | 
|
| 340 | 
         
             
                mel_loss_params:
         
     | 
| 341 | 
         
             
                    fs: 22050
         
     | 
| 342 | 
         
             
                    n_fft: 1024
         
     | 
| 
         @@ -347,12 +361,12 @@ tts_conf: 
     | 
|
| 347 | 
         
             
                    fmin: 0
         
     | 
| 348 | 
         
             
                    fmax: null
         
     | 
| 349 | 
         
             
                    log_base: null
         
     | 
| 
         | 
|
| 350 | 
         
             
                lambda_adv: 1.0
         
     | 
| 351 | 
         
             
                lambda_mel: 45.0
         
     | 
| 352 | 
         
             
                lambda_feat_match: 2.0
         
     | 
| 353 | 
         
            -
                lambda_dur: 1.0
         
     | 
| 354 | 
         
            -
                lambda_kl: 1.0
         
     | 
| 355 | 
         
             
                sampling_rate: 22050
         
     | 
| 
         | 
|
| 356 | 
         
             
                cache_generator_outputs: true
         
     | 
| 357 | 
         
             
            pitch_extract: null
         
     | 
| 358 | 
         
             
            pitch_extract_conf: {}
         
     | 
| 
         | 
|
| 1 | 
         
            +
            config: ./conf/tuning/finetune_joint_tacotron2_hifigan.yaml
         
     | 
| 2 | 
         
             
            print_config: false
         
     | 
| 3 | 
         
             
            log_level: INFO
         
     | 
| 4 | 
         
             
            dry_run: false
         
     | 
| 5 | 
         
             
            iterator_type: sequence
         
     | 
| 6 | 
         
            +
            output_dir: exp/22k/tts_finetune_joint_tacotron2_hifigan_raw_char
         
     | 
| 7 | 
         
             
            ngpu: 1
         
     | 
| 8 | 
         
            +
            seed: 777
         
     | 
| 9 | 
         
             
            num_workers: 4
         
     | 
| 10 | 
         
             
            num_att_plot: 3
         
     | 
| 11 | 
         
             
            dist_backend: nccl
         
     | 
| 
         | 
|
| 24 | 
         
             
            cudnn_deterministic: false
         
     | 
| 25 | 
         
             
            collect_stats: false
         
     | 
| 26 | 
         
             
            write_collected_feats: false
         
     | 
| 27 | 
         
            +
            max_epoch: 140
         
     | 
| 28 | 
         
             
            patience: null
         
     | 
| 29 | 
         
             
            val_scheduler_criterion:
         
     | 
| 30 | 
         
             
            - valid
         
     | 
| 
         | 
|
| 34 | 
         
             
            - loss
         
     | 
| 35 | 
         
             
            - min
         
     | 
| 36 | 
         
             
            best_model_criterion:
         
     | 
| 37 | 
         
            +
            -   - valid
         
     | 
| 38 | 
         
            +
                - text2mel_loss
         
     | 
| 39 | 
         
            +
                - min
         
     | 
| 40 | 
         
            +
            -   - train
         
     | 
| 41 | 
         
            +
                - text2mel_loss
         
     | 
| 42 | 
         
            +
                - min
         
     | 
| 43 | 
         
             
            -   - train
         
     | 
| 44 | 
         
             
                - total_count
         
     | 
| 45 | 
         
             
                - max
         
     | 
| 46 | 
         
            +
            keep_nbest_models: 5
         
     | 
| 47 | 
         
             
            nbest_averaging_interval: 0
         
     | 
| 48 | 
         
             
            grad_clip: -1
         
     | 
| 49 | 
         
             
            grad_clip_type: 2.0
         
     | 
| 
         | 
|
| 65 | 
         
             
            wandb_model_log_interval: -1
         
     | 
| 66 | 
         
             
            detect_anomaly: false
         
     | 
| 67 | 
         
             
            pretrain_path: null
         
     | 
| 68 | 
         
            +
            init_param:
         
     | 
| 69 | 
         
            +
            - exp/22k/tts_train_tacotron2_raw_char/train.loss.ave_5best.pth:tts:tts.generator.text2mel
         
     | 
| 70 | 
         
            +
            - exp/22k/ljspeech_hifigan.v1/generator.pth::tts.generator.vocoder
         
     | 
| 71 | 
         
            +
            - exp/22k/ljspeech_hifigan.v1/discriminator.pth::tts.discriminator
         
     | 
| 72 | 
         
             
            ignore_init_mismatch: false
         
     | 
| 73 | 
         
             
            freeze_param: []
         
     | 
| 74 | 
         
             
            num_iters_per_epoch: null
         
     | 
| 75 | 
         
             
            batch_size: 20
         
     | 
| 76 | 
         
             
            valid_batch_size: null
         
     | 
| 77 | 
         
            +
            batch_bins: 1600000
         
     | 
| 78 | 
         
             
            valid_batch_bins: null
         
     | 
| 79 | 
         
             
            train_shape_file:
         
     | 
| 80 | 
         
            +
            - exp/22k/tts_stats_raw_char/train/text_shape.char
         
     | 
| 81 | 
         
            +
            - exp/22k/tts_stats_raw_char/train/speech_shape
         
     | 
| 82 | 
         
             
            valid_shape_file:
         
     | 
| 83 | 
         
            +
            - exp/22k/tts_stats_raw_char/valid/text_shape.char
         
     | 
| 84 | 
         
            +
            - exp/22k/tts_stats_raw_char/valid/speech_shape
         
     | 
| 85 | 
         
             
            batch_type: numel
         
     | 
| 86 | 
         
             
            valid_batch_type: null
         
     | 
| 87 | 
         
             
            fold_length:
         
     | 
| 
         | 
|
| 119 | 
         
             
            valid_max_cache_size: null
         
     | 
| 120 | 
         
             
            exclude_weight_decay: false
         
     | 
| 121 | 
         
             
            exclude_weight_decay_conf: {}
         
     | 
| 122 | 
         
            +
            optim: adam
         
     | 
| 123 | 
         
             
            optim_conf:
         
     | 
| 124 | 
         
            +
                lr: 1.25e-05
         
     | 
| 125 | 
         
             
                betas:
         
     | 
| 126 | 
         
            +
                - 0.5
         
     | 
| 127 | 
         
            +
                - 0.9
         
     | 
| 
         | 
|
| 128 | 
         
             
                weight_decay: 0.0
         
     | 
| 129 | 
         
             
            scheduler: exponentiallr
         
     | 
| 130 | 
         
             
            scheduler_conf:
         
     | 
| 131 | 
         
             
                gamma: 0.999875
         
     | 
| 132 | 
         
            +
            optim2: adam
         
     | 
| 133 | 
         
             
            optim2_conf:
         
     | 
| 134 | 
         
            +
                lr: 1.25e-05
         
     | 
| 135 | 
         
             
                betas:
         
     | 
| 136 | 
         
            +
                - 0.5
         
     | 
| 137 | 
         
            +
                - 0.9
         
     | 
| 
         | 
|
| 138 | 
         
             
                weight_decay: 0.0
         
     | 
| 139 | 
         
             
            scheduler2: exponentiallr
         
     | 
| 140 | 
         
             
            scheduler2_conf:
         
     | 
| 141 | 
         
             
                gamma: 0.999875
         
     | 
| 142 | 
         
            +
            generator_first: true
         
     | 
| 143 | 
         
             
            token_list:
         
     | 
| 144 | 
         
             
            - <blank>
         
     | 
| 145 | 
         
             
            - <unk>
         
     | 
| 
         | 
|
| 161 | 
         
             
            - к
         
     | 
| 162 | 
         
             
            - м
         
     | 
| 163 | 
         
             
            - п
         
     | 
| 
         | 
|
| 164 | 
         
             
            - я
         
     | 
| 165 | 
         
             
            - з
         
     | 
| 166 | 
         
             
            - ','
         
     | 
| 167 | 
         
             
            - б
         
     | 
| 168 | 
         
             
            - ь
         
     | 
| 
         | 
|
| 169 | 
         
             
            - г
         
     | 
| 170 | 
         
            +
            - ч
         
     | 
| 171 | 
         
             
            - й
         
     | 
| 172 | 
         
             
            - ж
         
     | 
| 173 | 
         
             
            - х
         
     | 
| 
         | 
|
| 182 | 
         
             
            - '!'
         
     | 
| 183 | 
         
             
            - ''''
         
     | 
| 184 | 
         
             
            - ф
         
     | 
| 185 | 
         
            +
            - .
         
     | 
| 186 | 
         
             
            - '"'
         
     | 
| 
         | 
|
| 187 | 
         
             
            - ґ
         
     | 
| 188 | 
         
            +
            - ':'
         
     | 
| 
         | 
|
| 
         | 
|
| 189 | 
         
             
            - /
         
     | 
| 190 | 
         
            +
            - „
         
     | 
| 191 | 
         
             
            - <sos/eos>
         
     | 
| 192 | 
         
             
            odim: null
         
     | 
| 193 | 
         
             
            model_conf: {}
         
     | 
| 
         | 
|
| 197 | 
         
             
            non_linguistic_symbols: null
         
     | 
| 198 | 
         
             
            cleaner: null
         
     | 
| 199 | 
         
             
            g2p: g2p_en
         
     | 
| 200 | 
         
            +
            feats_extract: fbank
         
     | 
| 201 | 
         
             
            feats_extract_conf:
         
     | 
| 202 | 
         
             
                n_fft: 1024
         
     | 
| 203 | 
         
             
                hop_length: 256
         
     | 
| 204 | 
         
             
                win_length: null
         
     | 
| 205 | 
         
            +
                fs: 22050
         
     | 
| 206 | 
         
            +
                fmin: 80
         
     | 
| 207 | 
         
            +
                fmax: 7600
         
     | 
| 208 | 
         
            +
                n_mels: 80
         
     | 
| 209 | 
         
            +
            normalize: global_mvn
         
     | 
| 210 | 
         
            +
            normalize_conf:
         
     | 
| 211 | 
         
            +
                stats_file: feats_stats.npz
         
     | 
| 212 | 
         
            +
            tts: joint_text2wav
         
     | 
| 213 | 
         
             
            tts_conf:
         
     | 
| 214 | 
         
            +
                text2mel_type: tacotron2
         
     | 
| 215 | 
         
            +
                text2mel_params:
         
     | 
| 216 | 
         
            +
                    embed_dim: 512
         
     | 
| 217 | 
         
            +
                    elayers: 1
         
     | 
| 218 | 
         
            +
                    eunits: 512
         
     | 
| 219 | 
         
            +
                    econv_layers: 3
         
     | 
| 220 | 
         
            +
                    econv_chans: 512
         
     | 
| 221 | 
         
            +
                    econv_filts: 5
         
     | 
| 222 | 
         
            +
                    atype: location
         
     | 
| 223 | 
         
            +
                    adim: 512
         
     | 
| 224 | 
         
            +
                    aconv_chans: 32
         
     | 
| 225 | 
         
            +
                    aconv_filts: 15
         
     | 
| 226 | 
         
            +
                    cumulate_att_w: true
         
     | 
| 227 | 
         
            +
                    dlayers: 2
         
     | 
| 228 | 
         
            +
                    dunits: 1024
         
     | 
| 229 | 
         
            +
                    prenet_layers: 2
         
     | 
| 230 | 
         
            +
                    prenet_units: 256
         
     | 
| 231 | 
         
            +
                    postnet_layers: 5
         
     | 
| 232 | 
         
            +
                    postnet_chans: 512
         
     | 
| 233 | 
         
            +
                    postnet_filts: 5
         
     | 
| 234 | 
         
            +
                    output_activation: null
         
     | 
| 235 | 
         
            +
                    use_batch_norm: true
         
     | 
| 236 | 
         
            +
                    use_concate: true
         
     | 
| 237 | 
         
            +
                    use_residual: false
         
     | 
| 238 | 
         
             
                    spk_embed_dim: 192
         
     | 
| 239 | 
         
            +
                    spk_embed_integration_type: add
         
     | 
| 240 | 
         
            +
                    dropout_rate: 0.5
         
     | 
| 241 | 
         
            +
                    zoneout_rate: 0.1
         
     | 
| 242 | 
         
            +
                    reduction_factor: 1
         
     | 
| 243 | 
         
            +
                    use_masking: true
         
     | 
| 244 | 
         
            +
                    bce_pos_weight: 10.0
         
     | 
| 245 | 
         
            +
                    use_guided_attn_loss: true
         
     | 
| 246 | 
         
            +
                    guided_attn_loss_sigma: 0.4
         
     | 
| 247 | 
         
            +
                    guided_attn_loss_lambda: 1.0
         
     | 
| 248 | 
         
            +
                    idim: 48
         
     | 
| 249 | 
         
            +
                    odim: 80
         
     | 
| 250 | 
         
            +
                vocoder_type: hifigan_generator
         
     | 
| 251 | 
         
            +
                vocoder_params:
         
     | 
| 252 | 
         
            +
                    bias: true
         
     | 
| 253 | 
         
            +
                    channels: 512
         
     | 
| 254 | 
         
            +
                    in_channels: 80
         
     | 
| 255 | 
         
            +
                    kernel_size: 7
         
     | 
| 256 | 
         
            +
                    nonlinear_activation: LeakyReLU
         
     | 
| 257 | 
         
            +
                    nonlinear_activation_params:
         
     | 
| 258 | 
         
            +
                        negative_slope: 0.1
         
     | 
| 259 | 
         
            +
                    out_channels: 1
         
     | 
| 260 | 
         
            +
                    resblock_dilations:
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 261 | 
         
             
                    -   - 1
         
     | 
| 262 | 
         
             
                        - 3
         
     | 
| 263 | 
         
             
                        - 5
         
     | 
| 
         | 
|
| 267 | 
         
             
                    -   - 1
         
     | 
| 268 | 
         
             
                        - 3
         
     | 
| 269 | 
         
             
                        - 5
         
     | 
| 270 | 
         
            +
                    resblock_kernel_sizes:
         
     | 
| 271 | 
         
            +
                    - 3
         
     | 
| 272 | 
         
            +
                    - 7
         
     | 
| 273 | 
         
            +
                    - 11
         
     | 
| 274 | 
         
            +
                    upsample_kernel_sizes:
         
     | 
| 275 | 
         
            +
                    - 16
         
     | 
| 276 | 
         
            +
                    - 16
         
     | 
| 277 | 
         
            +
                    - 4
         
     | 
| 278 | 
         
            +
                    - 4
         
     | 
| 279 | 
         
            +
                    upsample_scales:
         
     | 
| 280 | 
         
            +
                    - 8
         
     | 
| 281 | 
         
            +
                    - 8
         
     | 
| 282 | 
         
            +
                    - 2
         
     | 
| 283 | 
         
            +
                    - 2
         
     | 
| 284 | 
         
            +
                    use_additional_convs: true
         
     | 
| 285 | 
         
            +
                    use_weight_norm: true
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 286 | 
         
             
                discriminator_type: hifigan_multi_scale_multi_period_discriminator
         
     | 
| 287 | 
         
             
                discriminator_params:
         
     | 
| 288 | 
         
            +
                    follow_official_norm: true
         
     | 
| 289 | 
         
            +
                    period_discriminator_params:
         
     | 
| 290 | 
         
            +
                        bias: true
         
     | 
| 291 | 
         
            +
                        channels: 32
         
     | 
| 292 | 
         
            +
                        downsample_scales:
         
     | 
| 293 | 
         
            +
                        - 3
         
     | 
| 294 | 
         
            +
                        - 3
         
     | 
| 295 | 
         
            +
                        - 3
         
     | 
| 296 | 
         
            +
                        - 3
         
     | 
| 297 | 
         
            +
                        - 1
         
     | 
| 298 | 
         
             
                        in_channels: 1
         
     | 
| 
         | 
|
| 299 | 
         
             
                        kernel_sizes:
         
     | 
| 
         | 
|
| 
         | 
|
| 300 | 
         
             
                        - 5
         
     | 
| 301 | 
         
             
                        - 3
         
     | 
| 
         | 
|
| 302 | 
         
             
                        max_downsample_channels: 1024
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 303 | 
         
             
                        nonlinear_activation: LeakyReLU
         
     | 
| 304 | 
         
             
                        nonlinear_activation_params:
         
     | 
| 305 | 
         
             
                            negative_slope: 0.1
         
     | 
| 306 | 
         
            +
                        out_channels: 1
         
     | 
| 307 | 
         
             
                        use_spectral_norm: false
         
     | 
| 308 | 
         
            +
                        use_weight_norm: true
         
     | 
| 309 | 
         
             
                    periods:
         
     | 
| 310 | 
         
             
                    - 2
         
     | 
| 311 | 
         
             
                    - 3
         
     | 
| 312 | 
         
             
                    - 5
         
     | 
| 313 | 
         
             
                    - 7
         
     | 
| 314 | 
         
             
                    - 11
         
     | 
| 315 | 
         
            +
                    scale_discriminator_params:
         
     | 
| 316 | 
         
            +
                        bias: true
         
     | 
| 317 | 
         
            +
                        channels: 128
         
     | 
| 318 | 
         
            +
                        downsample_scales:
         
     | 
| 319 | 
         
            +
                        - 4
         
     | 
| 320 | 
         
            +
                        - 4
         
     | 
| 321 | 
         
            +
                        - 4
         
     | 
| 322 | 
         
            +
                        - 4
         
     | 
| 323 | 
         
            +
                        - 1
         
     | 
| 324 | 
         
             
                        in_channels: 1
         
     | 
| 
         | 
|
| 325 | 
         
             
                        kernel_sizes:
         
     | 
| 326 | 
         
            +
                        - 15
         
     | 
| 327 | 
         
            +
                        - 41
         
     | 
| 328 | 
         
             
                        - 5
         
     | 
| 329 | 
         
             
                        - 3
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 330 | 
         
             
                        max_downsample_channels: 1024
         
     | 
| 331 | 
         
            +
                        max_groups: 16
         
     | 
| 332 | 
         
             
                        nonlinear_activation: LeakyReLU
         
     | 
| 333 | 
         
             
                        nonlinear_activation_params:
         
     | 
| 334 | 
         
             
                            negative_slope: 0.1
         
     | 
| 335 | 
         
            +
                        out_channels: 1
         
     | 
| 336 | 
         
            +
                    scale_downsample_pooling: AvgPool1d
         
     | 
| 337 | 
         
            +
                    scale_downsample_pooling_params:
         
     | 
| 338 | 
         
            +
                        kernel_size: 4
         
     | 
| 339 | 
         
            +
                        padding: 2
         
     | 
| 340 | 
         
            +
                        stride: 2
         
     | 
| 341 | 
         
            +
                    scales: 3
         
     | 
| 342 | 
         
             
                generator_adv_loss_params:
         
     | 
| 343 | 
         
             
                    average_by_discriminators: false
         
     | 
| 344 | 
         
             
                    loss_type: mse
         
     | 
| 345 | 
         
             
                discriminator_adv_loss_params:
         
     | 
| 346 | 
         
             
                    average_by_discriminators: false
         
     | 
| 347 | 
         
             
                    loss_type: mse
         
     | 
| 348 | 
         
            +
                use_feat_match_loss: true
         
     | 
| 349 | 
         
             
                feat_match_loss_params:
         
     | 
| 350 | 
         
             
                    average_by_discriminators: false
         
     | 
| 351 | 
         
             
                    average_by_layers: false
         
     | 
| 352 | 
         
             
                    include_final_outputs: true
         
     | 
| 353 | 
         
            +
                use_mel_loss: true
         
     | 
| 354 | 
         
             
                mel_loss_params:
         
     | 
| 355 | 
         
             
                    fs: 22050
         
     | 
| 356 | 
         
             
                    n_fft: 1024
         
     | 
| 
         | 
|
| 361 | 
         
             
                    fmin: 0
         
     | 
| 362 | 
         
             
                    fmax: null
         
     | 
| 363 | 
         
             
                    log_base: null
         
     | 
| 364 | 
         
            +
                lambda_text2mel: 1.0
         
     | 
| 365 | 
         
             
                lambda_adv: 1.0
         
     | 
| 366 | 
         
             
                lambda_mel: 45.0
         
     | 
| 367 | 
         
             
                lambda_feat_match: 2.0
         
     | 
| 
         | 
|
| 
         | 
|
| 368 | 
         
             
                sampling_rate: 22050
         
     | 
| 369 | 
         
            +
                segment_size: 32
         
     | 
| 370 | 
         
             
                cache_generator_outputs: true
         
     | 
| 371 | 
         
             
            pitch_extract: null
         
     | 
| 372 | 
         
             
            pitch_extract_conf: {}
         
     | 
    	
        requirements.txt
    CHANGED
    
    | 
         @@ -1,6 +1,6 @@ 
     | 
|
| 1 | 
         
             
            # requirements for HuggingFace demo. Installs local package.
         
     | 
| 2 | 
         
             
            torch
         
     | 
| 3 | 
         
            -
            espnet 
     | 
| 4 | 
         
             
            typeguard<3 # typeguard 3.0.0 is incompatible with espnet
         
     | 
| 5 | 
         
             
            git+https://github.com/savoirfairelinux/num2words.git@3e39091d052829fc9e65c18176ce7b7ff6169772
         
     | 
| 6 | 
         
             
            ukrainian-word-stress==1.0.2
         
     | 
| 
         | 
|
| 1 | 
         
             
            # requirements for HuggingFace demo. Installs local package.
         
     | 
| 2 | 
         
             
            torch
         
     | 
| 3 | 
         
            +
            espnet==202301
         
     | 
| 4 | 
         
             
            typeguard<3 # typeguard 3.0.0 is incompatible with espnet
         
     | 
| 5 | 
         
             
            git+https://github.com/savoirfairelinux/num2words.git@3e39091d052829fc9e65c18176ce7b7ff6169772
         
     | 
| 6 | 
         
             
            ukrainian-word-stress==1.0.2
         
     | 
    	
        setup.py
    CHANGED
    
    | 
         @@ -3,7 +3,7 @@ from setuptools import setup, find_packages 
     | 
|
| 3 | 
         | 
| 4 | 
         
             
            setup(
         
     | 
| 5 | 
         
             
                name="ukrainian-tts",
         
     | 
| 6 | 
         
            -
                version=" 
     | 
| 7 | 
         
             
                description="Ukrainian TTS using ESPNET",
         
     | 
| 8 | 
         
             
                author="Yurii Paniv",
         
     | 
| 9 | 
         
             
                author_email="[email protected]",
         
     | 
| 
         @@ -12,7 +12,7 @@ setup( 
     | 
|
| 12 | 
         
             
                packages=find_packages(),
         
     | 
| 13 | 
         
             
                python_requires=">3.6.0",
         
     | 
| 14 | 
         
             
                install_requires=[
         
     | 
| 15 | 
         
            -
                    "espnet 
     | 
| 16 | 
         
             
                    "typeguard<3",
         
     | 
| 17 | 
         
             
                    "num2words @ git+https://github.com/savoirfairelinux/num2words.git@3e39091d052829fc9e65c18176ce7b7ff6169772",
         
     | 
| 18 | 
         
             
                    "ukrainian-word-stress==1.0.2",
         
     | 
| 
         | 
|
| 3 | 
         | 
| 4 | 
         
             
            setup(
         
     | 
| 5 | 
         
             
                name="ukrainian-tts",
         
     | 
| 6 | 
         
            +
                version="6.0",
         
     | 
| 7 | 
         
             
                description="Ukrainian TTS using ESPNET",
         
     | 
| 8 | 
         
             
                author="Yurii Paniv",
         
     | 
| 9 | 
         
             
                author_email="[email protected]",
         
     | 
| 
         | 
|
| 12 | 
         
             
                packages=find_packages(),
         
     | 
| 13 | 
         
             
                python_requires=">3.6.0",
         
     | 
| 14 | 
         
             
                install_requires=[
         
     | 
| 15 | 
         
            +
                    "espnet==202301",
         
     | 
| 16 | 
         
             
                    "typeguard<3",
         
     | 
| 17 | 
         
             
                    "num2words @ git+https://github.com/savoirfairelinux/num2words.git@3e39091d052829fc9e65c18176ce7b7ff6169772",
         
     | 
| 18 | 
         
             
                    "ukrainian-word-stress==1.0.2",
         
     | 
    	
        ukrainian_tts/tts.py
    CHANGED
    
    | 
         @@ -19,6 +19,7 @@ class Voices(Enum): 
     | 
|
| 19 | 
         
             
                Mykyta = "mykyta"
         
     | 
| 20 | 
         
             
                Lada = "lada"
         
     | 
| 21 | 
         
             
                Dmytro = "dmytro"
         
     | 
| 
         | 
|
| 22 | 
         | 
| 23 | 
         | 
| 24 | 
         
             
            class Stress(Enum):
         
     | 
| 
         @@ -41,7 +42,7 @@ class TTS: 
     | 
|
| 41 | 
         
             
                    self.device = device
         
     | 
| 42 | 
         
             
                    self.__setup_cache(cache_folder)
         
     | 
| 43 | 
         | 
| 44 | 
         
            -
                def tts(self, text: str, voice: str, stress: str, output_fp=BytesIO() 
     | 
| 45 | 
         
             
                    """
         
     | 
| 46 | 
         
             
                    Run a Text-to-Speech engine and output to `output_fp` BytesIO-like object.
         
     | 
| 47 | 
         
             
                    - `text` - your model input text.
         
     | 
| 
         @@ -71,9 +72,7 @@ class TTS: 
     | 
|
| 71 | 
         
             
                    # synthesis
         
     | 
| 72 | 
         
             
                    with no_grad():
         
     | 
| 73 | 
         
             
                        start = time.time()
         
     | 
| 74 | 
         
            -
                        wav = self.synthesizer(
         
     | 
| 75 | 
         
            -
                            text, spembs=self.xvectors[voice][0], decode_conf={"alpha": 1 / speed}
         
     | 
| 76 | 
         
            -
                        )["wav"]
         
     | 
| 77 | 
         | 
| 78 | 
         
             
                    rtf = (time.time() - start) / (len(wav) / self.synthesizer.fs)
         
     | 
| 79 | 
         
             
                    print(f"RTF = {rtf:5f}")
         
     | 
| 
         @@ -99,6 +98,7 @@ class TTS: 
     | 
|
| 99 | 
         
             
                    model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/model.pth"
         
     | 
| 100 | 
         
             
                    config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/config.yaml"
         
     | 
| 101 | 
         
             
                    speakers_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/spk_xvector.ark"
         
     | 
| 
         | 
|
| 102 | 
         | 
| 103 | 
         
             
                    if cache_folder is None:
         
     | 
| 104 | 
         
             
                        cache_folder = "."
         
     | 
| 
         @@ -106,19 +106,16 @@ class TTS: 
     | 
|
| 106 | 
         
             
                    model_path = join(cache_folder, "model.pth")
         
     | 
| 107 | 
         
             
                    config_path = join(cache_folder, "config.yaml")
         
     | 
| 108 | 
         
             
                    speakers_path = join(cache_folder, "spk_xvector.ark")
         
     | 
| 
         | 
|
| 109 | 
         | 
| 110 | 
         
             
                    self.__download(model_link, model_path)
         
     | 
| 111 | 
         
             
                    self.__download(config_link, config_path)
         
     | 
| 112 | 
         
             
                    self.__download(speakers_link, speakers_path)
         
     | 
| 
         | 
|
| 113 | 
         
             
                    print("downloaded.")
         
     | 
| 114 | 
         | 
| 115 | 
         
             
                    self.synthesizer = Text2Speech(
         
     | 
| 116 | 
         
            -
                        train_config=config_path,
         
     | 
| 117 | 
         
            -
                        model_file=model_path,
         
     | 
| 118 | 
         
            -
                        device=self.device,
         
     | 
| 119 | 
         
            -
                        # Only for VITS
         
     | 
| 120 | 
         
            -
                        noise_scale=0.333,
         
     | 
| 121 | 
         
            -
                        noise_scale_dur=0.333,
         
     | 
| 122 | 
         
             
                    )
         
     | 
| 123 | 
         
             
                    self.xvectors = {k: v for k, v in load_ark(speakers_path)}
         
     | 
| 124 | 
         | 
| 
         | 
|
| 19 | 
         
             
                Mykyta = "mykyta"
         
     | 
| 20 | 
         
             
                Lada = "lada"
         
     | 
| 21 | 
         
             
                Dmytro = "dmytro"
         
     | 
| 22 | 
         
            +
                Oleksa = "oleksa"
         
     | 
| 23 | 
         | 
| 24 | 
         | 
| 25 | 
         
             
            class Stress(Enum):
         
     | 
| 
         | 
|
| 42 | 
         
             
                    self.device = device
         
     | 
| 43 | 
         
             
                    self.__setup_cache(cache_folder)
         
     | 
| 44 | 
         | 
| 45 | 
         
            +
                def tts(self, text: str, voice: str, stress: str, output_fp=BytesIO()):
         
     | 
| 46 | 
         
             
                    """
         
     | 
| 47 | 
         
             
                    Run a Text-to-Speech engine and output to `output_fp` BytesIO-like object.
         
     | 
| 48 | 
         
             
                    - `text` - your model input text.
         
     | 
| 
         | 
|
| 72 | 
         
             
                    # synthesis
         
     | 
| 73 | 
         
             
                    with no_grad():
         
     | 
| 74 | 
         
             
                        start = time.time()
         
     | 
| 75 | 
         
            +
                        wav = self.synthesizer(text, spembs=self.xvectors[voice][0])["wav"]
         
     | 
| 
         | 
|
| 
         | 
|
| 76 | 
         | 
| 77 | 
         
             
                    rtf = (time.time() - start) / (len(wav) / self.synthesizer.fs)
         
     | 
| 78 | 
         
             
                    print(f"RTF = {rtf:5f}")
         
     | 
| 
         | 
|
| 98 | 
         
             
                    model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/model.pth"
         
     | 
| 99 | 
         
             
                    config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/config.yaml"
         
     | 
| 100 | 
         
             
                    speakers_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/spk_xvector.ark"
         
     | 
| 101 | 
         
            +
                    feat_stats_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/feat_stats.npz"
         
     | 
| 102 | 
         | 
| 103 | 
         
             
                    if cache_folder is None:
         
     | 
| 104 | 
         
             
                        cache_folder = "."
         
     | 
| 
         | 
|
| 106 | 
         
             
                    model_path = join(cache_folder, "model.pth")
         
     | 
| 107 | 
         
             
                    config_path = join(cache_folder, "config.yaml")
         
     | 
| 108 | 
         
             
                    speakers_path = join(cache_folder, "spk_xvector.ark")
         
     | 
| 109 | 
         
            +
                    feat_stats_path = join(cache_folder, "feats_stats.npz")
         
     | 
| 110 | 
         | 
| 111 | 
         
             
                    self.__download(model_link, model_path)
         
     | 
| 112 | 
         
             
                    self.__download(config_link, config_path)
         
     | 
| 113 | 
         
             
                    self.__download(speakers_link, speakers_path)
         
     | 
| 114 | 
         
            +
                    self.__download(feat_stats_link, feat_stats_path)
         
     | 
| 115 | 
         
             
                    print("downloaded.")
         
     | 
| 116 | 
         | 
| 117 | 
         
             
                    self.synthesizer = Text2Speech(
         
     | 
| 118 | 
         
            +
                        train_config=config_path, model_file=model_path, device=self.device
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 119 | 
         
             
                    )
         
     | 
| 120 | 
         
             
                    self.xvectors = {k: v for k, v in load_ark(speakers_path)}
         
     | 
| 121 | 
         |