Upload 159 files
This view is limited to 50 files because it contains too many changes; see the raw diff for the complete changeset.
- .gitattributes +37 -0
- .gitignore +2 -0
- .project-root +0 -0
- README.md +12 -0
- app.py +52 -0
- checkpoints/fish-speech-1.4-sft-yth-lora/config.json +21 -0
- checkpoints/fish-speech-1.4-sft-yth-lora/model.pth +3 -0
- checkpoints/fish-speech-1.4-sft-yth-lora/special_tokens_map.json +23 -0
- checkpoints/fish-speech-1.4-sft-yth-lora/tokenizer.json +0 -0
- checkpoints/fish-speech-1.4-sft-yth-lora/tokenizer_config.json +82 -0
- checkpoints/fish-speech-1.4/README.md +61 -0
- checkpoints/fish-speech-1.4/config.json +21 -0
- checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth +3 -0
- checkpoints/fish-speech-1.4/model.pth +3 -0
- checkpoints/fish-speech-1.4/special_tokens_map.json +23 -0
- checkpoints/fish-speech-1.4/tokenizer.json +0 -0
- checkpoints/fish-speech-1.4/tokenizer_config.json +82 -0
- configs/base.yaml +87 -0
- configs/firefly_gan_vq.yaml +33 -0
- configs/lora/r_8_alpha_16.yaml +4 -0
- configs/text2semantic_finetune.yaml +83 -0
- examples/40_matthew-001-01.lab +1 -0
- examples/40_matthew-001-01.wav +3 -0
- fish_speech/callbacks/__init__.py +3 -0
- fish_speech/callbacks/__pycache__/__init__.cpython-310.pyc +0 -0
- fish_speech/callbacks/__pycache__/grad_norm.cpython-310.pyc +0 -0
- fish_speech/callbacks/grad_norm.py +113 -0
- fish_speech/configs/base.yaml +87 -0
- fish_speech/configs/firefly_gan_vq.yaml +33 -0
- fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
- fish_speech/configs/text2semantic_finetune.yaml +83 -0
- fish_speech/conversation.py +2 -0
- fish_speech/datasets/__pycache__/semantic.cpython-310.pyc +0 -0
- fish_speech/datasets/concat_repeat.py +53 -0
- fish_speech/datasets/protos/__pycache__/text_data_pb2.cpython-310.pyc +0 -0
- fish_speech/datasets/protos/__pycache__/text_data_stream.cpython-310.pyc +0 -0
- fish_speech/datasets/protos/text-data.proto +24 -0
- fish_speech/datasets/protos/text_data_pb2.py +33 -0
- fish_speech/datasets/protos/text_data_stream.py +36 -0
- fish_speech/datasets/semantic.py +496 -0
- fish_speech/datasets/vqgan.py +147 -0
- fish_speech/i18n/README.md +27 -0
- fish_speech/i18n/__init__.py +3 -0
- fish_speech/i18n/__pycache__/__init__.cpython-310.pyc +0 -0
- fish_speech/i18n/__pycache__/core.cpython-310.pyc +0 -0
- fish_speech/i18n/core.py +40 -0
- fish_speech/i18n/locale/en_US.json +122 -0
- fish_speech/i18n/locale/es_ES.json +122 -0
- fish_speech/i18n/locale/ja_JP.json +123 -0
- fish_speech/i18n/locale/pt_BR.json +133 -0
.gitattributes
ADDED
@@ -0,0 +1,37 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+excemple/40_matthew-001-01.wav filter=lfs diff=lfs merge=lfs -text
+examples/40_matthew-001-01.wav filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,2 @@
+__pycache__
+checkpoints
.project-root
ADDED
File without changes
README.md
ADDED
@@ -0,0 +1,12 @@
+---
+title: Demo
+emoji: 👁
+colorFrom: gray
+colorTo: gray
+sdk: gradio
+sdk_version: 5.0.1
+app_file: app.py
+pinned: false
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,52 @@
+import argparse
+import subprocess
+import torch
+import spaces
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+@torch.no_grad()
+def main():
+    # Set up the command-line argument parser
+    parser = argparse.ArgumentParser(description="Launch the WebUI")
+    parser.add_argument(
+        "--llama-checkpoint-path",
+        type=str,
+        default="checkpoints/fish-speech-1.4-sft-yth-lora",
+        help="Path to the Llama checkpoint",
+    )
+    parser.add_argument(
+        "--decoder-checkpoint-path",
+        type=str,
+        default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
+        help="Path to the decoder checkpoint",
+    )
+    parser.add_argument(
+        "--decoder-config-name",
+        type=str,
+        default="firefly_gan_vq",
+        help="Decoder config name",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cpu",
+        help="Device type",
+    )
+
+    # Parse the command-line arguments
+    args = parser.parse_args()
+
+    # Launch the WebUI
+    subprocess.run([
+        "python",
+        "tools/webui.py",
+        "--llama-checkpoint-path", args.llama_checkpoint_path,
+        "--decoder-checkpoint-path", args.decoder_checkpoint_path,
+        "--decoder-config-name", args.decoder_config_name,
+        "--device", args.device,
+    ])
+
+if __name__ == "__main__":
+    main()
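A note on the launcher above: argparse converts dashed option names into underscored attribute names, which is why `--llama-checkpoint-path` is read back as `args.llama_checkpoint_path`. A minimal, self-contained sketch (not part of this upload):

```python
# Sketch: argparse turns "--llama-checkpoint-path" into the attribute
# "llama_checkpoint_path" (dashes become underscores in the dest name).
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--llama-checkpoint-path",
    type=str,
    default="checkpoints/fish-speech-1.4-sft-yth-lora",
)
args = parser.parse_args([])  # empty argv, so the default is used
print(args.llama_checkpoint_path)
```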
checkpoints/fish-speech-1.4-sft-yth-lora/config.json
ADDED
@@ -0,0 +1,21 @@
+{
+  "attention_qkv_bias": false,
+  "codebook_size": 1024,
+  "dim": 1024,
+  "dropout": 0.1,
+  "head_dim": 64,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "max_seq_len": 4096,
+  "model_type": "dual_ar",
+  "n_fast_layer": 4,
+  "n_head": 16,
+  "n_layer": 24,
+  "n_local_heads": 2,
+  "norm_eps": 1e-06,
+  "num_codebooks": 8,
+  "rope_base": 1000000.0,
+  "tie_word_embeddings": false,
+  "use_gradient_checkpointing": true,
+  "vocab_size": 32000
+}
checkpoints/fish-speech-1.4-sft-yth-lora/model.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25a27344ad35a0514f9cd60869276d2b5bf23dde6b39fe8a0421050e06984246
+size 988997246
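The .pth entries in this diff are Git LFS pointer files rather than the weights themselves: three `key value` lines giving the spec version, the SHA-256 of the real blob, and its size in bytes. A minimal parsing sketch (assumes only the three-line format shown above):

```python
# Sketch: parse the three "key value" lines of a Git LFS pointer file.
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = (
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:25a27344ad35a0514f9cd60869276d2b5bf23dde6b39fe8a0421050e06984246\n"
    "size 988997246\n"
)
info = parse_lfs_pointer(pointer)
print(info["oid"], int(info["size"]))  # the blob is ~989 MB
```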
checkpoints/fish-speech-1.4-sft-yth-lora/special_tokens_map.json
ADDED
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<|begin_of_sequence|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|end_of_sequence|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|pad|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
checkpoints/fish-speech-1.4-sft-yth-lora/tokenizer.json
ADDED
The diff for this file is too large to render.
checkpoints/fish-speech-1.4-sft-yth-lora/tokenizer_config.json
ADDED
@@ -0,0 +1,82 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|begin_of_sequence|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|end_of_sequence|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<|pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<|semantic|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<|mel|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32000": {
+      "content": "<|reserve_0|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|reserve_1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|begin_of_sequence|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|end_of_sequence|>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|pad|>",
+  "tokenizer_class": "PreTrainedTokenizerFast"
+}
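A usage sketch for the tokenizer files above (assumes the transformers library is installed and the checkpoint directory exists locally):

```python
# Sketch: load the tokenizer from the checkpoint directory and inspect the
# special tokens declared in special_tokens_map.json / tokenizer_config.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("checkpoints/fish-speech-1.4-sft-yth-lora")
print(tok.bos_token, tok.eos_token, tok.pad_token)
print(tok.convert_tokens_to_ids("<|semantic|>"))  # 5, per added_tokens_decoder
```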
checkpoints/fish-speech-1.4/README.md
ADDED
@@ -0,0 +1,61 @@
+---
+tags:
+- text-to-speech
+license: cc-by-nc-sa-4.0
+language:
+- zh
+- en
+- de
+- ja
+- fr
+- es
+- ko
+- ar
+pipeline_tag: text-to-speech
+inference: false
+extra_gated_prompt: >-
+  You agree to not use the model to generate contents that violate DMCA or local
+  laws.
+extra_gated_fields:
+  Country: country
+  Specific date: date_picker
+  I agree to use this model for non-commercial use ONLY: checkbox
+---
+
+
+# Fish Speech V1.4
+
+**Fish Speech V1.4** is a leading text-to-speech (TTS) model trained on 700k hours of audio data in multiple languages.
+
+Supported languages:
+- English (en) ~300k hours
+- Chinese (zh) ~300k hours
+- German (de) ~20k hours
+- Japanese (ja) ~20k hours
+- French (fr) ~20k hours
+- Spanish (es) ~20k hours
+- Korean (ko) ~20k hours
+- Arabic (ar) ~20k hours
+
+Please refer to [Fish Speech Github](https://github.com/fishaudio/fish-speech) for more info.
+Demo available at [Fish Audio](https://fish.audio/).
+
+## Citation
+
+If you found this repository useful, please consider citing this work:
+
+```
+@misc{fish-speech-v1.4,
+      author = {Shijia Liao, Tianyu Li, etc},
+      title = {Fish Speech V1.4},
+      year = {2024},
+      publisher = {GitHub},
+      journal = {GitHub repository},
+      howpublished = {\url{https://github.com/fishaudio/fish-speech}}
+}
+```
+
+## License
+
+This model is licensed under the CC BY-NC-SA 4.0 license.
+The source code is released under the BSD-3-Clause license.
checkpoints/fish-speech-1.4/config.json
ADDED
@@ -0,0 +1,21 @@
+{
+  "attention_qkv_bias": false,
+  "codebook_size": 1024,
+  "dim": 1024,
+  "dropout": 0.1,
+  "head_dim": 64,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "max_seq_len": 4096,
+  "model_type": "dual_ar",
+  "n_fast_layer": 4,
+  "n_head": 16,
+  "n_layer": 24,
+  "n_local_heads": 2,
+  "norm_eps": 1e-06,
+  "num_codebooks": 8,
+  "rope_base": 1000000.0,
+  "tie_word_embeddings": false,
+  "use_gradient_checkpointing": true,
+  "vocab_size": 32000
+}
checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:01b81dbf753224a156c3fe139b88bf0b9a0f54b11bee864f95e66511c3ccd754
+size 188518579
checkpoints/fish-speech-1.4/model.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d1cfa4b59c37f58d22e0626a53cec61db79390d7d0733b6402bf6f69fe58b93
+size 988988542
checkpoints/fish-speech-1.4/special_tokens_map.json
ADDED
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<|begin_of_sequence|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|end_of_sequence|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|pad|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
checkpoints/fish-speech-1.4/tokenizer.json
ADDED
The diff for this file is too large to render.
checkpoints/fish-speech-1.4/tokenizer_config.json
ADDED
@@ -0,0 +1,82 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|begin_of_sequence|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|end_of_sequence|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<|pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<|semantic|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<|mel|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<|reserve_0|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<|reserve_1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|begin_of_sequence|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|end_of_sequence|>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|pad|>",
+  "tokenizer_class": "PreTrainedTokenizerFast"
+}
configs/base.yaml
ADDED
@@ -0,0 +1,87 @@
+# Base configuration for training a model
+paths:
+  run_dir: results/${project}
+  ckpt_dir: ${paths.run_dir}/checkpoints
+
+hydra:
+  run:
+    dir: ${paths.run_dir}
+
+# Lightning Trainer
+trainer:
+  _target_: lightning.pytorch.trainer.Trainer
+
+  default_root_dir: ${paths.run_dir}
+  accelerator: gpu
+  num_nodes: 1
+  devices: auto
+  strategy:
+    _target_: lightning.pytorch.strategies.DDPStrategy
+    process_group_backend: nccl  # This should be overridden when training on Windows
+
+  precision: bf16-mixed
+
+  # disable validation at epoch end
+  check_val_every_n_epoch: null
+  val_check_interval: 5000
+  max_steps: 100_000
+
+  # Use torch.backends.cudnn.benchmark to speed up training
+  benchmark: true
+
+# Callbacks
+callbacks:
+  model_checkpoint:
+    _target_: lightning.pytorch.callbacks.ModelCheckpoint
+    dirpath: ${paths.ckpt_dir}
+    filename: "step_{step:09d}"
+    save_last: false  # additionally always save an exact copy of the last checkpoint to a file last.ckpt
+    save_top_k: 5  # save the 5 latest checkpoints
+    monitor: step  # use step to monitor checkpoints
+    mode: max  # save the latest checkpoint with the highest global_step
+    every_n_epochs: null  # don't save checkpoints at epoch end
+    every_n_train_steps: 5000  # save checkpoints every 5000 steps
+    auto_insert_metric_name: false
+
+  model_summary:
+    _target_: lightning.pytorch.callbacks.ModelSummary
+    max_depth: 2  # the maximum depth of layer nesting that the summary will include
+
+  learning_rate_monitor:
+    _target_: lightning.pytorch.callbacks.LearningRateMonitor
+    logging_interval: step
+    log_momentum: false
+
+  grad_norm_monitor:
+    _target_: fish_speech.callbacks.GradNormMonitor
+    norm_type: 2
+    logging_interval: step
+
+# Logger
+logger:
+  tensorboard:
+    _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger
+    save_dir: "${paths.run_dir}/tensorboard/"
+    name: null
+    log_graph: false
+    default_hp_metric: true
+    prefix: ""
+
+  # wandb:
+  #   _target_: lightning.pytorch.loggers.wandb.WandbLogger
+  #   # name: ""  # name of the run (normally generated by wandb)
+  #   save_dir: "${paths.run_dir}"
+  #   offline: False
+  #   id: null  # pass the correct id to resume an experiment!
+  #   anonymous: null  # enable anonymous logging
+  #   project: "fish-speech"
+  #   log_model: False  # upload lightning ckpts
+  #   prefix: ""  # a string to put at the beginning of metric keys
+  #   # entity: ""  # set to the name of your wandb team
+  #   group: ""
+  #   tags: ["vq", "hq", "finetune"]
+  #   job_type: ""
+
+# Loop
+train: true
+test: false
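Every block above carrying a `_target_` key is meant to be built by Hydra's recursive instantiation. A rough sketch of how such a config turns into live objects (assumes hydra-core, omegaconf, and lightning are installed; in the real training entry point the config is composed by Hydra rather than loaded by hand, and `${project}` is supplied by a child config such as text2semantic_finetune.yaml):

```python
# Sketch: instantiate the _target_ blocks of base.yaml by hand.
from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = OmegaConf.load("configs/base.yaml")
cfg.project = "demo"  # normally interpolated from the composed config

callbacks = [instantiate(c) for c in cfg.callbacks.values()]
trainer = instantiate(cfg.trainer, callbacks=callbacks)  # builds DDPStrategy too
```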
configs/firefly_gan_vq.yaml
ADDED
@@ -0,0 +1,33 @@
+_target_: fish_speech.models.vqgan.modules.firefly.FireflyArchitecture
+spec_transform:
+  _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
+  sample_rate: 44100
+  n_mels: 160
+  n_fft: 2048
+  hop_length: 512
+  win_length: 2048
+backbone:
+  _target_: fish_speech.models.vqgan.modules.firefly.ConvNeXtEncoder
+  input_channels: 160
+  depths: [3, 3, 9, 3]
+  dims: [128, 256, 384, 512]
+  drop_path_rate: 0.2
+  kernel_size: 7
+head:
+  _target_: fish_speech.models.vqgan.modules.firefly.HiFiGANGenerator
+  hop_length: 512
+  upsample_rates: [8, 8, 2, 2, 2]  # aka strides
+  upsample_kernel_sizes: [16, 16, 4, 4, 4]
+  resblock_kernel_sizes: [3, 7, 11]
+  resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+  num_mels: 512
+  upsample_initial_channel: 512
+  pre_conv_kernel_size: 13
+  post_conv_kernel_size: 13
+quantizer:
+  _target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
+  input_dim: 512
+  n_groups: 8
+  n_codebooks: 1
+  levels: [8, 5, 5, 5]
+  downsample_factor: [2, 2]
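A quick arithmetic check on the quantizer settings above (a sketch; exact rounding inside the model may differ): at 44.1 kHz with hop 512 the mel spectrogram runs at about 86 frames per second, and `downsample_factor: [2, 2]` divides that by 4, which matches the "21hz" in the generator checkpoint's filename; the 8 FSQ groups likewise line up with `num_codebooks: 8` in the dual-AR config.json above.

```python
# Sketch: frame rates implied by firefly_gan_vq.yaml.
sample_rate = 44100
hop_length = 512
downsample = 2 * 2  # downsample_factor: [2, 2]

mel_rate = sample_rate / hop_length   # ~86.13 mel frames per second
token_rate = mel_rate / downsample    # ~21.53 semantic tokens per second
print(f"{mel_rate:.2f} Hz -> {token_rate:.2f} Hz")
```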
configs/lora/r_8_alpha_16.yaml
ADDED
@@ -0,0 +1,4 @@
+_target_: fish_speech.models.text2semantic.lora.LoraConfig
+r: 8
+lora_alpha: 16
+lora_dropout: 0.01
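For orientation, a conventional reading of these hyperparameters (the actual consumer, fish_speech.models.text2semantic.lora.LoraConfig, is not included in this diff): a LoRA layer adds a rank-`r` low-rank update scaled by `lora_alpha / r`, here 16 / 8 = 2. A minimal sketch of that standard formulation:

```python
# Sketch of a conventional LoRA linear layer; illustrative only, not the
# fish_speech implementation (which this diff does not include).
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, r: int = 8, lora_alpha: int = 16,
                 lora_dropout: float = 0.01):
        super().__init__()
        self.base = base  # frozen pretrained projection
        self.lora_a = nn.Linear(base.in_features, r, bias=False)
        self.lora_b = nn.Linear(r, base.out_features, bias=False)
        nn.init.zeros_(self.lora_b.weight)  # update starts as a no-op
        self.dropout = nn.Dropout(lora_dropout)
        self.scaling = lora_alpha / r  # 16 / 8 = 2 with the config above

    def forward(self, x):
        return self.base(x) + self.lora_b(self.lora_a(self.dropout(x))) * self.scaling
```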
configs/text2semantic_finetune.yaml
ADDED
@@ -0,0 +1,83 @@
+defaults:
+  - base
+  - _self_
+
+project: text2semantic_finetune_dual_ar
+max_length: 4096
+pretrained_ckpt_path: checkpoints/fish-speech-1.4
+
+# Lightning Trainer
+trainer:
+  accumulate_grad_batches: 1
+  gradient_clip_val: 1.0
+  gradient_clip_algorithm: "norm"
+  max_steps: 1000
+  precision: bf16-true
+  limit_val_batches: 10
+  val_check_interval: 100
+
+# Tokenizer Configuration
+tokenizer:
+  _target_: transformers.AutoTokenizer.from_pretrained
+  pretrained_model_name_or_path: ${pretrained_ckpt_path}
+
+# Dataset Configuration
+train_dataset:
+  _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
+  proto_files:
+    - data/protos
+  tokenizer: ${tokenizer}
+  causal: true
+  max_length: ${max_length}
+  use_speaker: false
+  interactive_prob: 0.7
+
+val_dataset:
+  _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
+  proto_files:
+    - data/protos
+  tokenizer: ${tokenizer}
+  causal: true
+  max_length: ${max_length}
+  use_speaker: false
+  interactive_prob: 0.7
+
+data:
+  _target_: fish_speech.datasets.semantic.SemanticDataModule
+  train_dataset: ${train_dataset}
+  val_dataset: ${val_dataset}
+  num_workers: 4
+  batch_size: 8
+  tokenizer: ${tokenizer}
+  max_length: ${max_length}
+
+# Model Configuration
+model:
+  _target_: fish_speech.models.text2semantic.lit_module.TextToSemantic
+  model:
+    _target_: fish_speech.models.text2semantic.llama.BaseTransformer.from_pretrained
+    path: ${pretrained_ckpt_path}
+    load_weights: true
+    max_length: ${max_length}
+    lora_config: null
+
+  optimizer:
+    _target_: torch.optim.AdamW
+    _partial_: true
+    lr: 1e-4
+    weight_decay: 0
+    betas: [0.9, 0.95]
+    eps: 1e-5
+
+  lr_scheduler:
+    _target_: torch.optim.lr_scheduler.LambdaLR
+    _partial_: true
+    lr_lambda:
+      _target_: fish_speech.scheduler.get_constant_schedule_with_warmup_lr_lambda
+      _partial_: true
+      num_warmup_steps: 10
+
+# Callbacks
+callbacks:
+  model_checkpoint:
+    every_n_train_steps: ${trainer.val_check_interval}
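The `lr_scheduler` block above feeds a partially-applied `lr_lambda` into `torch.optim.lr_scheduler.LambdaLR`. The fish_speech.scheduler module itself is not part of this diff, but the usual constant-with-warmup multiplier such a function computes looks like this (a sketch under that assumption):

```python
# Sketch: the conventional constant-schedule-with-warmup multiplier.
# With num_warmup_steps=10, the LR ramps linearly over 10 steps, then
# holds at the configured lr (1e-4 above).
def constant_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int) -> float:
    if current_step < num_warmup_steps:
        return current_step / max(1, num_warmup_steps)
    return 1.0

print([round(constant_schedule_with_warmup_lr_lambda(s, num_warmup_steps=10), 1)
       for s in (0, 5, 10, 50)])  # [0.0, 0.5, 1.0, 1.0]
```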
examples/40_matthew-001-01.lab
ADDED
@@ -0,0 +1 @@
+Miyan qaniy qu binkgan bbinkesan na Yesu:Yesu Kristo ga kinbahan na Tabite, Tabite ga kinbahan na Aburaham.
examples/40_matthew-001-01.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:173b0800783e09808e2a1c74f0a8f85fbccb705289986f5dd63ef60a821ea805
+size 1200330
fish_speech/callbacks/__init__.py
ADDED
@@ -0,0 +1,3 @@
+from .grad_norm import GradNormMonitor
+
+__all__ = ["GradNormMonitor"]
fish_speech/callbacks/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (239 Bytes).
fish_speech/callbacks/__pycache__/grad_norm.cpython-310.pyc
ADDED
Binary file (3.79 kB).
fish_speech/callbacks/grad_norm.py
ADDED
@@ -0,0 +1,113 @@
+from typing import Optional, Union
+
+import lightning.pytorch as pl
+import torch
+from lightning import LightningModule, Trainer
+from lightning.pytorch.callbacks import Callback
+from torch import Tensor, nn
+from torch.utils._foreach_utils import (
+    _group_tensors_by_device_and_dtype,
+    _has_foreach_support,
+)
+
+
+@torch.no_grad()
+def grad_norm(
+    parameters: Union[Tensor, list[Tensor]],
+    norm_type: float = 2.0,
+) -> float:
+    """
+    Returns the norm of the gradients of the given parameters.
+
+    Args:
+        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
+            single Tensor that will have gradients normalized
+        norm_type (float): type of the used p-norm.
+
+    Returns:
+        Total norm of the parameter gradients (viewed as a single vector).
+    """  # noqa: E501
+
+    if isinstance(parameters, Tensor):
+        parameters = [parameters]
+
+    grads = [p.grad for p in parameters if p.grad is not None]
+    if len(grads) == 0:
+        return None
+
+    first_device = grads[0].device
+    grouped_grads: dict[
+        tuple[torch.device, torch.dtype], list[list[Tensor]]
+    ] = _group_tensors_by_device_and_dtype(
+        [[g.detach() for g in grads]]
+    )  # type: ignore[assignment]
+
+    norms = []
+    for (device, _), ([grads], _) in grouped_grads.items():
+        if _has_foreach_support(grads, device=device):
+            norms.extend(torch._foreach_norm(grads, norm_type))
+        else:
+            norms.extend([torch.norm(g, norm_type) for g in grads])
+
+    return torch.norm(torch.stack([norm.to(first_device) for norm in norms]), norm_type)
+
+
+class GradNormMonitor(Callback):
+    """
+    Callback that computes the gradient norm of the model parameters.
+    """
+
+    def __init__(
+        self,
+        norm_type: float = 2.0,
+        logging_interval: str = "step",
+        sub_module: Optional[Union[str, list[str]]] = None,
+    ) -> None:
+        """
+        Args:
+            norm_type (float): type of the used p-norm.
+            logging_interval (str): "step" or "epoch".
+        """
+        super().__init__()
+
+        self.norm_type = norm_type
+        self.logging_interval = logging_interval
+        self.sub_module = sub_module
+
+    def on_after_backward(self, trainer: Trainer, model: LightningModule) -> None:
+        """
+        Computes the gradient norm of the model parameters and logs it to the logger.
+
+        Args:
+            trainer (Trainer): the trainer object
+            model (LightningModule): the current LightningModule
+        """
+
+        lightning_model = model
+
+        if self.sub_module is None:
+            return self.log_sub_module_grad_norm(lightning_model, model, "")
+
+        sub_modules = self.sub_module
+        if isinstance(sub_modules, str):
+            sub_modules = [sub_modules]
+
+        for sub_module in sub_modules:
+            self.log_sub_module_grad_norm(
+                lightning_model, getattr(model, sub_module), f"/{sub_module}"
+            )
+
+    def log_sub_module_grad_norm(
+        self, lightning_model: LightningModule, model: nn.Module, path: str
+    ) -> None:
+        grad_norm_val = grad_norm(model.parameters(), self.norm_type)
+        if grad_norm_val is None:
+            return
+
+        on_step = self.logging_interval == "step"
+        lightning_model.log(
+            f"train{path}/grad_norm",
+            grad_norm_val,
+            on_step=on_step,
+            on_epoch=not on_step,
+        )
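A usage sketch for the callback above (assumes lightning is installed): base.yaml wires it in via Hydra, but it can equally be attached by hand.

```python
# Sketch: attach GradNormMonitor so each optimizer step logs train/grad_norm.
import lightning.pytorch as pl
from fish_speech.callbacks import GradNormMonitor

trainer = pl.Trainer(
    callbacks=[GradNormMonitor(norm_type=2.0, logging_interval="step")],
)
```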
fish_speech/configs/base.yaml
ADDED
@@ -0,0 +1,87 @@
+# Base configuration for training a model
+paths:
+  run_dir: results/${project}
+  ckpt_dir: ${paths.run_dir}/checkpoints
+
+hydra:
+  run:
+    dir: ${paths.run_dir}
+
+# Lightning Trainer
+trainer:
+  _target_: lightning.pytorch.trainer.Trainer
+
+  default_root_dir: ${paths.run_dir}
+  accelerator: gpu
+  num_nodes: 1
+  devices: auto
+  strategy:
+    _target_: lightning.pytorch.strategies.DDPStrategy
+    process_group_backend: nccl  # This should be overridden when training on Windows
+
+  precision: bf16-mixed
+
+  # disable validation at epoch end
+  check_val_every_n_epoch: null
+  val_check_interval: 5000
+  max_steps: 100_000
+
+  # Use torch.backends.cudnn.benchmark to speed up training
+  benchmark: true
+
+# Callbacks
+callbacks:
+  model_checkpoint:
+    _target_: lightning.pytorch.callbacks.ModelCheckpoint
+    dirpath: ${paths.ckpt_dir}
+    filename: "step_{step:09d}"
+    save_last: false  # additionally always save an exact copy of the last checkpoint to a file last.ckpt
+    save_top_k: 5  # save the 5 latest checkpoints
+    monitor: step  # use step to monitor checkpoints
+    mode: max  # save the latest checkpoint with the highest global_step
+    every_n_epochs: null  # don't save checkpoints at epoch end
+    every_n_train_steps: 5000  # save checkpoints every 5000 steps
+    auto_insert_metric_name: false
+
+  model_summary:
+    _target_: lightning.pytorch.callbacks.ModelSummary
+    max_depth: 2  # the maximum depth of layer nesting that the summary will include
+
+  learning_rate_monitor:
+    _target_: lightning.pytorch.callbacks.LearningRateMonitor
+    logging_interval: step
+    log_momentum: false
+
+  grad_norm_monitor:
+    _target_: fish_speech.callbacks.GradNormMonitor
+    norm_type: 2
+    logging_interval: step
+
+# Logger
+logger:
+  tensorboard:
+    _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger
+    save_dir: "${paths.run_dir}/tensorboard/"
+    name: null
+    log_graph: false
+    default_hp_metric: true
+    prefix: ""
+
+  # wandb:
+  #   _target_: lightning.pytorch.loggers.wandb.WandbLogger
+  #   # name: ""  # name of the run (normally generated by wandb)
+  #   save_dir: "${paths.run_dir}"
+  #   offline: False
+  #   id: null  # pass the correct id to resume an experiment!
+  #   anonymous: null  # enable anonymous logging
+  #   project: "fish-speech"
+  #   log_model: False  # upload lightning ckpts
+  #   prefix: ""  # a string to put at the beginning of metric keys
+  #   # entity: ""  # set to the name of your wandb team
+  #   group: ""
+  #   tags: ["vq", "hq", "finetune"]
+  #   job_type: ""
+
+# Loop
+train: true
+test: false
fish_speech/configs/firefly_gan_vq.yaml
ADDED
@@ -0,0 +1,33 @@
+_target_: fish_speech.models.vqgan.modules.firefly.FireflyArchitecture
+spec_transform:
+  _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
+  sample_rate: 44100
+  n_mels: 160
+  n_fft: 2048
+  hop_length: 512
+  win_length: 2048
+backbone:
+  _target_: fish_speech.models.vqgan.modules.firefly.ConvNeXtEncoder
+  input_channels: 160
+  depths: [3, 3, 9, 3]
+  dims: [128, 256, 384, 512]
+  drop_path_rate: 0.2
+  kernel_size: 7
+head:
+  _target_: fish_speech.models.vqgan.modules.firefly.HiFiGANGenerator
+  hop_length: 512
+  upsample_rates: [8, 8, 2, 2, 2]  # aka strides
+  upsample_kernel_sizes: [16, 16, 4, 4, 4]
+  resblock_kernel_sizes: [3, 7, 11]
+  resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+  num_mels: 512
+  upsample_initial_channel: 512
+  pre_conv_kernel_size: 13
+  post_conv_kernel_size: 13
+quantizer:
+  _target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
+  input_dim: 512
+  n_groups: 8
+  n_codebooks: 1
+  levels: [8, 5, 5, 5]
+  downsample_factor: [2, 2]
fish_speech/configs/lora/r_8_alpha_16.yaml
ADDED
@@ -0,0 +1,4 @@
+_target_: fish_speech.models.text2semantic.lora.LoraConfig
+r: 8
+lora_alpha: 16
+lora_dropout: 0.01
fish_speech/configs/text2semantic_finetune.yaml
ADDED
@@ -0,0 +1,83 @@
+defaults:
+  - base
+  - _self_
+
+project: text2semantic_finetune_dual_ar
+max_length: 4096
+pretrained_ckpt_path: checkpoints/fish-speech-1.4
+
+# Lightning Trainer
+trainer:
+  accumulate_grad_batches: 1
+  gradient_clip_val: 1.0
+  gradient_clip_algorithm: "norm"
+  max_steps: 1000
+  precision: bf16-true
+  limit_val_batches: 10
+  val_check_interval: 100
+
+# Tokenizer Configuration
+tokenizer:
+  _target_: transformers.AutoTokenizer.from_pretrained
+  pretrained_model_name_or_path: ${pretrained_ckpt_path}
+
+# Dataset Configuration
+train_dataset:
+  _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
+  proto_files:
+    - data/protos
+  tokenizer: ${tokenizer}
+  causal: true
+  max_length: ${max_length}
+  use_speaker: false
+  interactive_prob: 0.7
+
+val_dataset:
+  _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
+  proto_files:
+    - data/protos
+  tokenizer: ${tokenizer}
+  causal: true
+  max_length: ${max_length}
+  use_speaker: false
+  interactive_prob: 0.7
+
+data:
+  _target_: fish_speech.datasets.semantic.SemanticDataModule
+  train_dataset: ${train_dataset}
+  val_dataset: ${val_dataset}
+  num_workers: 4
+  batch_size: 8
+  tokenizer: ${tokenizer}
+  max_length: ${max_length}
+
+# Model Configuration
+model:
+  _target_: fish_speech.models.text2semantic.lit_module.TextToSemantic
+  model:
+    _target_: fish_speech.models.text2semantic.llama.BaseTransformer.from_pretrained
+    path: ${pretrained_ckpt_path}
+    load_weights: true
+    max_length: ${max_length}
+    lora_config: null
+
+  optimizer:
+    _target_: torch.optim.AdamW
+    _partial_: true
+    lr: 1e-4
+    weight_decay: 0
+    betas: [0.9, 0.95]
+    eps: 1e-5
+
+  lr_scheduler:
+    _target_: torch.optim.lr_scheduler.LambdaLR
+    _partial_: true
+    lr_lambda:
+      _target_: fish_speech.scheduler.get_constant_schedule_with_warmup_lr_lambda
+      _partial_: true
+      num_warmup_steps: 10
+
+# Callbacks
+callbacks:
+  model_checkpoint:
+    every_n_train_steps: ${trainer.val_check_interval}
fish_speech/conversation.py
ADDED
@@ -0,0 +1,2 @@
+SEMANTIC_TOKEN = "<|semantic|>"
+CODEBOOK_PAD_TOKEN_ID = 0
fish_speech/datasets/__pycache__/semantic.cpython-310.pyc
ADDED
Binary file (12.4 kB).
fish_speech/datasets/concat_repeat.py
ADDED
@@ -0,0 +1,53 @@
+import bisect
+import random
+from typing import Iterable
+
+from torch.utils.data import Dataset, IterableDataset
+
+
+class ConcatRepeatDataset(Dataset):
+    datasets: list[Dataset]
+    cumulative_sizes: list[int]
+    repeats: list[int]
+
+    @staticmethod
+    def cumsum(sequence, repeats):
+        r, s = [], 0
+        for dataset, repeat in zip(sequence, repeats):
+            l = len(dataset) * repeat
+            r.append(l + s)
+            s += l
+        return r
+
+    def __init__(self, datasets: Iterable[Dataset], repeats: list[int]):
+        super().__init__()
+
+        self.datasets = list(datasets)
+        self.repeats = repeats
+
+        assert len(self.datasets) > 0, "datasets should not be an empty iterable"
+        assert len(self.datasets) == len(
+            repeats
+        ), "datasets and repeats should have the same length"
+
+        for d in self.datasets:
+            assert not isinstance(
+                d, IterableDataset
+            ), "ConcatRepeatDataset does not support IterableDataset"
+
+        self.cumulative_sizes = self.cumsum(self.datasets, self.repeats)
+
+    def __len__(self):
+        return self.cumulative_sizes[-1]
+
+    def __getitem__(self, idx):
+        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
+
+        if dataset_idx == 0:
+            sample_idx = idx
+        else:
+            sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
+
+        dataset = self.datasets[dataset_idx]
+
+        return dataset[sample_idx % len(dataset)]
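A usage sketch for ConcatRepeatDataset (standalone, using toy tensors): each source dataset contributes `len(dataset) * repeat` virtual items, and indexing wraps around with `sample_idx % len(dataset)`.

```python
# Sketch: dataset `a` counts twice and `b` three times in the virtual length.
import torch
from torch.utils.data import TensorDataset

a = TensorDataset(torch.arange(4))
b = TensorDataset(torch.arange(2))
combined = ConcatRepeatDataset([a, b], repeats=[2, 3])

assert len(combined) == 2 * 4 + 3 * 2  # 14
print(combined[5])  # index 5 falls in `a`'s span and wraps to a[5 % 4] == a[1]
```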
fish_speech/datasets/protos/__pycache__/text_data_pb2.cpython-310.pyc
ADDED
Binary file (1.26 kB).
fish_speech/datasets/protos/__pycache__/text_data_stream.cpython-310.pyc
ADDED
Binary file (1.13 kB).
fish_speech/datasets/protos/text-data.proto
ADDED
@@ -0,0 +1,24 @@
+syntax = "proto3";
+
+package text_data;
+
+message Semantics {
+  repeated uint32 values = 1;
+}
+
+message Sentence {
+  repeated string texts = 1;
+  repeated Semantics semantics = 3;
+}
+
+message TextData {
+  string source = 1;
+  string name = 2;
+  repeated Sentence sentences = 4;
+}
+
+message SampledData {
+  string source = 1;
+  string name = 2;
+  repeated Sentence samples = 3;
+}
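A construction sketch for the schema above (assumes the generated text_data_pb2 module, shown next, is importable):

```python
# Sketch: build one TextData message matching text-data.proto.
from fish_speech.datasets.protos.text_data_pb2 import Semantics, Sentence, TextData

msg = TextData(
    source="demo",
    name="speaker_0",
    sentences=[
        Sentence(
            texts=["hello world"],
            semantics=[Semantics(values=[1, 2, 3])],
        )
    ],
)
print(msg.sentences[0].semantics[0].values)  # [1, 2, 3]
```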
fish_speech/datasets/protos/text_data_pb2.py
ADDED
@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+# Generated by the protocol buffer compiler. DO NOT EDIT!
+# source: text-data.proto
+# Protobuf Python Version: 4.25.1
+"""Generated protocol buffer code."""
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import descriptor_pool as _descriptor_pool
+from google.protobuf import symbol_database as _symbol_database
+from google.protobuf.internal import builder as _builder
+
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
+    b'\n\x0ftext-data.proto\x12\ttext_data"\x1b\n\tSemantics\x12\x0e\n\x06values\x18\x01 \x03(\r"B\n\x08Sentence\x12\r\n\x05texts\x18\x01 \x03(\t\x12\'\n\tsemantics\x18\x03 \x03(\x0b\x32\x14.text_data.Semantics"P\n\x08TextData\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12&\n\tsentences\x18\x04 \x03(\x0b\x32\x13.text_data.Sentence"Q\n\x0bSampledData\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12$\n\x07samples\x18\x03 \x03(\x0b\x32\x13.text_data.Sentenceb\x06proto3'
+)
+
+_globals = globals()
+_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
+_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "text_data_pb2", _globals)
+if _descriptor._USE_C_DESCRIPTORS == False:
+    DESCRIPTOR._options = None
+    _globals["_SEMANTICS"]._serialized_start = 30
+    _globals["_SEMANTICS"]._serialized_end = 57
+    _globals["_SENTENCE"]._serialized_start = 59
+    _globals["_SENTENCE"]._serialized_end = 125
+    _globals["_TEXTDATA"]._serialized_start = 127
+    _globals["_TEXTDATA"]._serialized_end = 207
+    _globals["_SAMPLEDDATA"]._serialized_start = 209
+    _globals["_SAMPLEDDATA"]._serialized_end = 290
+# @@protoc_insertion_point(module_scope)
fish_speech/datasets/protos/text_data_stream.py
ADDED
@@ -0,0 +1,36 @@
+import struct
+
+from .text_data_pb2 import TextData
+
+
+def read_pb_stream(f):
+    while True:
+        buf = f.read(4)
+        if len(buf) == 0:
+            break
+        size = struct.unpack("I", buf)[0]
+        buf = f.read(size)
+        text_data = TextData()
+        text_data.ParseFromString(buf)
+        yield text_data
+
+
+def write_pb_stream(f, text_data):
+    buf = text_data.SerializeToString()
+    f.write(struct.pack("I", len(buf)))
+    f.write(buf)
+
+
+def pack_pb_stream(text_data):
+    buf = text_data.SerializeToString()
+    return struct.pack("I", len(buf)) + buf
+
+
+def split_pb_stream(f):
+    while True:
+        head = f.read(4)
+        if len(head) == 0:
+            break
+        size = struct.unpack("I", head)[0]
+        buf = f.read(size)
+        yield head + buf
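A round-trip sketch for the helpers above: each record is a 4-byte length header followed by the serialized message. Note that `struct`'s `"I"` format uses native byte order, so streams written on one endianness are not portable to the other.

```python
# Sketch: write two TextData records into a buffer and read them back.
import io

from fish_speech.datasets.protos.text_data_pb2 import TextData
from fish_speech.datasets.protos.text_data_stream import read_pb_stream, write_pb_stream

buf = io.BytesIO()
write_pb_stream(buf, TextData(source="a", name="x"))
write_pb_stream(buf, TextData(source="b", name="y"))

buf.seek(0)
print([td.name for td in read_pb_stream(buf)])  # ['x', 'y']
```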
fish_speech/datasets/semantic.py
ADDED
@@ -0,0 +1,496 @@
+import random
+from dataclasses import dataclass
+from itertools import chain
+from pathlib import Path
+from random import Random
+from typing import Optional, Union
+
+import numpy as np
+import pyarrow.parquet as pq
+import torch
+import torch.nn.functional as F
+from datasets.download.streaming_download_manager import xopen
+from huggingface_hub import HfApi
+from lightning import LightningDataModule
+from torch.distributed import get_rank, get_world_size, is_initialized
+from torch.utils.data import DataLoader, IterableDataset, get_worker_info
+from transformers import AutoTokenizer
+
+from fish_speech.conversation import CODEBOOK_PAD_TOKEN_ID
+from fish_speech.datasets.protos.text_data_pb2 import SampledData
+from fish_speech.datasets.protos.text_data_stream import read_pb_stream
+from fish_speech.text.clean import clean_text
+from fish_speech.utils import RankedLogger
+from fish_speech.utils.braceexpand import braceexpand
+
+log = RankedLogger(__name__, rank_zero_only=True)
+
+
+def split_by_rank_worker(files):
+    # We need to know the total number of devices
+    # to split the data properly
+
+    total_devices = 1
+    if is_initialized():
+        total_devices = get_world_size()
+
+    worker_info = get_worker_info()
+    if worker_info is not None:
+        total_devices *= worker_info.num_workers
+
+    if len(files) < total_devices:
+        # Repeat the files N times to match the number of devices
+        files = files * (total_devices // len(files) + 1)
+
+    # DDP
+    if is_initialized():
+        files = files[get_rank() :: get_world_size()]
+
+    # Split by worker
+    if worker_info is not None:
+        files = files[worker_info.id :: worker_info.num_workers]
+
+    return files
+
+
+class AutoTextSemanticInstructionDataset(IterableDataset):
+    """
+    Auto Augment Dataset by Speaker
+
+    1. Randomly concatenate multiple sentences from the same speaker to form a longer sentence
+    2. Automatically normalize the text
+
+    For interactive mode, we use the following format (multiple sequences):
+    <s> [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST] </s>
+
+    For non-interactive mode, we use the following format (one long sequence):
+    <s> [INST] text [/INST] ... </s>
+    """
+
+    def __init__(
+        self,
+        proto_files: list[str],
+        seed: int = 42,
+        interactive_prob: float = 0.5,
+        max_length: int = 1024,
+        tokenizer: AutoTokenizer = None,
+        use_speaker: bool | float = True,
+        causal: bool = True,
+        num_codebooks: Optional[int] = None,
+        skip_text_prob: float = 0.0,
+    ):
+        """
+        Args:
+            proto_files: protobuf files if using local data
+            seed: random seed
+            interactive_prob: probability of using interactive mode
+            max_length: max length of the text
+            tokenizer: tokenizer
+            use_speaker: include speaker information in the prompt
+            causal: use causal sampling when using local data; disabling it leads to random sampling
+            num_codebooks: number of codebooks; if None, it will be detected automatically
+            skip_text_prob: probability of skipping the text (audio only); this only applies to interactive mode
+        """
+
+        super().__init__()
+
+        assert 0 <= interactive_prob <= 1, "interactive_prob must be in [0, 1]"
+
+        self.seed = seed
+        self.max_length = max_length
+        self.tokenizer = tokenizer
+        self.interactive_prob = interactive_prob
+        self.use_speaker = use_speaker
+        self.proto_files = proto_files
+        self.causal = causal
+        self.num_codebooks = num_codebooks
+        self.skip_text_prob = skip_text_prob
+
+        self.semantic_token_id = self.tokenizer.convert_tokens_to_ids("<|semantic|>")
+        self.groups = None
+
+    def init_mock_data_server(self):
+        if self.groups is not None:
+            return
+
+        # Expand the proto files
+        expanded_proto_files = []
+        for filename in self.proto_files:
+            for i in braceexpand(filename):
+                i = Path(i)
+                if i.is_file():
+                    expanded_proto_files.append(i)
+                elif i.is_dir():
+                    expanded_proto_files.extend(i.rglob("*.proto"))
+                    expanded_proto_files.extend(i.rglob("*.protos"))
+                else:
+                    raise ValueError(f"{i} is not a file or directory")
+
+        expanded_proto_files = sorted(expanded_proto_files)
+        Random(self.seed).shuffle(expanded_proto_files)
+
+        self.groups = []
+        shard_proto_files = split_by_rank_worker(expanded_proto_files)
+        log.info(
+            f"Reading {len(shard_proto_files)} / {len(expanded_proto_files)} files"
+        )
+
+        count = 0
+        for filename in shard_proto_files:
+            with open(filename, "rb") as f:
+                for text_data in read_pb_stream(f):
+                    self.groups.append(text_data)
+                    count += 1
+
+        log.info(f"Read total {count} groups of data")
+
+        # Shuffle the lines
+        Random(self.seed).shuffle(self.groups)
+        self.group_weights = [len(i.sentences) for i in self.groups]
+
+    def __iter__(self):
+        while True:
+            yield self.augment()
+
+    def tokenize_sentence(self, sentence: str):
+        sentence = clean_text(sentence)
+        tokens = self.tokenizer.encode(
+            f"{sentence}",
+            max_length=10**6,
+            add_special_tokens=False,
+            truncation=False,
+        )
+        return sentence, len(tokens)
+
+    def sample_data(self):
+        if self.groups is None:
+            self.init_mock_data_server()
+
+        # Shuffle unique lines, estimating that each sample is at least 20 tokens
+        num_samples = self.max_length // 20
+
+        # Choose a group based on its number of samples
+        group = random.choices(self.groups, weights=self.group_weights, k=1)[0]
+
+        if self.causal:
+            # Sample in order
+            if num_samples >= len(group.sentences):
+                samples = group.sentences
+            else:
+                begin = random.randint(0, len(group.sentences) - num_samples)
+                samples = group.sentences[begin : begin + num_samples]
+        else:
+            samples = random.choices(
+                group.sentences, k=min(num_samples, len(group.sentences))
+            )
+
+        return SampledData(
+            source=group.source,
+            name=group.name,
+            samples=samples,
+        )
+
+    def augment(self):
+        final_text, final_semantic = [], []
+        response = self.sample_data()
+        if len(response.samples) == 0:
+            # Invalid group
+            return None
+
+        samples = list(response.samples)
+        idx = 0
+        use_interactive = random.random() < self.interactive_prob
+
+        if use_interactive is False:
+            # Random sample based on speaker using a truncated normal distribution
+            a = torch.tensor([0], dtype=torch.float32)
+            torch.nn.init.trunc_normal_(
+                a,
+                mean=self.max_length // 2,
+                std=self.max_length // 4,
+                a=10,
+                b=self.max_length,
+            )
+            remaining_tokens = a.long().item() - 4
+        else:
+            remaining_tokens = self.max_length
+
+        # Use speaker
+        if isinstance(self.use_speaker, float):
+            use_speaker = random.random() < self.use_speaker
+        else:
+            use_speaker = self.use_speaker
+
+        all_tokens, all_labels = [], []
+        while remaining_tokens > 0 and len(samples) > 0:
+            sentence = samples.pop(0)
+
+            text = random.choice(sentence.texts)
+            text, length = self.tokenize_sentence(text)
+            remaining_tokens -= length + len(sentence.semantics[0].values)
+
+            if use_interactive is False:
+                final_text.append(text)
+                final_semantic.append(sentence.semantics)
+            else:
+                # For interactive mode, we only apply speaker for the first sentence
+                # [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST]
+                tokens, labels = self.pack_sentences(
+                    sentences=[text],
+                    semantics=[sentence.semantics],
+                    speaker=response.name if use_speaker else None,
+                    skip_text=random.random() < self.skip_text_prob,
+                )
+
+                all_tokens.append(tokens)
+                all_labels.append(labels)
+
+            idx += 1
+
+        if use_interactive is False:
+            tokens, labels = self.pack_sentences(
+                final_text,
+                semantics=final_semantic,
+                speaker=response.name if use_speaker else None,
+            )
+            all_tokens.append(tokens)
+            all_labels.append(labels)
+
+        tokens = torch.cat(all_tokens, dim=1)
+        labels = torch.cat(all_labels, dim=1)
+
+        # Verify that the length is correct
+        assert tokens.size(1) == labels.size(1), f"{tokens.size(1)} != {labels.size(1)}"
+
+        data = {"tokens": tokens, "labels": labels}
+
+        return data
+
+    def pack_sentences(
+        self,
+        sentences: list[str],
+        semantics: list,
+        speaker: Optional[str] = None,
+        skip_text: bool = False,
+    ):
+        if speaker is None:
+            speaker = "assistant"
+
+        cated_sentences = " ".join(sentences)
+        if skip_text:
+            cated_sentences = "<|skip_text|>"
+
+        final_text = "<|im_start|>user\n" + cated_sentences + "<|im_end|>"
+        final_text = final_text + f"<|im_start|>{speaker}\n"
+
+        encoded = self.tokenizer.encode(
+            final_text,
+            add_special_tokens=False,
+            truncation=False,
+            max_length=10**6,
+        )
+        semantic_length = sum([len(i[0].values) for i in semantics])
+        prompt_length = len(encoded)
+        num_codebooks = (
+            len(semantics[0]) if self.num_codebooks is None else self.num_codebooks
+        )
+
+        # Pack the tokens and semantics (add <s> and </s> to semantic tokens)
+        tokens = (
+            encoded
+            + [self.semantic_token_id] * semantic_length
+            + self.tokenizer.convert_tokens_to_ids(["<|im_end|>"])
+        )
+
+        # Codebook bos/padding: 0, eos: 1
+        codes = [[CODEBOOK_PAD_TOKEN_ID] * prompt_length for _ in range(num_codebooks)]
+        for segment in semantics:
+            for book_idx, book in zip(range(num_codebooks), segment):
+                for j in book.values:
+                    codes[book_idx].append(int(j) + 1)
+
+        for book in codes:
+            book.extend([CODEBOOK_PAD_TOKEN_ID] * 1)
+
+        tokens = [tokens] + codes
+
+        tokens = torch.tensor(tokens, dtype=torch.long)
+        labels = tokens.clone()
+
+        if skip_text:
+            # If text is not provided, the sentence is used for conditioning only; all labels are -100
+            torch.fill_(labels, -100)
+            return tokens, labels
+
+        # Mask out the <s> tokens for semantic, predict semantic tokens only
+        # Since we don't mask out the input tokens, language modeling still works
+        labels[1:, :prompt_length] = -100
+
+        tokens = tokens[:, :-1]
+        labels = labels[:, 1:]
+
+        # Verify the padding is correct, and the last token is eos
+        assert (tokens[1:, :prompt_length] == CODEBOOK_PAD_TOKEN_ID).all()
+        assert (labels[1:, -1:] == CODEBOOK_PAD_TOKEN_ID).all()
|
335 |
+
|
336 |
+
return tokens, labels
|
337 |
+
|
338 |
+
|
339 |
+
@dataclass
|
340 |
+
class TextDataCollator:
|
341 |
+
tokenizer: AutoTokenizer
|
342 |
+
max_length: int = 1024
|
343 |
+
|
344 |
+
def __call__(self, examples):
|
345 |
+
if "negative_tokens" in examples:
|
346 |
+
positive_examples = []
|
347 |
+
negative_examples = []
|
348 |
+
|
349 |
+
for i in examples:
|
350 |
+
positive_examples.append(
|
351 |
+
{
|
352 |
+
"tokens": i["tokens"],
|
353 |
+
"labels": i["labels"],
|
354 |
+
}
|
355 |
+
)
|
356 |
+
negative_examples.append(
|
357 |
+
{
|
358 |
+
"tokens": i["negative_tokens"],
|
359 |
+
"labels": i["negative_labels"],
|
360 |
+
}
|
361 |
+
)
|
362 |
+
|
363 |
+
examples = positive_examples + negative_examples
|
364 |
+
|
365 |
+
return self.batchify(examples)
|
366 |
+
|
367 |
+
def batchify(self, examples, tokens_key="tokens", labels_key="labels"):
|
368 |
+
tokens, attention_masks, labels = [], [], []
|
369 |
+
|
370 |
+
# Calculate the max length
|
371 |
+
max_tokens_length = 0
|
372 |
+
for example in examples:
|
373 |
+
max_tokens_length = max(max_tokens_length, example[tokens_key].size(1))
|
374 |
+
max_tokens_length = min(max_tokens_length, self.max_length)
|
375 |
+
|
376 |
+
for example in examples:
|
377 |
+
_tokens = example[tokens_key][:, :max_tokens_length]
|
378 |
+
_labels = example[labels_key][:, :max_tokens_length]
|
379 |
+
_attention_mask = torch.ones((max_tokens_length,), dtype=torch.bool)
|
380 |
+
tokens_length = _tokens.size(1)
|
381 |
+
_attention_mask[:tokens_length] = False
|
382 |
+
|
383 |
+
assert tokens_length == _labels.size(
|
384 |
+
1
|
385 |
+
), f"{tokens_length} != {_labels.size(1)}"
|
386 |
+
|
387 |
+
if tokens_length < max_tokens_length:
|
388 |
+
_tokens = F.pad(
|
389 |
+
_tokens,
|
390 |
+
(0, max_tokens_length - tokens_length),
|
391 |
+
value=self.tokenizer.eos_token_id,
|
392 |
+
)
|
393 |
+
_tokens[1:, tokens_length:] = CODEBOOK_PAD_TOKEN_ID
|
394 |
+
_labels = F.pad(
|
395 |
+
_labels, (0, max_tokens_length - _labels.size(1)), value=-100
|
396 |
+
)
|
397 |
+
|
398 |
+
tokens.append(_tokens)
|
399 |
+
attention_masks.append(_attention_mask)
|
400 |
+
labels.append(_labels)
|
401 |
+
|
402 |
+
tokens = torch.stack(tokens, dim=0)
|
403 |
+
attention_masks = torch.stack(attention_masks, dim=0)
|
404 |
+
labels = torch.stack(labels, dim=0)
|
405 |
+
|
406 |
+
return {
|
407 |
+
"inputs": tokens,
|
408 |
+
"attention_masks": attention_masks,
|
409 |
+
"labels": labels,
|
410 |
+
}
|
411 |
+
|
412 |
+
|
413 |
+
class InterleaveDataset(IterableDataset):
|
414 |
+
def __init__(
|
415 |
+
self,
|
416 |
+
datasets: list[IterableDataset],
|
417 |
+
probabilities: list[float],
|
418 |
+
seed: int = 42,
|
419 |
+
):
|
420 |
+
super().__init__()
|
421 |
+
|
422 |
+
self.datasets = datasets
|
423 |
+
self.probabilities = probabilities
|
424 |
+
self.seed = seed
|
425 |
+
|
426 |
+
def __iter__(self):
|
427 |
+
rng = np.random.default_rng(self.seed)
|
428 |
+
dataset_iterators = [iter(dataset) for dataset in self.datasets]
|
429 |
+
|
430 |
+
while True:
|
431 |
+
# Random choice one
|
432 |
+
dataset_idx = rng.choice(len(self.datasets), p=self.probabilities)
|
433 |
+
dataset_iterator = dataset_iterators[dataset_idx]
|
434 |
+
|
435 |
+
try:
|
436 |
+
yield next(dataset_iterator)
|
437 |
+
except StopIteration:
|
438 |
+
# Exhausted, create a new iterator
|
439 |
+
dataset_iterators[dataset_idx] = iter(self.datasets[dataset_idx])
|
440 |
+
yield next(dataset_iterators[dataset_idx])
|
441 |
+
|
442 |
+
|
443 |
+
class SemanticDataModule(LightningDataModule):
|
444 |
+
def __init__(
|
445 |
+
self,
|
446 |
+
train_dataset: Union[AutoTextSemanticInstructionDataset, InterleaveDataset],
|
447 |
+
val_dataset: Union[AutoTextSemanticInstructionDataset, InterleaveDataset],
|
448 |
+
batch_size: int = 32,
|
449 |
+
tokenizer: AutoTokenizer = None,
|
450 |
+
max_length: int = 1024,
|
451 |
+
num_workers: int = 4,
|
452 |
+
):
|
453 |
+
super().__init__()
|
454 |
+
|
455 |
+
self.train_dataset = train_dataset
|
456 |
+
self.val_dataset = val_dataset
|
457 |
+
self.batch_size = batch_size
|
458 |
+
self.tokenizer = tokenizer
|
459 |
+
self.max_length = max_length
|
460 |
+
self.num_workers = num_workers
|
461 |
+
|
462 |
+
def train_dataloader(self):
|
463 |
+
return DataLoader(
|
464 |
+
self.train_dataset,
|
465 |
+
batch_size=self.batch_size,
|
466 |
+
collate_fn=TextDataCollator(self.tokenizer, self.max_length),
|
467 |
+
num_workers=self.num_workers,
|
468 |
+
persistent_workers=True,
|
469 |
+
)
|
470 |
+
|
471 |
+
def val_dataloader(self):
|
472 |
+
return DataLoader(
|
473 |
+
self.val_dataset,
|
474 |
+
batch_size=self.batch_size,
|
475 |
+
collate_fn=TextDataCollator(self.tokenizer, self.max_length),
|
476 |
+
num_workers=self.num_workers,
|
477 |
+
persistent_workers=True,
|
478 |
+
)
|
479 |
+
|
480 |
+
|
481 |
+
if __name__ == "__main__":
|
482 |
+
from tqdm import tqdm
|
483 |
+
|
484 |
+
ds = AutoTextSemanticInstructionDataset(
|
485 |
+
["data/protos"],
|
486 |
+
tokenizer=AutoTokenizer.from_pretrained("fishaudio/fish-speech-1"),
|
487 |
+
use_speaker=False,
|
488 |
+
interactive_prob=1.0,
|
489 |
+
skip_text_prob=0.5,
|
490 |
+
)
|
491 |
+
|
492 |
+
for i in ds:
|
493 |
+
print(ds.tokenizer.decode(i["tokens"][0], skip_special_tokens=False))
|
494 |
+
# i["labels"][0][i["labels"][0] == -100] = 0
|
495 |
+
# print(ds.tokenizer.decode(i["labels"][0], skip_special_tokens=False))
|
496 |
+
break
|
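A minimal usage sketch (not part of the upload; the toy `ConstantDataset` below is hypothetical): `InterleaveDataset` draws each sample from one of its member datasets with the given probabilities, restarting any iterator that runs dry.

from torch.utils.data import IterableDataset

class ConstantDataset(IterableDataset):
    # Hypothetical toy dataset that yields the same value forever
    def __init__(self, value):
        self.value = value

    def __iter__(self):
        while True:
            yield self.value

mixed = InterleaveDataset(
    datasets=[ConstantDataset("a"), ConstantDataset("b")],
    probabilities=[0.8, 0.2],  # must sum to 1 for rng.choice
)
it = iter(mixed)
print([next(it) for _ in range(10)])  # roughly four "a"s for every "b"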
fish_speech/datasets/vqgan.py
ADDED
@@ -0,0 +1,147 @@
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import librosa
import numpy as np
import torch
from lightning import LightningDataModule
from torch.utils.data import DataLoader, Dataset

from fish_speech.utils import RankedLogger

logger = RankedLogger(__name__, rank_zero_only=False)


class VQGANDataset(Dataset):
    def __init__(
        self,
        filelist: str,
        sample_rate: int = 32000,
        hop_length: int = 640,
        slice_frames: Optional[int] = None,
    ):
        super().__init__()

        filelist = Path(filelist)
        root = filelist.parent

        self.files = [
            root / line.strip()
            for line in filelist.read_text(encoding="utf-8").splitlines()
            if line.strip()
        ]
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.slice_frames = slice_frames

    def __len__(self):
        return len(self.files)

    def get_item(self, idx):
        file = self.files[idx]

        audio, _ = librosa.load(file, sr=self.sample_rate, mono=True)

        # Randomly slice the audio to at most slice_frames frames
        if (
            self.slice_frames is not None
            and audio.shape[0] > self.slice_frames * self.hop_length
        ):
            start = np.random.randint(
                0, audio.shape[0] - self.slice_frames * self.hop_length
            )
            audio = audio[start : start + self.slice_frames * self.hop_length]

        if len(audio) == 0:
            return None

        # Peak-normalize only if the signal clips
        max_value = np.abs(audio).max()
        if max_value > 1.0:
            audio = audio / max_value

        return {
            "audio": torch.from_numpy(audio),
        }

    def __getitem__(self, idx):
        try:
            return self.get_item(idx)
        except Exception as e:
            import traceback

            traceback.print_exc()
            logger.error(f"Error loading {self.files[idx]}: {e}")
            return None


@dataclass
class VQGANCollator:
    def __call__(self, batch):
        batch = [x for x in batch if x is not None]

        audio_lengths = torch.tensor([len(x["audio"]) for x in batch])
        audio_maxlen = audio_lengths.max()

        # Pad every clip to the length of the longest clip in the batch
        audios = []
        for x in batch:
            audios.append(
                torch.nn.functional.pad(x["audio"], (0, audio_maxlen - len(x["audio"])))
            )

        return {
            "audios": torch.stack(audios),
            "audio_lengths": audio_lengths,
        }


class VQGANDataModule(LightningDataModule):
    def __init__(
        self,
        train_dataset: VQGANDataset,
        val_dataset: VQGANDataset,
        batch_size: int = 32,
        num_workers: int = 4,
        val_batch_size: Optional[int] = None,
    ):
        super().__init__()

        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.batch_size = batch_size
        self.val_batch_size = val_batch_size or batch_size
        self.num_workers = num_workers

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            collate_fn=VQGANCollator(),
            num_workers=self.num_workers,
            shuffle=True,
            persistent_workers=True,
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_dataset,
            batch_size=self.val_batch_size,
            collate_fn=VQGANCollator(),
            num_workers=self.num_workers,
            persistent_workers=True,
        )


if __name__ == "__main__":
    dataset = VQGANDataset("data/LibriTTS_R/vq_train_filelist.txt")
    dataloader = DataLoader(
        dataset, batch_size=4, shuffle=False, collate_fn=VQGANCollator()
    )

    for batch in dataloader:
        print(batch["audios"].shape)
        # The collator only returns "audios" and "audio_lengths"; the original
        # prints of batch["features"] / batch["feature_lengths"] would raise
        # KeyError, so they have been dropped.
        print(batch["audio_lengths"])
        break
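A minimal collation sketch (not part of the upload; the synthetic tensors are placeholders): `VQGANCollator` pads every clip to the longest clip in the batch and returns the original lengths so padding can be masked downstream.

import torch

batch = [
    {"audio": torch.zeros(32000)},  # 1.0 s at the default 32 kHz sample rate
    {"audio": torch.zeros(48000)},  # 1.5 s
]
out = VQGANCollator()(batch)
print(out["audios"].shape)   # torch.Size([2, 48000])
print(out["audio_lengths"])  # tensor([32000, 48000])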
fish_speech/i18n/README.md
ADDED
@@ -0,0 +1,27 @@
## i18n Folder Attribution

The `i18n` folder within the `fish_speech` directory contains files initially sourced from the RVC project. In compliance with the MIT license under which these files were released, we acknowledge the original authors and sources below:

### fish_speech/i18n/core.py

**Related code from RVC:**
[https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py)

**Initial commit:**
add localization(添加本地化) [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#35](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/35)

**Initial author:**
[@L4Ph](https://github.com/L4Ph)

### fish_speech/i18n/scan.py

**Related code from RVC:**
[https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py)

**Initial commit:**
File for detecting i18n missing keys [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#1058](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/1058)

**Initial author:**
[@towzeur](https://github.com/towzeur)

We appreciate the contributions of the RVC project and its authors.
fish_speech/i18n/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .core import i18n

__all__ = ["i18n"]
fish_speech/i18n/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (218 Bytes).
fish_speech/i18n/__pycache__/core.cpython-310.pyc
ADDED
Binary file (1.44 kB).
fish_speech/i18n/core.py
ADDED
@@ -0,0 +1,40 @@
import json
import locale
from pathlib import Path

I18N_FILE_PATH = Path(__file__).parent / "locale"
DEFAULT_LANGUAGE = "en_US"


def load_language_list(language):
    with open(I18N_FILE_PATH / f"{language}.json", "r", encoding="utf-8") as f:
        language_list = json.load(f)

    return language_list


class I18nAuto:
    def __init__(self):
        i18n_file = Path(".locale")

        if i18n_file.exists():
            with open(i18n_file, "r", encoding="utf-8") as f:
                language = f.read().strip()
        else:
            # locale.getlocale() may fail to identify the system's language
            # and return (None, None), so use getdefaultlocale() instead
            language = locale.getdefaultlocale()[0]

        if (I18N_FILE_PATH / f"{language}.json").exists() is False:
            language = DEFAULT_LANGUAGE

        self.language = language
        self.language_map = load_language_list(language)

    def __call__(self, key):
        return self.language_map.get(key, key)

    def __repr__(self):
        return "Use Language: " + self.language


i18n = I18nAuto()
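A minimal usage sketch (not part of the upload): `i18n` is a module-level singleton; calling it looks up a UI string for the detected locale and falls back to the key itself when no translation exists.

from fish_speech.i18n import i18n

print(i18n)                      # e.g. "Use Language: en_US"
print(i18n("Generate"))          # the localized label, e.g. "Generar" under es_ES
print(i18n("Some unknown key"))  # unknown keys are returned unchanged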
fish_speech/i18n/locale/en_US.json
ADDED
@@ -0,0 +1,122 @@
{
    "16-mixed is recommended for 10+ series GPU": "16-mixed is recommended for 10+ series GPU",
    "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 to 10 seconds of reference audio, useful for specifying speaker.",
    "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).",
    "Accumulate Gradient Batches": "Accumulate Gradient Batches",
    "Add to Processing Area": "Add to Processing Area",
    "Added path successfully!": "Added path successfully!",
    "Advanced Config": "Advanced Config",
    "Base LLAMA Model": "Base LLAMA Model",
    "Batch Inference": "Batch Inference",
    "Batch Size": "Batch Size",
    "Changing with the Model Path": "Changing with the Model Path",
    "Chinese": "Chinese",
    "Compile Model": "Compile Model",
    "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compile the model can significantly reduce the inference time, but will increase cold start time",
    "Copy": "Copy",
    "Data Preprocessing": "Data Preprocessing",
    "Data Preprocessing Path": "Data Preprocessing Path",
    "Data Source": "Data Source",
    "Decoder Model Config": "Decoder Model Config",
    "Decoder Model Path": "Decoder Model Path",
    "Disabled": "Disabled",
    "Enable Reference Audio": "Enable Reference Audio",
    "English": "English",
    "Error Message": "Error Message",
    "File Preprocessing": "File Preprocessing",
    "Generate": "Generate",
    "Generated Audio": "Generated Audio",
    "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format",
    "Infer interface is closed": "Infer interface is closed",
    "Inference Configuration": "Inference Configuration",
    "Inference Server Configuration": "Inference Server Configuration",
    "Inference Server Error": "Inference Server Error",
    "Inferring interface is launched at {}": "Inferring interface is launched at {}",
    "Initial Learning Rate": "Initial Learning Rate",
    "Input Audio & Source Path for Transcription": "Input Audio & Source Path for Transcription",
    "Input Text": "Input Text",
    "Invalid path: {}": "Invalid path: {}",
    "It is recommended to use CUDA, if you have low configuration, use CPU": "It is recommended to use CUDA, if you have low configuration, use CPU",
    "Iterative Prompt Length, 0 means off": "Iterative Prompt Length, 0 means off",
    "Japanese": "Japanese",
    "LLAMA Configuration": "LLAMA Configuration",
    "LLAMA Model Config": "LLAMA Model Config",
    "LLAMA Model Path": "LLAMA Model Path",
    "Labeling Device": "Labeling Device",
    "LoRA Model to be merged": "LoRA Model to be merged",
    "Maximum Audio Duration": "Maximum Audio Duration",
    "Maximum Length per Sample": "Maximum Length per Sample",
    "Maximum Training Steps": "Maximum Training Steps",
    "Maximum tokens per batch, 0 means no limit": "Maximum tokens per batch, 0 means no limit",
    "Merge": "Merge",
    "Merge LoRA": "Merge LoRA",
    "Merge successfully": "Merge successfully",
    "Minimum Audio Duration": "Minimum Audio Duration",
    "Model Output Path": "Model Output Path",
    "Model Size": "Model Size",
    "Move": "Move",
    "Move files successfully": "Move files successfully",
    "No audio generated, please check the input text.": "No audio generated, please check the input text.",
    "No selected options": "No selected options",
    "Number of Workers": "Number of Workers",
    "Open Inference Server": "Open Inference Server",
    "Open Labeler WebUI": "Open Labeler WebUI",
    "Open Tensorboard": "Open Tensorboard",
    "Opened labeler in browser": "Opened labeler in browser",
    "Optional Label Language": "Optional Label Language",
    "Optional online ver": "Optional online ver",
    "Output Path": "Output Path",
    "Path error, please check the model file exists in the corresponding path": "Path error, please check the model file exists in the corresponding path",
    "Precision": "Precision",
    "Probability of applying Speaker Condition": "Probability of applying Speaker Condition",
    "Put your text here.": "Put your text here.",
    "Reference Audio": "Reference Audio",
    "Reference Text": "Reference Text",
    "Related code and weights are released under CC BY-NC-SA 4.0 License.": "Related code and weights are released under CC BY-NC-SA 4.0 License.",
    "Remove Selected Data": "Remove Selected Data",
    "Removed path successfully!": "Removed path successfully!",
    "Repetition Penalty": "Repetition Penalty",
    "Save model every n steps": "Save model every n steps",
    "Select LLAMA ckpt": "Select LLAMA ckpt",
    "Select VITS ckpt": "Select VITS ckpt",
    "Select VQGAN ckpt": "Select VQGAN ckpt",
    "Select source file processing method": "Select source file processing method",
    "Select the model to be trained (Depending on the Tab page you are on)": "Select the model to be trained (Depending on the Tab page you are on)",
    "Selected: {}": "Selected: {}",
    "Speaker": "Speaker",
    "Speaker is identified by the folder name": "Speaker is identified by the folder name",
    "Start Training": "Start Training",
    "Streaming Audio": "Streaming Audio",
    "Streaming Generate": "Streaming Generate",
    "Tensorboard Host": "Tensorboard Host",
    "Tensorboard Log Path": "Tensorboard Log Path",
    "Tensorboard Port": "Tensorboard Port",
    "Tensorboard interface is closed": "Tensorboard interface is closed",
    "Tensorboard interface is launched at {}": "Tensorboard interface is launched at {}",
    "Text is too long, please keep it under {} characters.": "Text is too long, please keep it under {} characters.",
    "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.",
    "Training Configuration": "Training Configuration",
    "Training Error": "Training Error",
    "Training stopped": "Training stopped",
    "Type name of the speaker": "Type name of the speaker",
    "Type the path or select from the dropdown": "Type the path or select from the dropdown",
    "Use LoRA": "Use LoRA",
    "Use LoRA can save GPU memory, but may reduce the quality of the model": "Use LoRA can save GPU memory, but may reduce the quality of the model",
    "Use filelist": "Use filelist",
    "Use large for 10G+ GPU, medium for 5G, small for 2G": "Use large for 10G+ GPU, medium for 5G, small for 2G",
    "VITS Configuration": "VITS Configuration",
    "VQGAN Configuration": "VQGAN Configuration",
    "Validation Batch Size": "Validation Batch Size",
    "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "View the status of the preprocessing folder (use the slider to control the depth of the tree)",
    "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.",
    "WebUI Host": "WebUI Host",
    "WebUI Port": "WebUI Port",
    "Whisper Model": "Whisper Model",
    "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).",
    "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU",
    "latest": "latest",
    "new": "new",
    "Realtime Transform Text": "Realtime Transform Text",
    "Normalization Result Preview (Currently Only Chinese)": "Normalization Result Preview (Currently Only Chinese)",
    "Text Normalization": "Text Normalization"
}
fish_speech/i18n/locale/es_ES.json
ADDED
@@ -0,0 +1,122 @@
{
    "16-mixed is recommended for 10+ series GPU": "se recomienda 16-mixed para GPU de la serie 10+",
    "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 a 10 segundos de audio de referencia, útil para especificar el hablante.",
    "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "Un modelo de texto a voz basado en VQ-GAN y Llama desarrollado por [Fish Audio](https://fish.audio).",
    "Accumulate Gradient Batches": "Acumular lotes de gradientes",
    "Add to Processing Area": "Agregar al Área de Procesamiento",
    "Added path successfully!": "¡Ruta agregada exitosamente!",
    "Advanced Config": "Configuración Avanzada",
    "Base LLAMA Model": "Modelo Base LLAMA",
    "Batch Inference": "Inferencia por Lote",
    "Batch Size": "Tamaño del Lote",
    "Changing with the Model Path": "Cambiando con la Ruta del Modelo",
    "Chinese": "Chino",
    "Compile Model": "Compilar Modelo",
    "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compilar el modelo puede reducir significativamente el tiempo de inferencia, pero aumentará el tiempo de inicio en frío",
    "Copy": "Copiar",
    "Data Preprocessing": "Preprocesamiento de Datos",
    "Data Preprocessing Path": "Ruta de Preprocesamiento de Datos",
    "Data Source": "Fuente de Datos",
    "Decoder Model Config": "Configuración del modelo decodificador",
    "Decoder Model Path": "Ruta del modelo decodificador",
    "Disabled": "Desactivado",
    "Enable Reference Audio": "Habilitar Audio de Referencia",
    "English": "Inglés",
    "Error Message": "Mensaje de Error",
    "File Preprocessing": "Preprocesamiento de Archivos",
    "Generate": "Generar",
    "Generated Audio": "Audio Generado",
    "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "Si no hay texto correspondiente para el audio, aplique ASR para asistencia, soporte para formato .txt o .lab",
    "Infer interface is closed": "La interfaz de inferencia está cerrada",
    "Inference Configuration": "Configuración de Inferencia",
    "Inference Server Configuration": "Configuración del Servidor de Inferencia",
    "Inference Server Error": "Error del Servidor de Inferencia",
    "Inferring interface is launched at {}": "La interfaz de inferencia se ha lanzado en {}",
    "Initial Learning Rate": "Tasa de Aprendizaje Inicial",
    "Input Audio & Source Path for Transcription": "Audio de Entrada y Ruta de Origen para Transcripción",
    "Input Text": "Texto de Entrada",
    "Invalid path: {}": "Ruta inválida: {}",
    "It is recommended to use CUDA, if you have low configuration, use CPU": "Se recomienda usar CUDA, si tiene una configuración baja, use CPU",
    "Iterative Prompt Length, 0 means off": "Longitud de la Indicación Iterativa, 0 significa apagado",
    "Japanese": "Japonés",
    "LLAMA Configuration": "Configuración de LLAMA",
    "LLAMA Model Config": "Configuración del Modelo LLAMA",
    "LLAMA Model Path": "Ruta del Modelo LLAMA",
    "Labeling Device": "Dispositivo de Etiquetado",
    "LoRA Model to be merged": "Modelo LoRA a fusionar",
    "Maximum Audio Duration": "Duración máxima de audio",
    "Maximum Length per Sample": "Longitud Máxima por Muestra",
    "Maximum Training Steps": "Pasos Máximos de Entrenamiento",
    "Maximum tokens per batch, 0 means no limit": "Máximo de tokens por lote, 0 significa sin límite",
    "Merge": "Fusionar",
    "Merge LoRA": "Fusionar LoRA",
    "Merge successfully": "Fusionado exitosamente",
    "Minimum Audio Duration": "Duración mínima de audio",
    "Model Output Path": "Ruta de Salida del Modelo",
    "Model Size": "Tamaño del Modelo",
    "Move": "Mover",
    "Move files successfully": "Archivos movidos exitosamente",
    "No audio generated, please check the input text.": "No se generó audio, por favor verifique el texto de entrada.",
    "No selected options": "No hay opciones seleccionadas",
    "Number of Workers": "Número de Trabajadores",
    "Open Inference Server": "Abrir Servidor de Inferencia",
    "Open Labeler WebUI": "Abrir Interfaz Web del Etiquetador",
    "Open Tensorboard": "Abrir Tensorboard",
    "Opened labeler in browser": "Se abrió el etiquetador en el navegador",
    "Optional Label Language": "Idioma de Etiquetado Opcional",
    "Optional online ver": "Ver en línea opcional",
    "Output Path": "Ruta de Salida",
    "Path error, please check the model file exists in the corresponding path": "Error de ruta, por favor verifique que el archivo del modelo exista en la ruta correspondiente",
    "Precision": "Precisión",
    "Probability of applying Speaker Condition": "Probabilidad de aplicar Condición de Hablante",
    "Put your text here.": "Ponga su texto aquí.",
    "Reference Audio": "Audio de Referencia",
    "Reference Text": "Texto de Referencia",
    "Related code and weights are released under CC BY-NC-SA 4.0 License.": "El código relacionado y los pesos se publican bajo la Licencia CC BY-NC-SA 4.0.",
    "Remove Selected Data": "Eliminar Datos Seleccionados",
    "Removed path successfully!": "¡Ruta eliminada exitosamente!",
    "Repetition Penalty": "Penalización por Repetición",
    "Save model every n steps": "Guardar modelo cada n pasos",
    "Select LLAMA ckpt": "Seleccionar punto de control LLAMA",
    "Select VITS ckpt": "Seleccionar punto de control VITS",
    "Select VQGAN ckpt": "Seleccionar punto de control VQGAN",
    "Select source file processing method": "Seleccione el método de procesamiento de archivos fuente",
    "Select the model to be trained (Depending on the Tab page you are on)": "Seleccione el modelo a entrenar (Dependiendo de la pestaña en la que se encuentre)",
    "Selected: {}": "Seleccionado: {}",
    "Speaker": "Hablante",
    "Speaker is identified by the folder name": "El hablante se identifica por el nombre de la carpeta",
    "Start Training": "Iniciar Entrenamiento",
    "Streaming Audio": "transmisión de audio",
    "Streaming Generate": "síntesis en flujo",
    "Tensorboard Host": "Host de Tensorboard",
    "Tensorboard Log Path": "Ruta de Registro de Tensorboard",
    "Tensorboard Port": "Puerto de Tensorboard",
    "Tensorboard interface is closed": "La interfaz de Tensorboard está cerrada",
    "Tensorboard interface is launched at {}": "La interfaz de Tensorboard se ha lanzado en {}",
    "Text is too long, please keep it under {} characters.": "El texto es demasiado largo, por favor manténgalo por debajo de {} caracteres.",
    "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "La ruta de la carpeta de entrada a la izquierda o la lista de archivos. Ya sea que esté marcado o no, se utilizará para el entrenamiento posterior en esta lista.",
    "Training Configuration": "Configuración de Entrenamiento",
    "Training Error": "Error de Entrenamiento",
    "Training stopped": "Entrenamiento detenido",
    "Type name of the speaker": "Escriba el nombre del hablante",
    "Type the path or select from the dropdown": "Escriba la ruta o seleccione de la lista desplegable",
    "Use LoRA": "Usar LoRA",
    "Use LoRA can save GPU memory, but may reduce the quality of the model": "Usar LoRA puede ahorrar memoria GPU, pero puede reducir la calidad del modelo",
    "Use filelist": "Usar lista de archivos",
    "Use large for 10G+ GPU, medium for 5G, small for 2G": "Use grande para GPU de 10G+, mediano para 5G, pequeño para 2G",
    "VITS Configuration": "Configuración de VITS",
    "VQGAN Configuration": "Configuración de VQGAN",
    "Validation Batch Size": "Tamaño del Lote de Validación",
    "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "Vea el estado de la carpeta de preprocesamiento (use el control deslizante para controlar la profundidad del árbol)",
    "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "No somos responsables de ningún mal uso del modelo, por favor considere sus leyes y regulaciones locales antes de usarlo.",
    "WebUI Host": "Host de WebUI",
    "WebUI Port": "Puerto de WebUI",
    "Whisper Model": "Modelo Whisper",
    "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "Puede encontrar el código fuente [aquí](https://github.com/fishaudio/fish-speech) y los modelos [aquí](https://huggingface.co/fishaudio/fish-speech-1).",
    "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "Se recomienda bf16-true para GPU de la serie 30+, se recomienda 16-mixed para GPU de la serie 10+",
    "latest": "más reciente",
    "new": "nuevo",
    "Realtime Transform Text": "Transformación de Texto en Tiempo Real",
    "Normalization Result Preview (Currently Only Chinese)": "Vista Previa del Resultado de Normalización (Actualmente Solo Chino)",
    "Text Normalization": "Normalización de Texto"
}
fish_speech/i18n/locale/ja_JP.json
ADDED
@@ -0,0 +1,123 @@
{
    "16-mixed is recommended for 10+ series GPU": "10シリーズ以降のGPUには16-mixedをお勧めします",
    "5 to 10 seconds of reference audio, useful for specifying speaker.": "話者を指定するのに役立つ、5~10秒のリファレンスオーディオ。",
    "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "[Fish Audio](https://fish.audio)が開発したVQ-GANとLlamaに基づくテキスト音声合成モデル。",
    "Accumulate Gradient Batches": "勾配バッチの累積",
    "Add to Processing Area": "処理エリアに追加",
    "Added path successfully!": "パスの追加に成功しました!",
    "Advanced Config": "詳細設定",
    "Base LLAMA Model": "基本LLAMAモデル",
    "Batch Inference": "バッチ推論",
    "Batch Size": "バッチサイズ",
    "Changing with the Model Path": "モデルのパスに伴って変化する",
    "Chinese": "中国語",
    "Compile Model": "モデルのコンパイル",
    "Compile the model can significantly reduce the inference time, but will increase cold start time": "モデルをコンパイルすると推論時間を大幅に短縮できますが、コールドスタート時間が長くなります",
    "Copy": "コピー",
    "Data Preprocessing": "データ前処理",
    "Data Preprocessing Path": "データ前処理パス",
    "Data Source": "データソース",
    "Decoder Model Config": "デコーダーモデルの構成",
    "Decoder Model Path": "デコーダーモデルのパス",
    "Disabled": "無効",
    "Enable Reference Audio": "リファレンスオーディオを有効にする",
    "English": "英語",
    "Error Message": "エラーメッセージ",
    "File Preprocessing": "文書前処理",
    "Generate": "生成",
    "Generated Audio": "生成されたオーディオ",
    "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "音声に対応するテキストがない場合は、ASRを適用してサポートします。.txtまたは.lab形式をサポートしています",
    "Infer interface is closed": "推論インターフェースが閉じられています",
    "Inference Configuration": "推論設定",
    "Inference Server Configuration": "推論サーバー設定",
    "Inference Server Error": "推論サーバーエラー",
    "Inferring interface is launched at {}": "推論インターフェースが{}で起動しました",
    "Initial Learning Rate": "初期学習率",
    "Input Audio & Source Path for Transcription": "入力オーディオと文字起こしのソースパス",
    "Input Text": "入力テキスト",
    "Invalid path: {}": "無効なパス: {}",
    "It is recommended to use CUDA, if you have low configuration, use CPU": "CUDAの使用をお勧めします。低い構成の場合はCPUを使用してください",
    "Iterative Prompt Length, 0 means off": "反復プロンプト長。0はオフを意味します",
    "Japanese": "日本語",
    "LLAMA Configuration": "LLAMA設定",
    "LLAMA Model Config": "LLAMAモデル設定",
    "LLAMA Model Path": "LLAMAモデルパス",
    "Labeling Device": "ラベリングデバイス",
    "LoRA Model to be merged": "マージするLoRAモデル",
    "Maximum Audio Duration": "最大オーディオの長さ",
    "Maximum Length per Sample": "サンプルあたりの最大長",
    "Maximum Training Steps": "最大トレーニングステップ数",
    "Maximum tokens per batch, 0 means no limit": "バッチあたりの最大トークン数。0は制限なしを意味します",
    "Merge": "マージ",
    "Merge LoRA": "LoRAのマージ",
    "Merge successfully": "マージに成功しました",
    "Minimum Audio Duration": "最小オーディオの長さ",
    "Model Output Path": "モデル出力パス",
    "Model Size": "モデルサイズ",
    "Move": "移動",
    "Move files successfully": "ファイルの移動に成功しました",
    "No audio generated, please check the input text.": "オーディオが生成されていません。入力テキストを確認してください。",
    "No selected options": "選択されたオプションはありません",
    "Number of Workers": "ワーカー数",
    "Open Inference Server": "推論サーバーを開く",
    "Open Labeler WebUI": "ラベラーWebUIを開く",
    "Open Tensorboard": "Tensorboardを開く",
    "Opened labeler in browser": "ブラウザでラベラーを開きました",
    "Optional Label Language": "オプションのラベル言語",
    "Optional online ver": "オプションのオンラインバージョン",
    "Output Path": "出力パス",
    "Path error, please check the model file exists in the corresponding path": "パスエラー。対応するパスにモデルファイルが存在するか確認してください",
    "Precision": "精度",
    "Probability of applying Speaker Condition": "話者条件を適用する確率",
    "Put your text here.": "ここにテキストを入力してください。",
    "Reference Audio": "リファレンスオーディオ",
    "Reference Text": "リファレンステキスト",
    "Related code and weights are released under CC BY-NC-SA 4.0 License.": "関連コードと重みはCC BY-NC-SA 4.0ライセンスの下でリリースされます。",
    "Remove Selected Data": "選択したデータを削除",
    "Removed path successfully!": "パスの削除に成功しました!",
    "Repetition Penalty": "反復ペナルティ",
    "Save model every n steps": "nステップごとにモデルを保存",
    "Select LLAMA ckpt": "LLAMA チェックポイントを選択",
    "Select VITS ckpt": "VITS チェックポイントを選択",
    "Select VQGAN ckpt": "VQGAN チェックポイントを選択",
    "Select source file processing method": "ソースファイルの処理方法を選択",
    "Select the model to be trained (Depending on the Tab page you are on)": "タブページに応じてトレーニングするモデルを選択してください",
    "Selected: {}": "選択済み: {}",
    "Speaker": "話者",
    "Speaker is identified by the folder name": "話者はフォルダ名で識別されます",
    "Start Training": "トレーニング開始",
    "Streaming Audio": "ストリーミングオーディオ",
    "Streaming Generate": "ストリーミング合成",
    "Tensorboard Host": "Tensorboardホスト",
    "Tensorboard Log Path": "Tensorboardログパス",
    "Tensorboard Port": "Tensorboardポート",
    "Tensorboard interface is closed": "Tensorboardインターフェースが閉じられています",
    "Tensorboard interface is launched at {}": "Tensorboardインターフェースが{}で起動されました",
    "Text is too long, please keep it under {} characters.": "テキストが長すぎます。{}文字以内に抑えてください。",
    "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "左側の入力フォルダまたはファイルリストのパス。チェックの有無にかかわらず、このリストの後続のトレーニングに使用されます。",
    "Training Configuration": "トレーニング設定",
    "Training Error": "トレーニングエラー",
    "Training stopped": "トレーニングが停止しました",
    "Type name of the speaker": "話者の名前を入力",
    "Type the path or select from the dropdown": "パスを入力するか、ドロップダウンから選択してください",
    "Use LoRA": "LoRAを使用",
    "Use LoRA can save GPU memory, but may reduce the quality of the model": "LoRAを使用するとGPUメモリを節約できますが、モデルの品質が低下する可能性があります",
    "Use filelist": "ファイルリストを使用",
    "Use large for 10G+ GPU, medium for 5G, small for 2G": "10G以上のGPUには大、5Gには中、2Gには小を使用してください",
    "VITS Configuration": "VITS の構成",
    "VQGAN Configuration": "VQGAN の構成",
    "Validation Batch Size": "検証バッチサイズ",
    "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "前処理フォルダの状態を表示(スライダーを使用してツリーの深さを制御)",
    "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "モデルの誤用については一切責任を負いません。使用する前に、現地の法律と規制を考慮してください。",
    "WebUI Host": "WebUIホスト",
    "WebUI Port": "WebUIポート",
    "Whisper Model": "Whisperモデル",
    "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "ソースコードは[こちら](https://github.com/fishaudio/fish-speech)、モデルは[こちら](https://huggingface.co/fishaudio/fish-speech-1)にあります。",
    "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30シリーズ以降のGPUにはbf16-trueを、10シリーズ以降のGPUには16-mixedをお勧めします",
    "latest": "最新",
    "new": "新規",
    "Realtime Transform Text": "リアルタイム変換テキスト",
    "Normalization Result Preview (Currently Only Chinese)": "正規化結果プレビュー(現在は中国語のみ)",
    "Text Normalization": "テキスト正規化"
}
fish_speech/i18n/locale/pt_BR.json
ADDED
@@ -0,0 +1,133 @@
{
    "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 a 10 segundos de áudio de referência, útil para especificar o orador.",
    "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "Um modelo de texto para fala baseado em VQ-GAN e Llama desenvolvido por [Fish Audio](https://fish.audio).",
    "Accumulate Gradient Batches": "Acumular Lotes de Gradiente",
    "Add to Processing Area": "Adicionar à Área de Processamento",
    "Added path successfully!": "Caminho adicionado com sucesso!",
    "Advanced Config": "Configuração Avançada",
    "Base LLAMA Model": "Modelo LLAMA Base",
    "Batch Inference": "Inferência em Lote",
    "Batch Size": "Tamanho do Lote",
    "Changing with the Model Path": "Alterando com o Caminho do Modelo",
    "Compile Model": "Compilar Modelo",
    "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compilar o modelo pode reduzir significativamente o tempo de inferência, mas aumentará a latência inicial",
    "Copy": "Copiar",
    "Data Preprocessing": "Pré-processamento de Dados",
    "Data Preprocessing Path": "Caminho de Pré-processamento de Dados",
    "Data Source": "Fonte de Dados",
    "Decoder Model Config": "Configuração do Modelo Decodificador",
    "Decoder Model Path": "Caminho do Modelo Decodificador",
    "Disabled": "Desativado",
    "Enable Initial Prompt": "Habilitar Prompt Inicial",
    "Enable Reference Audio": "Habilitar Áudio de Referência",
    "English": "Inglês",
    "Japanese": "Japonês",
    "Chinese": "Chinês",
    "Portuguese": "Português",
    "Spanish": "Espanhol",
    "Error Message": "Mensagem de Erro",
    "Faster Whisper, Up to 5g GPU memory usage": "Faster Whisper (Usa até 5 GB de vRAM)",
    "File Preprocessing": "Pré-processamento de Arquivos",
    "Generate": "Gerar",
    "Generated Audio": "Áudio Gerado",
    "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "Se não houver texto correspondente ao áudio, utilize o ASR para assistência (formatos .txt ou .lab)",
    "Infer interface is closed": "A interface de inferência foi fechada",
    "Inference Configuration": "Configuração de Inferência",
    "Inference Server Configuration": "Configuração do Servidor de Inferência",
    "Inference Server Error": "Erro do Servidor de Inferência",
    "Inferring interface is launched at {}": "A interface de inferência foi iniciada em {}",
    "Initial Learning Rate": "Taxa de Aprendizagem Inicial",
    "Initial Prompt": "Prompt Inicial",
    "Initial prompt can provide contextual or vocabulary-specific guidance to the model.": "O prompt inicial pode fornecer orientação contextual ou específica de vocabulário para o modelo.",
    "Input Audio & Source Path for Transcription": "Entrada de Áudio/Caminho de Origem para Transcrição",
    "Input Text": "Texto de Entrada",
    "Invalid path: {}": "Caminho inválido: {}",
    "It is recommended to use CUDA, if you have low configuration, use CPU": "Para GPUs Nvidia é recomendado usar CUDA. Se não tiver uma GPU Nvidia, use CPU",
    "Iterative Prompt Length, 0 means off": "Comprimento do Prompt Iterativo (0 = desativado)",
    "LLAMA Configuration": "Configuração do LLAMA",
    "LLAMA Model Config": "Configuração do Modelo LLAMA",
    "LLAMA Model Path": "Caminho do Modelo LLAMA",
    "Labeling Device": "Dispositivo de Rotulagem",
    "LoRA Model to be merged": "Modelo LoRA para mesclagem",
    "Maximum Length per Sample": "Comprimento Máximo por Amostra",
    "Maximum Training Steps": "Etapas Máximas de Treinamento",
    "Maximum tokens per batch, 0 means no limit": "Número máximo de tokens por lote, 0 significa sem limite",
    "Merge": "Mesclar",
    "Merge LoRA": "Mesclar LoRA",
    "Merge successfully": "Mesclado com sucesso",
    "Model Output Path": "Caminho de Saída do Modelo",
    "Model Quantization": "Quantização do Modelo",
    "Model Size": "Tamanho do Modelo",
    "Move": "Mover",
    "Move files successfully": "Arquivos movidos com sucesso",
    "No audio generated, please check the input text.": "Nenhum áudio gerado, verifique o texto de entrada.",
    "No selected options": "Nenhuma opção selecionada",
    "Normalization Result Preview (Currently Only Chinese)": "Pré-visualização do Resultado da Normalização (Atualmente Apenas Chinês)",
    "Number of Workers": "Número de Processos",
    "Open Inference Server": "Abrir Servidor de Inferência",
    "Open Labeler WebUI": "Abrir WebUI de Rotulagem",
    "Open Tensorboard": "Abrir Tensorboard",
    "Opened labeler in browser": "WebUI de rotulagem aberta no navegador",
    "Optional Label Language": "Idioma do Rótulo (Opcional)",
    "Optional online ver": "Versão online (opcional)",
    "Output Path": "Caminho de Saída",
    "Path error, please check the model file exists in the corresponding path": "Erro de caminho, verifique se o arquivo do modelo existe no caminho correspondente",
    "Post-quantification Precision": "Precisão Pós-quantização",
    "Precision": "Precisão",
    "Probability of applying Speaker Condition": "Probabilidade de Aplicar Condição de Orador",
    "Put your text here.": "Insira seu texto aqui.",
    "Quantify": "Quantizar",
    "Quantify successfully": "Quantizado com sucesso",
    "Realtime Transform Text": "Transformar Texto em Tempo Real",
    "Reference Audio": "Áudio de Referência",
    "Reference Text": "Texto de Referência",
    "warning": "Aviso",
    "Pre-processing begins...": "O pré-processamento começou!",
    "Related code and weights are released under CC BY-NC-SA 4.0 License.": "O código relacionado e os pesos são licenciados sob a Licença CC BY-NC-SA 4.0.",
    "Remove Selected Data": "Remover Dados Selecionados",
    "Removed path successfully!": "Caminho removido com sucesso!",
    "Repetition Penalty": "Penalidade de Repetição",
    "Save model every n steps": "Salvar modelo a cada n etapas",
    "Select LLAMA ckpt": "Selecionar .ckpt do LLAMA",
    "Select source file processing method": "Escolha como processar o arquivo de origem",
    "Select the model to be trained (Depending on the Tab page you are on)": "Selecione o modelo para o treinamento (dependendo da aba em que você está)",
    "Selected: {}": "Selecionado: {}",
    "Speaker is identified by the folder name": "O orador é identificado pelo nome da pasta",
    "Start Training": "Iniciar Treinamento",
    "Streaming Audio": "Áudio em Streaming",
    "Streaming Generate": "Geração em Streaming",
    "Tensorboard Host": "Host do Tensorboard",
    "Tensorboard Log Path": "Caminho de Log do Tensorboard",
    "Tensorboard Port": "Porta do Tensorboard",
    "Tensorboard interface is closed": "A interface do Tensorboard está fechada",
    "Tensorboard interface is launched at {}": "A interface do Tensorboard foi iniciada em {}",
    "Text Normalization": "Normalização de Texto",
    "Text is too long, please keep it under {} characters.": "O texto é muito longo. Mantenha-o com menos de {} caracteres.",
    "The lower the quantitative precision, the more the effectiveness may decrease, but the greater the efficiency will increase": "Quanto menor a precisão quantitativa, mais a eficácia pode diminuir, mas maior será o aumento da eficiência",
    "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "O caminho da pasta de entrada à esquerda ou a lista de arquivos. Independentemente de estar marcada ou não, ela será utilizada para o treinamento subsequente nesta lista.",
    "Training Configuration": "Configuração de Treinamento",
    "Training Error": "Erro de Treinamento",
    "Training stopped": "Treinamento interrompido!",
    "Type the path or select from the dropdown": "Digite o caminho ou selecione no menu suspenso",
    "Use LoRA": "Usar LoRA",
    "Use LoRA can save GPU memory, but may reduce the quality of the model": "O uso de LoRAs pode economizar memória da GPU, mas também pode reduzir a qualidade",
    "Use filelist": "Usar lista de arquivos",
    "VQGAN Configuration": "Configuração do VQGAN",
    "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "Visualizar o status da pasta de pré-processamento (use o controle deslizante para controlar a profundidade da árvore)",
    "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "Não nos responsabilizamos por qualquer uso indevido do modelo. Por favor, considere as leis e regulamentações locais antes de usá-lo.",
    "WebUI Host": "Host da WebUI",
    "WebUI Port": "Porta da WebUI",
    "Whisper Model": "Modelo Whisper",
    "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "Você pode encontrar o código fonte [aqui](https://github.com/fishaudio/fish-speech) e os modelos [aqui](https://huggingface.co/fishaudio/fish-speech-1).",
    "auto": "automático",
    "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "bf16-true é recomendado para GPUs da série 30+, 16-mixed é recomendado para GPUs da série 10+",
    "latest": "mais recente",
    "new": "novo",
    "This audio introduces the basic concepts and applications of artificial intelligence and machine learning.": "Este áudio introduz os conceitos básicos e aplicações de inteligência artificial e aprendizado de máquina.",
    "You don't need to train this model!": "Não é necessário treinar este modelo!",
    "Yes": "Sim",
    "No": "Não",
    "version:": "versão:",
    "author:": "autor:"
}