PoTaTo721 committed
Commit b2eb230
1 Parent(s): b3355c2

Update to V1.5

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. app.py +286 -340
  2. fish_speech/callbacks/__init__.py +3 -3
  3. fish_speech/callbacks/grad_norm.py +113 -113
  4. fish_speech/configs/base.yaml +87 -87
  5. fish_speech/configs/firefly_gan_vq.yaml +33 -33
  6. fish_speech/configs/lora/r_8_alpha_16.yaml +4 -4
  7. fish_speech/configs/model/dual_ar_2_codebook_large.yaml +0 -9
  8. fish_speech/configs/model/dual_ar_2_codebook_medium.yaml +0 -9
  9. fish_speech/configs/model/dual_ar_2_codebook_small.yaml +0 -13
  10. fish_speech/configs/model/naive_2_codebook_small.yaml +0 -12
  11. fish_speech/configs/text2semantic_finetune.yaml +83 -83
  12. fish_speech/configs/text2semantic_finetune_lora.yaml +0 -13
  13. fish_speech/configs/text2semantic_pretrain.yaml +0 -74
  14. fish_speech/configs/text2semantic_sft.yaml +0 -87
  15. fish_speech/configs/vqgan_finetune.yaml +0 -135
  16. fish_speech/configs/vqgan_pretrain.yaml +0 -139
  17. fish_speech/conversation.py +267 -2
  18. fish_speech/datasets/concat_repeat.py +53 -53
  19. fish_speech/datasets/protos/text-data.proto +24 -24
  20. fish_speech/datasets/protos/text_data_pb2.py +33 -33
  21. fish_speech/datasets/protos/text_data_stream.py +36 -36
  22. fish_speech/datasets/semantic.py +496 -496
  23. fish_speech/datasets/text.py +0 -661
  24. fish_speech/datasets/vqgan.py +147 -147
  25. fish_speech/i18n/README.md +27 -27
  26. fish_speech/i18n/__init__.py +3 -3
  27. fish_speech/i18n/core.py +40 -40
  28. fish_speech/i18n/locale/en_US.json +123 -122
  29. fish_speech/i18n/locale/es_ES.json +123 -122
  30. fish_speech/i18n/locale/ja_JP.json +123 -123
  31. fish_speech/i18n/locale/ko_KR.json +123 -0
  32. fish_speech/i18n/locale/pt_BR.json +133 -133
  33. fish_speech/i18n/locale/zh_CN.json +123 -122
  34. fish_speech/i18n/scan.py +122 -122
  35. fish_speech/models/text2semantic/lit_module.py +202 -202
  36. fish_speech/models/text2semantic/llama.py +887 -779
  37. fish_speech/models/text2semantic/lora.py +92 -92
  38. fish_speech/models/vqgan/lit_module.py +0 -442
  39. fish_speech/models/vqgan/modules/discriminator.py +0 -44
  40. fish_speech/models/vqgan/modules/firefly.py +596 -596
  41. fish_speech/models/vqgan/modules/fsq.py +116 -116
  42. fish_speech/models/vqgan/modules/reference.py +0 -113
  43. fish_speech/models/vqgan/modules/wavenet.py +0 -225
  44. fish_speech/models/vqgan/spectrogram.py +0 -122
  45. fish_speech/models/vqgan/utils.py +94 -94
  46. fish_speech/scheduler.py +40 -40
  47. fish_speech/text/__init__.py +4 -4
  48. fish_speech/text/chn_text_norm/.gitignore +114 -114
  49. fish_speech/text/chn_text_norm/README.md +36 -36
  50. fish_speech/text/chn_text_norm/basic_class.py +172 -172
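
The core of this commit is the bump from the Fish Speech 1.4 checkpoints to 1.5 in app.py (see the diff below). As a minimal sketch, assuming the `huggingface_hub` package is installed, the updated checkpoint download step looks like the line added to app.py:

import os

from huggingface_hub import snapshot_download

# Fetch the V1.5 weights into ./checkpoints, as the updated app.py does at startup.
os.makedirs("checkpoints", exist_ok=True)
snapshot_download(
    repo_id="fishaudio/fish-speech-1.5",
    local_dir="./checkpoints/fish-speech-1.5",
)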
app.py CHANGED
Old side of the diff (removed lines are prefixed with -):

@@ -10,7 +10,7 @@ import gc
 
 # Download if not exists
 os.makedirs("checkpoints", exist_ok=True)
-snapshot_download(repo_id="fishaudio/fish-speech-1.4", local_dir="./checkpoints/fish-speech-1.4")
 
 print("All checkpoints downloaded")
 
@@ -31,11 +31,11 @@ torchaudio.set_audio_backend("soundfile")
 from loguru import logger
 from transformers import AutoTokenizer
 
-from tools.llama.generate import launch_thread_safe_queue
-from tools.vqgan.inference import load_model as load_vqgan_model
 from fish_speech.text.chn_text_norm.text import Text as ChnNormedText
 from tools.api import decode_vq_tokens, encode_reference
-from tools.auto_rerank import batch_asr, calculate_wer, is_chinese, load_model
 from tools.llama.generate import (
     GenerateRequest,
     GenerateResponse,
@@ -44,20 +44,43 @@ from tools.llama.generate import (
 )
 from tools.vqgan.inference import load_model as load_decoder_model
 
 # Make einx happy
 os.environ["EINX_FILTER_TRACEBACK"] = "false"
 
 
 HEADER_MD = """# Fish Speech
 
-## The demo in this space is version 1.4, Please check [Fish Audio](https://fish.audio) for the best model.
-## 该 Demo 为 Fish Speech 1.4 版本, 请在 [Fish Audio](https://fish.audio) 体验最新 DEMO.
 
 A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).
 由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成.
 
-You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.4).
-你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1.4) 找到模型.
 
 Related code and weights are released under CC BY-NC-SA 4.0 License.
 相关代码,权重使用 CC BY-NC-SA 4.0 许可证发布.
@@ -65,8 +88,8 @@ Related code and weights are released under CC BY-NC-SA 4.0 License.
 We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.
 我们不对模型的任何滥用负责,请在使用之前考虑您当地的法律法规.
 
-The model running in this WebUI is Fish Speech V1.4 Medium.
-在此 WebUI 中运行的模型是 Fish Speech V1.4 Medium.
 """
 
 TEXTBOX_PLACEHOLDER = """Put your text here. 在此处输入文本."""
@@ -95,48 +118,77 @@ def build_html_error_message(error):
 
 @GPU_DECORATOR
 @torch.inference_mode()
-def inference(
-    text,
-    enable_reference_audio,
-    reference_audio,
-    reference_text,
-    max_new_tokens,
-    chunk_length,
-    top_p,
-    repetition_penalty,
-    temperature,
-    streaming=False
-):
-    if args.max_gradio_length > 0 and len(text) > args.max_gradio_length:
-        return (
-            None,
-            None,
-            "Text is too long, please keep it under {} characters.".format(
-                args.max_gradio_length
-            ),
         )
 
-    # Parse reference audio aka prompt
-    prompt_tokens = encode_reference(
-        decoder_model=decoder_model,
-        reference_audio=reference_audio,
-        enable_reference_audio=enable_reference_audio,
-    )
 
     # LLAMA Inference
     request = dict(
         device=decoder_model.device,
-        max_new_tokens=max_new_tokens,
-        text=text,
-        top_p=top_p,
-        repetition_penalty=repetition_penalty,
-        temperature=temperature,
         compile=args.compile,
-        iterative_prompt=chunk_length > 0,
-        chunk_length=chunk_length,
-        max_length=2048,
-        prompt_tokens=prompt_tokens if enable_reference_audio else None,
-        prompt_text=reference_text if enable_reference_audio else None,
     )
 
     response_queue = queue.Queue()
@@ -152,19 +204,15 @@ def inference(
     while True:
         result: WrappedGenerateResponse = response_queue.get()
         if result.status == "error":
-            return None, None, build_html_error_message(result.response)
 
         result: GenerateResponse = result.response
         if result.action == "next":
             break
 
-        with torch.autocast(
-            device_type=(
-                "cpu"
-                if decoder_model.device.type == "mps"
-                else decoder_model.device.type
-            ),
-            dtype=args.precision,
         ):
             fake_audios = decode_vq_tokens(
                 decoder_model=decoder_model,
@@ -179,79 +227,24 @@ def inference(
                 None,
                 None,
                 build_html_error_message(
-                    "No audio generated, please check the input text."
                 ),
             )
 
-    # Return the final audio
     audio = np.concatenate(segments, axis=0)
-    return None, (decoder_model.spec_transform.sample_rate, audio), None
 
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
         gc.collect()
 
-
-def inference_with_auto_rerank(
-    text,
-    enable_reference_audio,
-    reference_audio,
-    reference_text,
-    max_new_tokens,
-    chunk_length,
-    top_p,
-    repetition_penalty,
-    temperature,
-    use_auto_rerank,
-    streaming=False,
-):
-    max_attempts = 2 if use_auto_rerank else 1
-    best_wer = float("inf")
-    best_audio = None
-    best_sample_rate = None
-
-    for attempt in range(max_attempts):
-        _, (sample_rate, audio), message = inference(
-            text,
-            enable_reference_audio,
-            reference_audio,
-            reference_text,
-            max_new_tokens,
-            chunk_length,
-            top_p,
-            repetition_penalty,
-            temperature,
-            streaming=False,
-        )
-
-        if audio is None:
-            return None, None, message
-
-        if not use_auto_rerank:
-            return None, (sample_rate, audio), None
-
-        asr_result = batch_asr(asr_model, [audio], sample_rate)[0]
-        wer = calculate_wer(text, asr_result["text"])
-
-        if wer <= 0.3 and not asr_result["huge_gap"]:
-            return None, (sample_rate, audio), None
-
-        if wer < best_wer:
-            best_wer = wer
-            best_audio = audio
-            best_sample_rate = sample_rate
-
-        if attempt == max_attempts - 1:
-            break
-
-    return None, (best_sample_rate, best_audio), None
-
-
 n_audios = 4
 
 global_audio_list = []
 global_error_list = []
 
 def inference_wrapper(
     text,
     enable_reference_audio,
@@ -262,14 +255,14 @@ def inference_wrapper(
     top_p,
     repetition_penalty,
     temperature,
     batch_infer_num,
-    if_load_asr_model,
 ):
     audios = []
     errors = []
 
     for _ in range(batch_infer_num):
-        result = inference_with_auto_rerank(
             text,
             enable_reference_audio,
             reference_audio,
@@ -279,10 +272,10 @@
             top_p,
             repetition_penalty,
             temperature,
-            if_load_asr_model,
         )
 
-        _, audio_data, error_message = result
 
        audios.append(
            gr.Audio(value=audio_data if audio_data else None, visible=True),
@@ -314,52 +307,17 @@ def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
    buffer.close()
    return wav_header_bytes
 
-
 def normalize_text(user_input, use_normalization):
    if use_normalization:
        return ChnNormedText(raw_text=user_input).normalize()
    else:
        return user_input
 
-
-asr_model = None
-
-
-def change_if_load_asr_model(if_load):
-    global asr_model
-
-    if if_load:
-        gr.Warning("Loading faster whisper model...")
-        if asr_model is None:
-            asr_model = load_model()
-        return gr.Checkbox(label="Unload faster whisper model", value=if_load)
-
-    if if_load is False:
-        gr.Warning("Unloading faster whisper model...")
-        del asr_model
-        asr_model = None
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            gc.collect()
-        return gr.Checkbox(label="Load faster whisper model", value=if_load)
-
-
-def change_if_auto_label(if_load, if_auto_label, enable_ref, ref_audio, ref_text):
-    if if_load and asr_model is not None:
-        if (
-            if_auto_label
-            and enable_ref
-            and ref_audio is not None
-            and ref_text.strip() == ""
-        ):
-            data, sample_rate = librosa.load(ref_audio)
-            res = batch_asr(asr_model, [data], sample_rate)[0]
-            ref_text = res["text"]
-    else:
-        gr.Warning("Whisper model not loaded!")
-
-    return gr.Textbox(value=ref_text)
-
 
def build_app():
    with gr.Blocks(theme=gr.themes.Base()) as app:
@@ -377,202 +335,185 @@ def build_app():
        with gr.Row():
            with gr.Column(scale=3):
                text = gr.Textbox(
-                    label="Input Text", placeholder=TEXTBOX_PLACEHOLDER, lines=10
                )
                refined_text = gr.Textbox(
-                    label="Realtime Transform Text",
-                    placeholder=
-                    "Normalization Result Preview (Currently Only Chinese)",
                    lines=5,
                    interactive=False,
                )
 
                with gr.Row():
-                    if_refine_text = gr.Checkbox(
-                        label="Text Normalization (ZH)",
-                        value=False,
-                        scale=1,
-                    )
-
-                    if_load_asr_model = gr.Checkbox(
-                        label="Load / Unload ASR model for auto-reranking",
                        value=False,
-                        scale=3,
                    )
 
                with gr.Row():
-                    with gr.Tab(label="Advanced Config"):
-                        chunk_length = gr.Slider(
-                            label="Iterative Prompt Length, 0 means off",
-                            minimum=0,
-                            maximum=500,
-                            value=200,
-                            step=8,
-                        )
-
-                        max_new_tokens = gr.Slider(
-                            label="Maximum tokens per batch, 0 means no limit",
-                            minimum=0,
-                            maximum=2048,
-                            value=0,  # 0 means no limit
-                            step=8,
-                        )
-
-                        top_p = gr.Slider(
-                            label="Top-P",
-                            minimum=0.6,
-                            maximum=0.9,
-                            value=0.7,
-                            step=0.01,
-                        )
-
-                        repetition_penalty = gr.Slider(
-                            label="Repetition Penalty",
-                            minimum=1,
-                            maximum=1.5,
-                            value=1.2,
-                            step=0.01,
-                        )
-
-                        temperature = gr.Slider(
-                            label="Temperature",
-                            minimum=0.6,
-                            maximum=0.9,
-                            value=0.7,
-                            step=0.01,
-                        )
-
-                    with gr.Tab(label="Reference Audio"):
-                        gr.Markdown(
-                            "5 to 10 seconds of reference audio, useful for specifying speaker."
-                        )
-
-                        enable_reference_audio = gr.Checkbox(
-                            label="Enable Reference Audio",
-                        )
-
-                        # Add dropdown for selecting example audio files
-                        example_audio_files = [f for f in os.listdir("examples") if f.endswith(".wav")]
-                        example_audio_dropdown = gr.Dropdown(
-                            label="Select Example Audio",
-                            choices=[""] + example_audio_files,
-                            value=""
-                        )
-
-                        reference_audio = gr.Audio(
-                            label="Reference Audio",
-                            type="filepath",
-                        )
-                        with gr.Row():
-                            if_auto_label = gr.Checkbox(
-                                label="Auto Labeling",
-                                min_width=100,
-                                scale=0,
-                                value=False,
-                            )
-                            reference_text = gr.Textbox(
-                                label="Reference Text",
-                                lines=1,
-                                placeholder="在一无所知中,梦里的一天结束了,一个新的「轮回」便会开始。",
-                                value="",
-                            )
-                    with gr.Tab(label="Batch Inference"):
-                        batch_infer_num = gr.Slider(
-                            label="Batch infer nums",
-                            minimum=1,
-                            maximum=n_audios,
-                            step=1,
-                            value=1,
-                        )
 
            with gr.Column(scale=3):
-                for _ in range(n_audios):
-                    with gr.Row():
-                        error = gr.HTML(
-                            label="Error Message",
-                            visible=True if _ == 0 else False,
-                        )
-                        global_error_list.append(error)
-                    with gr.Row():
-                        audio = gr.Audio(
-                            label="Generated Audio",
-                            type="numpy",
-                            interactive=False,
-                            visible=True if _ == 0 else False,
-                        )
-                        global_audio_list.append(audio)
-
                with gr.Row():
-                    stream_audio = gr.Audio(
-                        label="Streaming Audio",
-                        streaming=True,
-                        autoplay=True,
                        interactive=False,
-                        show_download_button=True,
                    )
        with gr.Row():
            with gr.Column(scale=3):
                generate = gr.Button(
-                    value="\U0001F3A7 " + "Generate", variant="primary"
-                )
-                generate_stream = gr.Button(
-                    value="\U0001F3A7 " + "Streaming Generate",
-                    variant="primary",
                )
 
        text.input(
-            fn=normalize_text, inputs=[text, if_refine_text], outputs=[refined_text]
        )
 
-        if_load_asr_model.change(
-            fn=change_if_load_asr_model,
-            inputs=[if_load_asr_model],
-            outputs=[if_load_asr_model],
-        )
-
-        if_auto_label.change(
-            fn=lambda: gr.Textbox(value=""),
-            inputs=[],
-            outputs=[reference_text],
-        ).then(
-            fn=change_if_auto_label,
-            inputs=[
-                if_load_asr_model,
-                if_auto_label,
-                enable_reference_audio,
-                reference_audio,
-                reference_text,
-            ],
-            outputs=[reference_text],
-        )
-
-        def select_example_audio(audio_file):
-            if audio_file:
-                audio_path = os.path.join("examples", audio_file)
-                lab_file = os.path.splitext(audio_file)[0] + ".lab"
-                lab_path = os.path.join("examples", lab_file)
-
-                if os.path.exists(lab_path):
-                    with open(lab_path, "r", encoding="utf-8") as f:
-                        lab_content = f.read().strip()
-                else:
-                    lab_content = ""
-
-                return audio_path, lab_content, True
-            return None, "", False
-
-        # Connect the dropdown to update reference audio and text
-        example_audio_dropdown.change(
-            fn=select_example_audio,
-            inputs=[example_audio_dropdown],
-            outputs=[reference_audio, reference_text, enable_reference_audio]
-        )
-        # # Submit
        generate.click(
            inference_wrapper,
            [
                refined_text,
-                enable_reference_audio,
                reference_audio,
                reference_text,
                max_new_tokens,
@@ -580,26 +521,28 @@ def build_app():
                top_p,
                repetition_penalty,
                temperature,
-                batch_infer_num,
-                if_load_asr_model,
            ],
-            [stream_audio, *global_audio_list, *global_error_list],
            concurrency_limit=1,
        )
 
    return app
 
 
def parse_args():
    parser = ArgumentParser()
    parser.add_argument(
        "--llama-checkpoint-path",
        type=Path,
-        default="checkpoints/fish-speech-1.4",
    )
    parser.add_argument(
        "--decoder-checkpoint-path",
        type=Path,
-        default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
    )
    parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq")
    parser.add_argument("--device", type=str, default="cuda")
@@ -634,17 +577,20 @@ if __name__ == "__main__":
 
    # Dry run to check if the model is loaded correctly and avoid the first-time latency
    list(
-        inference(
-            text="Hello, world!",
-            enable_reference_audio=False,
-            reference_audio=None,
-            reference_text="",
-            max_new_tokens=0,
-            chunk_length=200,
-            top_p=0.7,
-            repetition_penalty=1.2,
-            temperature=0.7,
-        )
    )
 
    logger.info("Warming up done, launching the web UI...")
New side of the diff (added lines are prefixed with +):

 
 # Download if not exists
 os.makedirs("checkpoints", exist_ok=True)
+snapshot_download(repo_id="fishaudio/fish-speech-1.5", local_dir="./checkpoints/fish-speech-1.5")
 
 print("All checkpoints downloaded")
 
 from loguru import logger
 from transformers import AutoTokenizer
 
+from fish_speech.i18n import i18n
 from fish_speech.text.chn_text_norm.text import Text as ChnNormedText
+from fish_speech.utils import autocast_exclude_mps, set_seed
 from tools.api import decode_vq_tokens, encode_reference
+from tools.file import AUDIO_EXTENSIONS, list_files
 from tools.llama.generate import (
     GenerateRequest,
     GenerateResponse,
 )
 from tools.vqgan.inference import load_model as load_decoder_model
 
+from tools.schema import (
+    GLOBAL_NUM_SAMPLES,
+    ASRPackRequest,
+    ServeASRRequest,
+    ServeASRResponse,
+    ServeASRSegment,
+    ServeAudioPart,
+    ServeForwardMessage,
+    ServeMessage,
+    ServeRequest,
+    ServeResponse,
+    ServeStreamDelta,
+    ServeStreamResponse,
+    ServeTextPart,
+    ServeTimedASRResponse,
+    ServeTTSRequest,
+    ServeVQGANDecodeRequest,
+    ServeVQGANDecodeResponse,
+    ServeVQGANEncodeRequest,
+    ServeVQGANEncodeResponse,
+    ServeVQPart,
+    ServeReferenceAudio
+)
 # Make einx happy
 os.environ["EINX_FILTER_TRACEBACK"] = "false"
 
 
 HEADER_MD = """# Fish Speech
 
+## The demo in this space is version 1.5, Please check [Fish Audio](https://fish.audio) for the best model.
+## 该 Demo 为 Fish Speech 1.5 版本, 请在 [Fish Audio](https://fish.audio) 体验最新 DEMO.
 
 A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).
 由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成.
 
+You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.5).
+你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1.5) 找到模型.
 
 Related code and weights are released under CC BY-NC-SA 4.0 License.
 相关代码,权重使用 CC BY-NC-SA 4.0 许可证发布.
 
 We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.
 我们不对模型的任何滥用负责,请在使用之前考虑您当地的法律法规.
 
+The model running in this WebUI is Fish Speech V1.5 Medium.
+在此 WebUI 中运行的模型是 Fish Speech V1.5 Medium.
 """
 
 TEXTBOX_PLACEHOLDER = """Put your text here. 在此处输入文本."""
 
 @GPU_DECORATOR
 @torch.inference_mode()
+def inference(req: ServeTTSRequest):
+
+    global prompt_tokens, prompt_texts
+
+    idstr: str | None = req.reference_id
+    if idstr is not None:
+        ref_folder = Path("references") / idstr
+        ref_folder.mkdir(parents=True, exist_ok=True)
+        ref_audios = list_files(
+            ref_folder, AUDIO_EXTENSIONS, recursive=True, sort=False
         )
 
+        if req.use_memory_cache == "never" or (
+            req.use_memory_cache == "on-demand" and len(prompt_tokens) == 0
+        ):
+            prompt_tokens = [
+                encode_reference(
+                    decoder_model=decoder_model,
+                    reference_audio=audio_to_bytes(str(ref_audio)),
+                    enable_reference_audio=True,
+                )
+                for ref_audio in ref_audios
+            ]
+            prompt_texts = [
+                read_ref_text(str(ref_audio.with_suffix(".lab")))
+                for ref_audio in ref_audios
+            ]
+        else:
+            logger.info("Use same references")
+
+    else:
+        # Parse reference audio aka prompt
+        refs = req.references
+
+        if req.use_memory_cache == "never" or (
+            req.use_memory_cache == "on-demand" and len(prompt_tokens) == 0
+        ):
+            prompt_tokens = [
+                encode_reference(
+                    decoder_model=decoder_model,
+                    reference_audio=ref.audio,
+                    enable_reference_audio=True,
+                )
+                for ref in refs
+            ]
+            prompt_texts = [ref.text for ref in refs]
+        else:
+            logger.info("Use same references")
+
+    if req.seed is not None:
+        set_seed(req.seed)
+        logger.warning(f"set seed: {req.seed}")
 
     # LLAMA Inference
     request = dict(
         device=decoder_model.device,
+        max_new_tokens=req.max_new_tokens,
+        text=(
+            req.text
+            if not req.normalize
+            else ChnNormedText(raw_text=req.text).normalize()
+        ),
+        top_p=req.top_p,
+        repetition_penalty=req.repetition_penalty,
+        temperature=req.temperature,
         compile=args.compile,
+        iterative_prompt=req.chunk_length > 0,
+        chunk_length=req.chunk_length,
+        max_length=4096,
+        prompt_tokens=prompt_tokens,
+        prompt_text=prompt_texts,
     )
 
     response_queue = queue.Queue()
 
     while True:
         result: WrappedGenerateResponse = response_queue.get()
         if result.status == "error":
+            yield None, None, build_html_error_message(result.response)
+            break
 
         result: GenerateResponse = result.response
         if result.action == "next":
             break
 
+        with autocast_exclude_mps(
+            device_type=decoder_model.device.type, dtype=args.precision
         ):
             fake_audios = decode_vq_tokens(
                 decoder_model=decoder_model,
 
                 None,
                 None,
                 build_html_error_message(
+                    i18n("No audio generated, please check the input text.")
                 ),
             )
 
+    # No matter streaming or not, we need to return the final audio
     audio = np.concatenate(segments, axis=0)
+    yield None, (decoder_model.spec_transform.sample_rate, audio), None
 
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
         gc.collect()
 
 n_audios = 4
 
 global_audio_list = []
 global_error_list = []
 
+
 def inference_wrapper(
     text,
     enable_reference_audio,
     top_p,
     repetition_penalty,
     temperature,
+    seed,
     batch_infer_num,
 ):
     audios = []
     errors = []
 
     for _ in range(batch_infer_num):
+        result = inference(
             text,
             enable_reference_audio,
             reference_audio,
             top_p,
             repetition_penalty,
             temperature,
+            seed,
         )
 
+        _, audio_data, error_message = next(result)
 
         audios.append(
             gr.Audio(value=audio_data if audio_data else None, visible=True),
 
     buffer.close()
     return wav_header_bytes
 
 def normalize_text(user_input, use_normalization):
     if use_normalization:
         return ChnNormedText(raw_text=user_input).normalize()
     else:
         return user_input
 
+def update_examples():
+    examples_dir = Path("references")
+    examples_dir.mkdir(parents=True, exist_ok=True)
+    example_audios = list_files(examples_dir, AUDIO_EXTENSIONS, recursive=True)
+    return gr.Dropdown(choices=example_audios + [""])
 
 def build_app():
     with gr.Blocks(theme=gr.themes.Base()) as app:
         with gr.Row():
             with gr.Column(scale=3):
                 text = gr.Textbox(
+                    label=i18n("Input Text"), placeholder=TEXTBOX_PLACEHOLDER, lines=10
                 )
                 refined_text = gr.Textbox(
+                    label=i18n("Realtime Transform Text"),
+                    placeholder=i18n(
+                        "Normalization Result Preview (Currently Only Chinese)"
+                    ),
                     lines=5,
                     interactive=False,
                 )
 
                 with gr.Row():
+                    normalize = gr.Checkbox(
+                        label=i18n("Text Normalization"),
                         value=False,
                     )
 
                 with gr.Row():
+                    with gr.Column():
+                        with gr.Tab(label=i18n("Advanced Config")):
+                            with gr.Row():
+                                chunk_length = gr.Slider(
+                                    label=i18n("Iterative Prompt Length, 0 means off"),
+                                    minimum=0,
+                                    maximum=300,
+                                    value=200,
+                                    step=8,
+                                )
+
+                                max_new_tokens = gr.Slider(
+                                    label=i18n(
+                                        "Maximum tokens per batch, 0 means no limit"
+                                    ),
+                                    minimum=0,
+                                    maximum=2048,
+                                    value=0,
+                                    step=8,
+                                )
+
+                            with gr.Row():
+                                top_p = gr.Slider(
+                                    label="Top-P",
+                                    minimum=0.6,
+                                    maximum=0.9,
+                                    value=0.7,
+                                    step=0.01,
+                                )
+
+                                repetition_penalty = gr.Slider(
+                                    label=i18n("Repetition Penalty"),
+                                    minimum=1,
+                                    maximum=1.5,
+                                    value=1.2,
+                                    step=0.01,
+                                )
+
+                            with gr.Row():
+                                temperature = gr.Slider(
+                                    label="Temperature",
+                                    minimum=0.6,
+                                    maximum=0.9,
+                                    value=0.7,
+                                    step=0.01,
+                                )
+                                seed = gr.Number(
+                                    label="Seed",
+                                    info="0 means randomized inference, otherwise deterministic",
+                                    value=0,
+                                )
+
+                        with gr.Tab(label=i18n("Reference Audio")):
+                            with gr.Row():
+                                gr.Markdown(
+                                    i18n(
+                                        "5 to 10 seconds of reference audio, useful for specifying speaker."
+                                    )
+                                )
+                            with gr.Row():
+                                reference_id = gr.Textbox(
+                                    label=i18n("Reference ID"),
+                                    placeholder="Leave empty to use uploaded references",
+                                )
+
+                            with gr.Row():
+                                use_memory_cache = gr.Radio(
+                                    label=i18n("Use Memory Cache"),
+                                    choices=["never", "on-demand", "always"],
+                                    value="on-demand",
+                                )
+
+                            with gr.Row():
+                                reference_audio = gr.Audio(
+                                    label=i18n("Reference Audio"),
+                                    type="filepath",
+                                )
+                            with gr.Row():
+                                reference_text = gr.Textbox(
+                                    label=i18n("Reference Text"),
+                                    lines=1,
+                                    placeholder="在一无所知中,梦里的一天结束了,一个新的「轮回」便会开始。",
+                                    value="",
+                                )
 
             with gr.Column(scale=3):
                 with gr.Row():
+                    error = gr.HTML(
+                        label=i18n("Error Message"),
+                        visible=True,
+                    )
+                with gr.Row():
+                    audio = gr.Audio(
+                        label=i18n("Generated Audio"),
+                        type="numpy",
                         interactive=False,
+                        visible=True,
                     )
+
         with gr.Row():
             with gr.Column(scale=3):
                 generate = gr.Button(
+                    value="\U0001F3A7 " + i18n("Generate"), variant="primary"
                 )
 
         text.input(
+            fn=normalize_text, inputs=[text, normalize], outputs=[refined_text]
         )
 
+        def inference_wrapper(
+            text,
+            normalize,
+            reference_id,
+            reference_audio,
+            reference_text,
+            max_new_tokens,
+            chunk_length,
+            top_p,
+            repetition_penalty,
+            temperature,
+            seed,
+            use_memory_cache,
+        ):
+            references = []
+            if reference_audio:
+                # 将文件路径转换为字节
+                with open(reference_audio, 'rb') as audio_file:
+                    audio_bytes = audio_file.read()
+                references = [
+                    ServeReferenceAudio(audio=audio_bytes, text=reference_text)
+                ]
+
+            req = ServeTTSRequest(
+                text=text,
+                normalize=normalize,
+                reference_id=reference_id if reference_id else None,
+                references=references,
+                max_new_tokens=max_new_tokens,
+                chunk_length=chunk_length,
+                top_p=top_p,
+                repetition_penalty=repetition_penalty,
+                temperature=temperature,
+                seed=int(seed) if seed else None,
+                use_memory_cache=use_memory_cache,
+            )
+
+            for result in inference(req):
+                if result[2]:  # Error message
+                    return None, result[2]
+                elif result[1]:  # Audio data
+                    return result[1], None
+
+            return None, i18n("No audio generated")
+
+        # Submit
         generate.click(
             inference_wrapper,
             [
                 refined_text,
+                normalize,
+                reference_id,
                 reference_audio,
                 reference_text,
                 max_new_tokens,
                 top_p,
                 repetition_penalty,
                 temperature,
+                seed,
+                use_memory_cache,
             ],
+            [audio, error],
             concurrency_limit=1,
         )
+
     return app
 
 
+
 def parse_args():
     parser = ArgumentParser()
     parser.add_argument(
         "--llama-checkpoint-path",
         type=Path,
+        default="checkpoints/fish-speech-1.5",
     )
     parser.add_argument(
         "--decoder-checkpoint-path",
         type=Path,
+        default="checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
     )
     parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq")
     parser.add_argument("--device", type=str, default="cuda")
 
     # Dry run to check if the model is loaded correctly and avoid the first-time latency
     list(
+        inference(
+            ServeTTSRequest(
+                text="Hello world.",
+                references=[],
+                reference_id=None,
+                max_new_tokens=0,
+                chunk_length=200,
+                top_p=0.7,
+                repetition_penalty=1.5,
+                temperature=0.7,
+                emotion=None,
+                format="wav",
+            )
+        )
     )
 
     logger.info("Warming up done, launching the web UI...")
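
For reference, a minimal sketch of driving the new request-object API introduced above. It uses only fields visible in this diff; `ServeTTSRequest` and `ServeReferenceAudio` come from `tools.schema`, the reference clip path and transcript are hypothetical, and it assumes the models and module-level state (`decoder_model`, `llama_queue`, `args`) are already initialized the way app.py's `__main__` block initializes them:

from tools.schema import ServeReferenceAudio, ServeTTSRequest

# Hypothetical reference clip and transcript; any short WAV plus its text works.
with open("references/speaker1.wav", "rb") as f:
    ref = ServeReferenceAudio(audio=f.read(), text="Transcript of the reference clip.")

req = ServeTTSRequest(
    text="Hello world.",
    normalize=False,
    reference_id=None,              # or a folder name under references/
    references=[ref],
    max_new_tokens=0,
    chunk_length=200,
    top_p=0.7,
    repetition_penalty=1.2,
    temperature=0.7,
    seed=None,                      # None keeps generation non-deterministic
    use_memory_cache="on-demand",
)

# inference() is now a generator that yields (stream, (sample_rate, audio), error).
for _, audio, error in inference(req):
    if error:
        raise RuntimeError(error)
    sample_rate, waveform = audio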
fish_speech/callbacks/__init__.py CHANGED
@@ -1,3 +1,3 @@
-from .grad_norm import GradNormMonitor
-
-__all__ = ["GradNormMonitor"]
+from .grad_norm import GradNormMonitor
+
+__all__ = ["GradNormMonitor"]
fish_speech/callbacks/grad_norm.py CHANGED
@@ -1,113 +1,113 @@
1
- from typing import Optional, Union
2
-
3
- import lightning.pytorch as pl
4
- import torch
5
- from lightning import LightningModule, Trainer
6
- from lightning.pytorch.callbacks import Callback
7
- from torch import Tensor, nn
8
- from torch.utils._foreach_utils import (
9
- _group_tensors_by_device_and_dtype,
10
- _has_foreach_support,
11
- )
12
-
13
-
14
- @torch.no_grad()
15
- def grad_norm(
16
- parameters: Union[Tensor, list[Tensor]],
17
- norm_type: float = 2.0,
18
- ) -> float:
19
- """
20
- Returns the norm of the gradients of the given parameters.
21
-
22
- Args:
23
- parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
24
- single Tensor that will have gradients normalized
25
- norm_type (float): type of the used p-norm.
26
-
27
- Returns:
28
- Total norm of the parameter gradients (viewed as a single vector).
29
- """ # noqa: E501
30
-
31
- if isinstance(parameters, Tensor):
32
- parameters = [parameters]
33
-
34
- grads = [p.grad for p in parameters if p.grad is not None]
35
- if len(grads) == 0:
36
- return None
37
-
38
- first_device = grads[0].device
39
- grouped_grads: dict[
40
- tuple[torch.device, torch.dtype], list[list[Tensor]]
41
- ] = _group_tensors_by_device_and_dtype(
42
- [[g.detach() for g in grads]]
43
- ) # type: ignore[assignment]
44
-
45
- norms = []
46
- for (device, _), ([grads], _) in grouped_grads.items():
47
- if _has_foreach_support(grads, device=device):
48
- norms.extend(torch._foreach_norm(grads, norm_type))
49
- else:
50
- norms.extend([torch.norm(g, norm_type) for g in grads])
51
-
52
- return torch.norm(torch.stack([norm.to(first_device) for norm in norms]), norm_type)
53
-
54
-
55
- class GradNormMonitor(Callback):
56
- """
57
- Callback that computes the gradient norm of the model parameters.
58
- """
59
-
60
- def __init__(
61
- self,
62
- norm_type: float = 2.0,
63
- logging_interval: str = "step",
64
- sub_module: Optional[Union[str, list[str]]] = None,
65
- ) -> None:
66
- """
67
- Args:
68
- norm_type (float): type of the used p-norm.
69
- logging_interval (str): "step" or "epoch".
70
- """
71
- super().__init__()
72
-
73
- self.norm_type = norm_type
74
- self.logging_interval = logging_interval
75
- self.sub_module = sub_module
76
-
77
- def on_after_backward(self, trainer: Trainer, model: LightningModule) -> None:
78
- """
79
- Computes the gradient norm of the model parameters and logs it to the logger.
80
-
81
- Args:
82
- trainer (Trainer): The trainer object
83
- model (LightningModule): The current lightningModule
84
- """
85
-
86
- lightning_model = model
87
-
88
- if self.sub_module is None:
89
- return self.log_sub_module_grad_norm(lightning_model, model, "")
90
-
91
- sub_modules = self.sub_module
92
- if isinstance(sub_modules, str):
93
- sub_modules = [sub_modules]
94
-
95
- for sub_module in sub_modules:
96
- self.log_sub_module_grad_norm(
97
- lightning_model, getattr(model, sub_module), f"/{sub_module}"
98
- )
99
-
100
- def log_sub_module_grad_norm(
101
- self, lightning_model: LightningModule, model: nn.Module, path: str
102
- ) -> None:
103
- grad_norm_val = grad_norm(model.parameters(), self.norm_type)
104
- if grad_norm_val is None:
105
- return
106
-
107
- on_step = self.logging_interval == "step"
108
- lightning_model.log(
109
- f"train{path}/grad_norm",
110
- grad_norm_val,
111
- on_step=on_step,
112
- on_epoch=not on_step,
113
- )
 
1
+ from typing import Optional, Union
2
+
3
+ import lightning.pytorch as pl
4
+ import torch
5
+ from lightning import LightningModule, Trainer
6
+ from lightning.pytorch.callbacks import Callback
7
+ from torch import Tensor, nn
8
+ from torch.utils._foreach_utils import (
9
+ _group_tensors_by_device_and_dtype,
10
+ _has_foreach_support,
11
+ )
12
+
13
+
14
+ @torch.no_grad()
15
+ def grad_norm(
16
+ parameters: Union[Tensor, list[Tensor]],
17
+ norm_type: float = 2.0,
18
+ ) -> float:
19
+ """
20
+ Returns the norm of the gradients of the given parameters.
21
+
22
+ Args:
23
+ parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
24
+ single Tensor that will have gradients normalized
25
+ norm_type (float): type of the used p-norm.
26
+
27
+ Returns:
28
+ Total norm of the parameter gradients (viewed as a single vector).
29
+ """ # noqa: E501
30
+
31
+ if isinstance(parameters, Tensor):
32
+ parameters = [parameters]
33
+
34
+ grads = [p.grad for p in parameters if p.grad is not None]
35
+ if len(grads) == 0:
36
+ return None
37
+
38
+ first_device = grads[0].device
39
+ grouped_grads: dict[
40
+ tuple[torch.device, torch.dtype], list[list[Tensor]]
41
+ ] = _group_tensors_by_device_and_dtype(
42
+ [[g.detach() for g in grads]]
43
+ ) # type: ignore[assignment]
44
+
45
+ norms = []
46
+ for (device, _), ([grads], _) in grouped_grads.items():
47
+ if _has_foreach_support(grads, device=device):
48
+ norms.extend(torch._foreach_norm(grads, norm_type))
49
+ else:
50
+ norms.extend([torch.norm(g, norm_type) for g in grads])
51
+
52
+ return torch.norm(torch.stack([norm.to(first_device) for norm in norms]), norm_type)
53
+
54
+
55
+ class GradNormMonitor(Callback):
56
+ """
57
+ Callback that computes the gradient norm of the model parameters.
58
+ """
59
+
60
+ def __init__(
61
+ self,
62
+ norm_type: float = 2.0,
63
+ logging_interval: str = "step",
64
+ sub_module: Optional[Union[str, list[str]]] = None,
65
+ ) -> None:
66
+ """
67
+ Args:
68
+ norm_type (float): type of the used p-norm.
69
+ logging_interval (str): "step" or "epoch".
70
+ """
71
+ super().__init__()
72
+
73
+ self.norm_type = norm_type
74
+ self.logging_interval = logging_interval
75
+ self.sub_module = sub_module
76
+
77
+ def on_after_backward(self, trainer: Trainer, model: LightningModule) -> None:
78
+ """
79
+ Computes the gradient norm of the model parameters and logs it to the logger.
80
+
81
+ Args:
82
+ trainer (Trainer): The trainer object
83
+ model (LightningModule): The current lightningModule
84
+ """
85
+
86
+ lightning_model = model
87
+
88
+ if self.sub_module is None:
89
+ return self.log_sub_module_grad_norm(lightning_model, model, "")
90
+
91
+ sub_modules = self.sub_module
92
+ if isinstance(sub_modules, str):
93
+ sub_modules = [sub_modules]
94
+
95
+ for sub_module in sub_modules:
96
+ self.log_sub_module_grad_norm(
97
+ lightning_model, getattr(model, sub_module), f"/{sub_module}"
98
+ )
99
+
100
+ def log_sub_module_grad_norm(
101
+ self, lightning_model: LightningModule, model: nn.Module, path: str
102
+ ) -> None:
103
+ grad_norm_val = grad_norm(model.parameters(), self.norm_type)
104
+ if grad_norm_val is None:
105
+ return
106
+
107
+ on_step = self.logging_interval == "step"
108
+ lightning_model.log(
109
+ f"train{path}/grad_norm",
110
+ grad_norm_val,
111
+ on_step=on_step,
112
+ on_epoch=not on_step,
113
+ )
fish_speech/configs/base.yaml CHANGED
@@ -1,87 +1,87 @@
1
- # Base configuration for training a model
2
- paths:
3
- run_dir: results/${project}
4
- ckpt_dir: ${paths.run_dir}/checkpoints
5
-
6
- hydra:
7
- run:
8
- dir: ${paths.run_dir}
9
-
10
- # Lightning Trainer
11
- trainer:
12
- _target_: lightning.pytorch.trainer.Trainer
13
-
14
- default_root_dir: ${paths.run_dir}
15
- accelerator: gpu
16
- num_nodes: 1
17
- devices: auto
18
- strategy:
19
- _target_: lightning.pytorch.strategies.DDPStrategy
20
- process_group_backend: nccl # This should be override when training on windows
21
-
22
- precision: bf16-mixed
23
-
24
- # disable validation by epoch end
25
- check_val_every_n_epoch: null
26
- val_check_interval: 5000
27
- max_steps: 100_000
28
-
29
- # Use torch.backends.cudnn.benchmark to speed up training
30
- benchmark: true
31
-
32
- # Callbacks
33
- callbacks:
34
- model_checkpoint:
35
- _target_: lightning.pytorch.callbacks.ModelCheckpoint
36
- dirpath: ${paths.ckpt_dir}
37
- filename: "step_{step:09d}"
38
- save_last: false # additionally always save an exact copy of the last checkpoint to a file last.ckpt
39
- save_top_k: 5 # save 5 latest checkpoints
40
- monitor: step # use step to monitor checkpoints
41
- mode: max # save the latest checkpoint with the highest global_step
42
- every_n_epochs: null # don't save checkpoints by epoch end
43
- every_n_train_steps: 5000 # save checkpoints every 5000 steps
44
- auto_insert_metric_name: false
45
-
46
- model_summary:
47
- _target_: lightning.pytorch.callbacks.ModelSummary
48
- max_depth: 2 # the maximum depth of layer nesting that the summary will include
49
-
50
- learning_rate_monitor:
51
- _target_: lightning.pytorch.callbacks.LearningRateMonitor
52
- logging_interval: step
53
- log_momentum: false
54
-
55
- grad_norm_monitor:
56
- _target_: fish_speech.callbacks.GradNormMonitor
57
- norm_type: 2
58
- logging_interval: step
59
-
60
- # Logger
61
- logger:
62
- tensorboard:
63
- _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger
64
- save_dir: "${paths.run_dir}/tensorboard/"
65
- name: null
66
- log_graph: false
67
- default_hp_metric: true
68
- prefix: ""
69
-
70
- # wandb:
71
- # _target_: lightning.pytorch.loggers.wandb.WandbLogger
72
- # # name: "" # name of the run (normally generated by wandb)
73
- # save_dir: "${paths.run_dir}"
74
- # offline: False
75
- # id: null # pass correct id to resume experiment!
76
- # anonymous: null # enable anonymous logging
77
- # project: "fish-speech"
78
- # log_model: False # upload lightning ckpts
79
- # prefix: "" # a string to put at the beginning of metric keys
80
- # # entity: "" # set to name of your wandb team
81
- # group: ""
82
- # tags: ["vq", "hq", "finetune"]
83
- # job_type: ""
84
-
85
- # Loop
86
- train: true
87
- test: false
 
1
+ # Base configuration for training a model
2
+ paths:
3
+ run_dir: results/${project}
4
+ ckpt_dir: ${paths.run_dir}/checkpoints
5
+
6
+ hydra:
7
+ run:
8
+ dir: ${paths.run_dir}
9
+
10
+ # Lightning Trainer
11
+ trainer:
12
+ _target_: lightning.pytorch.trainer.Trainer
13
+
14
+ default_root_dir: ${paths.run_dir}
15
+ accelerator: gpu
16
+ num_nodes: 1
17
+ devices: auto
18
+ strategy:
19
+ _target_: lightning.pytorch.strategies.DDPStrategy
20
+ process_group_backend: nccl # This should be override when training on windows
21
+
22
+ precision: bf16-mixed
23
+
24
+ # disable validation by epoch end
25
+ check_val_every_n_epoch: null
26
+ val_check_interval: 5000
27
+ max_steps: 100_000
28
+
29
+ # Use torch.backends.cudnn.benchmark to speed up training
30
+ benchmark: true
31
+
32
+ # Callbacks
33
+ callbacks:
34
+ model_checkpoint:
35
+ _target_: lightning.pytorch.callbacks.ModelCheckpoint
36
+ dirpath: ${paths.ckpt_dir}
37
+ filename: "step_{step:09d}"
38
+ save_last: false # additionally always save an exact copy of the last checkpoint to a file last.ckpt
39
+ save_top_k: 5 # save 5 latest checkpoints
40
+ monitor: step # use step to monitor checkpoints
41
+ mode: max # save the latest checkpoint with the highest global_step
42
+ every_n_epochs: null # don't save checkpoints by epoch end
43
+ every_n_train_steps: 5000 # save checkpoints every 5000 steps
44
+ auto_insert_metric_name: false
45
+
46
+ model_summary:
47
+ _target_: lightning.pytorch.callbacks.ModelSummary
48
+ max_depth: 2 # the maximum depth of layer nesting that the summary will include
49
+
50
+ learning_rate_monitor:
51
+ _target_: lightning.pytorch.callbacks.LearningRateMonitor
52
+ logging_interval: step
53
+ log_momentum: false
54
+
55
+ grad_norm_monitor:
56
+ _target_: fish_speech.callbacks.GradNormMonitor
57
+ norm_type: 2
58
+ logging_interval: step
59
+
60
+ # Logger
61
+ logger:
62
+ tensorboard:
63
+ _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger
64
+ save_dir: "${paths.run_dir}/tensorboard/"
65
+ name: null
66
+ log_graph: false
67
+ default_hp_metric: true
68
+ prefix: ""
69
+
70
+ # wandb:
71
+ # _target_: lightning.pytorch.loggers.wandb.WandbLogger
72
+ # # name: "" # name of the run (normally generated by wandb)
73
+ # save_dir: "${paths.run_dir}"
74
+ # offline: False
75
+ # id: null # pass correct id to resume experiment!
76
+ # anonymous: null # enable anonymous logging
77
+ # project: "fish-speech"
78
+ # log_model: False # upload lightning ckpts
79
+ # prefix: "" # a string to put at the beginning of metric keys
80
+ # # entity: "" # set to name of your wandb team
81
+ # group: ""
82
+ # tags: ["vq", "hq", "finetune"]
83
+ # job_type: ""
84
+
85
+ # Loop
86
+ train: true
87
+ test: false
fish_speech/configs/firefly_gan_vq.yaml CHANGED
@@ -1,33 +1,33 @@
1
- _target_: fish_speech.models.vqgan.modules.firefly.FireflyArchitecture
2
- spec_transform:
3
- _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
4
- sample_rate: 44100
5
- n_mels: 160
6
- n_fft: 2048
7
- hop_length: 512
8
- win_length: 2048
9
- backbone:
10
- _target_: fish_speech.models.vqgan.modules.firefly.ConvNeXtEncoder
11
- input_channels: 160
12
- depths: [3, 3, 9, 3]
13
- dims: [128, 256, 384, 512]
14
- drop_path_rate: 0.2
15
- kernel_size: 7
16
- head:
17
- _target_: fish_speech.models.vqgan.modules.firefly.HiFiGANGenerator
18
- hop_length: 512
19
- upsample_rates: [8, 8, 2, 2, 2] # aka. strides
20
- upsample_kernel_sizes: [16, 16, 4, 4, 4]
21
- resblock_kernel_sizes: [3, 7, 11]
22
- resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
23
- num_mels: 512
24
- upsample_initial_channel: 512
25
- pre_conv_kernel_size: 13
26
- post_conv_kernel_size: 13
27
- quantizer:
28
- _target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
29
- input_dim: 512
30
- n_groups: 8
31
- n_codebooks: 1
32
- levels: [8, 5, 5, 5]
33
- downsample_factor: [2, 2]
 
1
+ _target_: fish_speech.models.vqgan.modules.firefly.FireflyArchitecture
2
+ spec_transform:
3
+ _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
4
+ sample_rate: 44100
5
+ n_mels: 160
6
+ n_fft: 2048
7
+ hop_length: 512
8
+ win_length: 2048
9
+ backbone:
10
+ _target_: fish_speech.models.vqgan.modules.firefly.ConvNeXtEncoder
11
+ input_channels: 160
12
+ depths: [3, 3, 9, 3]
13
+ dims: [128, 256, 384, 512]
14
+ drop_path_rate: 0.2
15
+ kernel_size: 7
16
+ head:
17
+ _target_: fish_speech.models.vqgan.modules.firefly.HiFiGANGenerator
18
+ hop_length: 512
19
+ upsample_rates: [8, 8, 2, 2, 2] # aka. strides
20
+ upsample_kernel_sizes: [16, 16, 4, 4, 4]
21
+ resblock_kernel_sizes: [3, 7, 11]
22
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
23
+ num_mels: 512
24
+ upsample_initial_channel: 512
25
+ pre_conv_kernel_size: 13
26
+ post_conv_kernel_size: 13
27
+ quantizer:
28
+ _target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
29
+ input_dim: 512
30
+ n_groups: 8
31
+ n_codebooks: 1
32
+ levels: [8, 5, 5, 5]
33
+ downsample_factor: [2, 2]
fish_speech/configs/lora/r_8_alpha_16.yaml CHANGED
@@ -1,4 +1,4 @@
-_target_: fish_speech.models.text2semantic.lora.LoraConfig
-r: 8
-lora_alpha: 16
-lora_dropout: 0.01
+_target_: fish_speech.models.text2semantic.lora.LoraConfig
+r: 8
+lora_alpha: 16
+lora_dropout: 0.01
fish_speech/configs/model/dual_ar_2_codebook_large.yaml DELETED
@@ -1,9 +0,0 @@
-defaults:
-  - dual_ar_2_codebook_small
-  - _self_
-
-config:
-  n_layer: 30
-  n_fast_layer: 6
-  n_head: 24
-  dim: 1536
fish_speech/configs/model/dual_ar_2_codebook_medium.yaml DELETED
@@ -1,9 +0,0 @@
-defaults:
-  - dual_ar_2_codebook_small
-  - _self_
-
-config:
-  n_layer: 24
-  n_fast_layer: 6
-  n_head: 16
-  dim: 1024
fish_speech/configs/model/dual_ar_2_codebook_small.yaml DELETED
@@ -1,13 +0,0 @@
-_target_: fish_speech.models.text2semantic.llama.DualARTransformer
-config:
-  _target_: fish_speech.models.text2semantic.llama.DualARModelArgs
-  max_seq_len: ${max_length}
-  vocab_size: 264 # pad 262 to 8x
-  n_layer: 12
-  n_fast_layer: 4
-  n_head: 12
-  dim: 768
-  rope_base: 10000
-  norm_eps: 1e-5
-  num_codebooks: 2 # input/output codebook size
-  codebook_size: 1032 # codebook size 1024 + 2 special tokens
fish_speech/configs/model/naive_2_codebook_small.yaml DELETED
@@ -1,12 +0,0 @@
-_target_: fish_speech.models.text2semantic.llama.NaiveTransformer
-config:
-  _target_: fish_speech.models.text2semantic.llama.NaiveModelArgs
-  max_seq_len: ${max_length}
-  vocab_size: 36408
-  n_layer: 12
-  n_head: 12
-  dim: 768
-  rope_base: 10000
-  norm_eps: 1e-5
-  num_codebooks: 2 # input/output codebook size
-  codebook_size: 1032 # codebook size 1024 + 2 special tokens
fish_speech/configs/text2semantic_finetune.yaml CHANGED
@@ -1,83 +1,83 @@
1
- defaults:
2
- - base
3
- - _self_
4
-
5
- project: text2semantic_finetune_dual_ar
6
- max_length: 4096
7
- pretrained_ckpt_path: checkpoints/fish-speech-1.4
8
-
9
- # Lightning Trainer
10
- trainer:
11
- accumulate_grad_batches: 1
12
- gradient_clip_val: 1.0
13
- gradient_clip_algorithm: "norm"
14
- max_steps: 1000
15
- precision: bf16-true
16
- limit_val_batches: 10
17
- val_check_interval: 100
18
-
19
- # Dataset Configuration
20
- tokenizer:
21
- _target_: transformers.AutoTokenizer.from_pretrained
22
- pretrained_model_name_or_path: ${pretrained_ckpt_path}
23
-
24
- # Dataset Configuration
25
- train_dataset:
26
- _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
27
- proto_files:
28
- - data/protos
29
- tokenizer: ${tokenizer}
30
- causal: true
31
- max_length: ${max_length}
32
- use_speaker: false
33
- interactive_prob: 0.7
34
-
35
- val_dataset:
36
- _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
37
- proto_files:
38
- - data/protos
39
- tokenizer: ${tokenizer}
40
- causal: true
41
- max_length: ${max_length}
42
- use_speaker: false
43
- interactive_prob: 0.7
44
-
45
- data:
46
- _target_: fish_speech.datasets.semantic.SemanticDataModule
47
- train_dataset: ${train_dataset}
48
- val_dataset: ${val_dataset}
49
- num_workers: 4
50
- batch_size: 8
51
- tokenizer: ${tokenizer}
52
- max_length: ${max_length}
53
-
54
- # Model Configuration
55
- model:
56
- _target_: fish_speech.models.text2semantic.lit_module.TextToSemantic
57
- model:
58
- _target_: fish_speech.models.text2semantic.llama.BaseTransformer.from_pretrained
59
- path: ${pretrained_ckpt_path}
60
- load_weights: true
61
- max_length: ${max_length}
62
- lora_config: null
63
-
64
- optimizer:
65
- _target_: torch.optim.AdamW
66
- _partial_: true
67
- lr: 1e-4
68
- weight_decay: 0
69
- betas: [0.9, 0.95]
70
- eps: 1e-5
71
-
72
- lr_scheduler:
73
- _target_: torch.optim.lr_scheduler.LambdaLR
74
- _partial_: true
75
- lr_lambda:
76
- _target_: fish_speech.scheduler.get_constant_schedule_with_warmup_lr_lambda
77
- _partial_: true
78
- num_warmup_steps: 10
79
-
80
- # Callbacks
81
- callbacks:
82
- model_checkpoint:
83
- every_n_train_steps: ${trainer.val_check_interval}
 
1
+ defaults:
2
+ - base
3
+ - _self_
4
+
5
+ project: text2semantic_finetune_dual_ar
6
+ max_length: 4096
7
+ pretrained_ckpt_path: checkpoints/fish-speech-1.4
8
+
9
+ # Lightning Trainer
10
+ trainer:
11
+ accumulate_grad_batches: 1
12
+ gradient_clip_val: 1.0
13
+ gradient_clip_algorithm: "norm"
14
+ max_steps: 1000
15
+ precision: bf16-true
16
+ limit_val_batches: 10
17
+ val_check_interval: 100
18
+
19
+ # Dataset Configuration
20
+ tokenizer:
21
+ _target_: transformers.AutoTokenizer.from_pretrained
22
+ pretrained_model_name_or_path: ${pretrained_ckpt_path}
23
+
24
+ # Dataset Configuration
25
+ train_dataset:
26
+ _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
27
+ proto_files:
28
+ - data/protos
29
+ tokenizer: ${tokenizer}
30
+ causal: true
31
+ max_length: ${max_length}
32
+ use_speaker: false
33
+ interactive_prob: 0.7
34
+
35
+ val_dataset:
36
+ _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
37
+ proto_files:
38
+ - data/protos
39
+ tokenizer: ${tokenizer}
40
+ causal: true
41
+ max_length: ${max_length}
42
+ use_speaker: false
43
+ interactive_prob: 0.7
44
+
45
+ data:
46
+ _target_: fish_speech.datasets.semantic.SemanticDataModule
47
+ train_dataset: ${train_dataset}
48
+ val_dataset: ${val_dataset}
49
+ num_workers: 4
50
+ batch_size: 8
51
+ tokenizer: ${tokenizer}
52
+ max_length: ${max_length}
53
+
54
+ # Model Configuration
55
+ model:
56
+ _target_: fish_speech.models.text2semantic.lit_module.TextToSemantic
57
+ model:
58
+ _target_: fish_speech.models.text2semantic.llama.BaseTransformer.from_pretrained
59
+ path: ${pretrained_ckpt_path}
60
+ load_weights: true
61
+ max_length: ${max_length}
62
+ lora_config: null
63
+
64
+ optimizer:
65
+ _target_: torch.optim.AdamW
66
+ _partial_: true
67
+ lr: 1e-4
68
+ weight_decay: 0
69
+ betas: [0.9, 0.95]
70
+ eps: 1e-5
71
+
72
+ lr_scheduler:
73
+ _target_: torch.optim.lr_scheduler.LambdaLR
74
+ _partial_: true
75
+ lr_lambda:
76
+ _target_: fish_speech.scheduler.get_constant_schedule_with_warmup_lr_lambda
77
+ _partial_: true
78
+ num_warmup_steps: 10
79
+
80
+ # Callbacks
81
+ callbacks:
82
+ model_checkpoint:
83
+ every_n_train_steps: ${trainer.val_check_interval}
fish_speech/configs/text2semantic_finetune_lora.yaml DELETED
@@ -1,13 +0,0 @@
-defaults:
-  - text2semantic_finetune
-  - _self_
-
-project: text2semantic_finetune_dual_ar_lora
-
-# Model Configuration
-model:
-  save_lora_only: true
-  lora_config:
-    _target_: fish_speech.models.text2semantic.lit_module.LoraConfig
-    r: 8
-    lora_alpha: 16
fish_speech/configs/text2semantic_pretrain.yaml DELETED
@@ -1,74 +0,0 @@
1
- defaults:
2
- - base
3
- - [email protected]: dual_ar_2_codebook_small
4
- - _self_
5
-
6
- project: text2semantic_pretrain_dual_ar_debug
7
- max_length: 2048
8
-
9
- # Lightning Trainer
10
- trainer:
11
- accumulate_grad_batches: 1
12
- gradient_clip_val: 1.0
13
- gradient_clip_algorithm: 'norm'
14
- max_steps: 1_000_000
15
- precision: bf16-true
16
- limit_val_batches: 10
17
-
18
- # Dataset Configuration
19
- tokenizer:
20
- _target_: transformers.AutoTokenizer.from_pretrained
21
- pretrained_model_name_or_path: fishaudio/fish-speech-1
22
-
23
- # Dataset Configuration
24
- train_dataset:
25
- _target_: fish_speech.datasets.text.AutoAugTextDataset
26
- proto_files:
27
- - data/protos/train
28
- tokenizer: ${tokenizer}
29
- max_length: ${max_length}
30
- num_codebooks: ${model.model.config.num_codebooks}
31
- use_speaker: false
32
- interactive_prob: 0.5
33
-
34
- val_dataset:
35
- _target_: fish_speech.datasets.text.AutoAugTextDataset
36
- proto_files:
37
- - data/protos/test
38
- tokenizer: ${tokenizer}
39
- max_length: ${max_length}
40
- num_codebooks: ${model.model.config.num_codebooks}
41
- use_speaker: false
42
- interactive_prob: 0.5
43
-
44
- data:
45
- _target_: fish_speech.datasets.text.TextDataModule
46
- train_dataset: ${train_dataset}
47
- val_dataset: ${val_dataset}
48
- num_workers: 4
49
- batch_size: 8
50
- tokenizer: ${tokenizer}
51
- max_length: ${max_length}
52
-
53
- # Model Configuration
54
- model:
55
- _target_: fish_speech.models.text2semantic.TextToSemantic
56
- model: {}
57
-
58
- optimizer:
59
- _target_: torch.optim.AdamW
60
- _partial_: true
61
- lr: 3e-4
62
- weight_decay: 0.01
63
- betas: [0.9, 0.95]
64
- eps: 1e-5
65
-
66
- lr_scheduler:
67
- _target_: torch.optim.lr_scheduler.LambdaLR
68
- _partial_: true
69
- lr_lambda:
70
- _target_: fish_speech.scheduler.get_cosine_schedule_with_warmup_lr_lambda
71
- _partial_: true
72
- num_warmup_steps: 2000
73
- num_training_steps: ${trainer.max_steps}
74
- final_lr_ratio: 0.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fish_speech/configs/text2semantic_sft.yaml DELETED
@@ -1,87 +0,0 @@
1
- defaults:
2
- - base
3
- - [email protected]: dual_ar_8_codebook_small
4
- - _self_
5
-
6
- project: text2semantic_sft_medium_dual_ar
7
- max_length: 4096
8
- ckpt_path: results/text2semantic_pretrain_medium_dual_ar/checkpoints/step_000060000.ckpt
9
- resume_weights_only: true
10
-
11
- # Lightning Trainer
12
- trainer:
13
- accumulate_grad_batches: 1
14
- gradient_clip_val: 1.0
15
- gradient_clip_algorithm: 'norm'
16
- max_steps: 10_000
17
- precision: bf16-true
18
- limit_val_batches: 10
19
- val_check_interval: 500
20
-
21
- # Dataset Configuration
22
- tokenizer:
23
- _target_: transformers.AutoTokenizer.from_pretrained
24
- pretrained_model_name_or_path: fishaudio/speech-lm-v1
25
-
26
- # Dataset Configuration
27
- train_dataset:
28
- _target_: fish_speech.datasets.text.AutoAugTextDataset
29
- use_data_server: false
30
- proto_files:
31
- - data/protos/sft/train_Genshin.protos
32
- - data/protos/sft/sft.protos
33
- tokenizer: ${tokenizer}
34
- max_length: ${max_length}
35
- num_codebooks: ${model.model.config.num_codebooks}
36
- use_speaker: false
37
- phones_prob: 0.5
38
- interactive_prob: 0.5
39
-
40
- val_dataset:
41
- _target_: fish_speech.datasets.text.AutoAugTextDataset
42
- use_data_server: false
43
- proto_files:
44
- - data/protos/sft/val_Genshin.protos
45
- tokenizer: ${tokenizer}
46
- max_length: ${max_length}
47
- num_codebooks: ${model.model.config.num_codebooks}
48
- use_speaker: false
49
- phones_prob: 0.5
50
- interactive_prob: 0.5
51
-
52
- data:
53
- _target_: fish_speech.datasets.text.TextDataModule
54
- train_dataset: ${train_dataset}
55
- val_dataset: ${val_dataset}
56
- num_workers: 4
57
- batch_size: 8
58
- tokenizer: ${tokenizer}
59
- max_length: ${max_length}
60
-
61
- # Model Configuration
62
- model:
63
- _target_: fish_speech.models.text2semantic.TextToSemantic
64
- model: {}
65
-
66
- optimizer:
67
- _target_: torch.optim.AdamW
68
- _partial_: true
69
- lr: 4e-5
70
- weight_decay: 0
71
- betas: [0.9, 0.95]
72
- eps: 1e-5
73
-
74
- lr_scheduler:
75
- _target_: torch.optim.lr_scheduler.LambdaLR
76
- _partial_: true
77
- lr_lambda:
78
- _target_: fish_speech.scheduler.get_cosine_schedule_with_warmup_lr_lambda
79
- _partial_: true
80
- num_warmup_steps: 100
81
- num_training_steps: ${trainer.max_steps}
82
- final_lr_ratio: 0
83
-
84
- callbacks:
85
- model_checkpoint:
86
- every_n_train_steps: 1000
87
- save_top_k: 10
fish_speech/configs/vqgan_finetune.yaml DELETED
@@ -1,135 +0,0 @@
1
- defaults:
2
- - base
3
- - _self_
4
-
5
- project: vq-gan-finetune
6
- ckpt_path: checkpoints/vq-gan-group-fsq-2x1024.pth
7
- resume_weights_only: true
8
-
9
- # Lightning Trainer
10
- trainer:
11
- accelerator: gpu
12
- devices: auto
13
- precision: bf16-mixed
14
- max_steps: 100_000
15
- val_check_interval: 5000
16
- strategy: ddp_find_unused_parameters_true
17
-
18
- sample_rate: 44100
19
- hop_length: 512
20
- num_mels: 128
21
- n_fft: 2048
22
- win_length: 2048
23
- freeze_encoder: true
24
-
25
- # Dataset Configuration
26
- train_dataset:
27
- _target_: fish_speech.datasets.vqgan.VQGANDataset
28
- filelist: data/filelist.train.txt
29
- sample_rate: ${sample_rate}
30
- hop_length: ${hop_length}
31
- slice_frames: 512
32
-
33
- val_dataset:
34
- _target_: fish_speech.datasets.vqgan.VQGANDataset
35
- filelist: data/filelist.val.txt
36
- sample_rate: ${sample_rate}
37
- hop_length: ${hop_length}
38
-
39
- data:
40
- _target_: fish_speech.datasets.vqgan.VQGANDataModule
41
- train_dataset: ${train_dataset}
42
- val_dataset: ${val_dataset}
43
- num_workers: 4
44
- batch_size: 16
45
- val_batch_size: 16
46
-
47
- # Model Configuration
48
- model:
49
- _target_: fish_speech.models.vqgan.VQGAN
50
-
51
- sampling_rate: ${sample_rate}
52
- weight_adv: 0.2
53
- weight_vq: 1.0
54
- weight_mel: 1.0
55
- freeze_encoder: false
56
-
57
- encoder:
58
- _target_: fish_speech.models.vqgan.modules.wavenet.WaveNet
59
- input_channels: ${num_mels}
60
- residual_channels: 768
61
- residual_layers: 20
62
- dilation_cycle: 4
63
-
64
- quantizer:
65
- _target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
66
- input_dim: 768
67
- n_codebooks: 1
68
- n_groups: 2
69
- levels: [8, 5, 5, 5]
70
-
71
- decoder:
72
- _target_: fish_speech.models.vqgan.modules.wavenet.WaveNet
73
- output_channels: ${num_mels}
74
- residual_channels: 768
75
- residual_layers: 20
76
- dilation_cycle: 4
77
- condition_channels: 768
78
-
79
- discriminator:
80
- _target_: fish_speech.models.vqgan.modules.discriminator.Discriminator
81
-
82
- vocoder:
83
- _target_: fish_speech.models.vqgan.modules.firefly.FireflyBase
84
- ckpt_path: null # You may download the pretrained vocoder and set the path here
85
-
86
- encode_mel_transform:
87
- _target_: fish_speech.models.vqgan.spectrogram.LogMelSpectrogram
88
- sample_rate: ${sample_rate}
89
- n_fft: ${n_fft}
90
- hop_length: ${hop_length}
91
- win_length: ${win_length}
92
- n_mels: ${num_mels}
93
- f_min: 0.0
94
- f_max: 8000.0
95
-
96
- gt_mel_transform:
97
- _target_: fish_speech.models.vqgan.spectrogram.LogMelSpectrogram
98
- sample_rate: ${sample_rate}
99
- n_fft: ${n_fft}
100
- hop_length: ${hop_length}
101
- win_length: ${win_length}
102
- n_mels: ${num_mels}
103
-
104
- optimizer:
105
- _target_: torch.optim.AdamW
106
- _partial_: true
107
- lr: 4e-5
108
- betas: [0.8, 0.99]
109
- eps: 1e-5
110
- weight_decay: 0.01
111
-
112
- lr_scheduler:
113
- _target_: torch.optim.lr_scheduler.LambdaLR
114
- _partial_: true
115
- lr_lambda:
116
- _target_: fish_speech.scheduler.get_cosine_schedule_with_warmup_lr_lambda
117
- _partial_: true
118
- num_warmup_steps: 100
119
- num_training_steps: ${trainer.max_steps}
120
- final_lr_ratio: 0
121
-
122
- callbacks:
123
- model_summary:
124
- _target_: lightning.pytorch.callbacks.ModelSummary
125
- max_depth: 1
126
-
127
- model_checkpoint:
128
- every_n_train_steps: ${trainer.val_check_interval}
129
-
130
- grad_norm_monitor:
131
- sub_module:
132
- - encoder
133
- - decoder
134
- - quantizer
135
- - discriminator
fish_speech/configs/vqgan_pretrain.yaml DELETED
@@ -1,139 +0,0 @@
1
- defaults:
2
- - base
3
- - _self_
4
-
5
- project: vq-gan-pretrain
6
-
7
- # Lightning Trainer
8
- trainer:
9
- accelerator: gpu
10
- devices: auto
11
- precision: bf16-mixed
12
- max_steps: 1_000_000
13
- val_check_interval: 5000
14
- strategy: ddp_find_unused_parameters_true
15
-
16
- sample_rate: 44100
17
- hop_length: 512
18
- num_mels: 128
19
- n_fft: 2048
20
- win_length: 2048
21
-
22
- # Dataset Configuration
23
- train_dataset:
24
- _target_: torch.utils.data.ConcatDataset
25
- datasets:
26
- - _target_: fish_speech.datasets.vqgan.VQGANDataset
27
- filelist: data/gigaspeech/vq_train_filelist.txt
28
- sample_rate: ${sample_rate}
29
- hop_length: ${hop_length}
30
- slice_frames: 512
31
- - _target_: fish_speech.datasets.vqgan.VQGANDataset
32
- filelist: data/sft/vq_train_filelist.txt
33
- sample_rate: ${sample_rate}
34
- hop_length: ${hop_length}
35
- slice_frames: 512
36
-
37
- val_dataset:
38
- _target_: fish_speech.datasets.vqgan.VQGANDataset
39
- filelist: data/sft/vq_val_filelist.txt
40
- sample_rate: ${sample_rate}
41
- hop_length: ${hop_length}
42
-
43
- data:
44
- _target_: fish_speech.datasets.vqgan.VQGANDataModule
45
- train_dataset: ${train_dataset}
46
- val_dataset: ${val_dataset}
47
- num_workers: 4
48
- batch_size: 32
49
- val_batch_size: 32
50
-
51
- # Model Configuration
52
- model:
53
- _target_: fish_speech.models.vqgan.VQGAN
54
-
55
- sampling_rate: ${sample_rate}
56
- weight_adv: 0.2
57
- weight_vq: 1.0
58
- weight_mel: 1.0
59
- freeze_encoder: false
60
-
61
- encoder:
62
- _target_: fish_speech.models.vqgan.modules.wavenet.WaveNet
63
- input_channels: ${num_mels}
64
- residual_channels: 768
65
- residual_layers: 20
66
- dilation_cycle: 4
67
-
68
- quantizer:
69
- _target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
70
- input_dim: 768
71
- n_codebooks: 1
72
- n_groups: 2
73
- levels: [8, 5, 5, 5]
74
-
75
- decoder:
76
- _target_: fish_speech.models.vqgan.modules.wavenet.WaveNet
77
- output_channels: ${num_mels}
78
- residual_channels: 768
79
- residual_layers: 20
80
- dilation_cycle: 4
81
- condition_channels: 768
82
-
83
- discriminator:
84
- _target_: fish_speech.models.vqgan.modules.discriminator.Discriminator
85
-
86
- vocoder:
87
- _target_: fish_speech.models.vqgan.modules.firefly.FireflyBase
88
- ckpt_path: null # You may download the pretrained vocoder and set the path here
89
-
90
- encode_mel_transform:
91
- _target_: fish_speech.models.vqgan.spectrogram.LogMelSpectrogram
92
- sample_rate: ${sample_rate}
93
- n_fft: ${n_fft}
94
- hop_length: ${hop_length}
95
- win_length: ${win_length}
96
- n_mels: ${num_mels}
97
- f_min: 0.0
98
- f_max: 8000.0
99
-
100
- gt_mel_transform:
101
- _target_: fish_speech.models.vqgan.spectrogram.LogMelSpectrogram
102
- sample_rate: ${sample_rate}
103
- n_fft: ${n_fft}
104
- hop_length: ${hop_length}
105
- win_length: ${win_length}
106
- n_mels: ${num_mels}
107
-
108
- optimizer:
109
- _target_: torch.optim.AdamW
110
- _partial_: true
111
- lr: 1e-4
112
- betas: [0.8, 0.99]
113
- eps: 1e-5
114
- weight_decay: 0.01
115
-
116
- lr_scheduler:
117
- _target_: torch.optim.lr_scheduler.LambdaLR
118
- _partial_: true
119
- lr_lambda:
120
- _target_: fish_speech.scheduler.get_cosine_schedule_with_warmup_lr_lambda
121
- _partial_: true
122
- num_warmup_steps: 100
123
- num_training_steps: ${trainer.max_steps}
124
- final_lr_ratio: 0
125
-
126
- callbacks:
127
- model_summary:
128
- _target_: lightning.pytorch.callbacks.ModelSummary
129
- max_depth: 1
130
-
131
- model_checkpoint:
132
- every_n_train_steps: ${trainer.val_check_interval}
133
-
134
- grad_norm_monitor:
135
- sub_module:
136
- - encoder
137
- - decoder
138
- - quantizer
139
- - discriminator
fish_speech/conversation.py CHANGED
@@ -1,2 +1,267 @@
1
- SEMANTIC_TOKEN = "<|semantic|>"
2
- CODEBOOK_PAD_TOKEN_ID = 0
 
1
+ from dataclasses import dataclass, field
2
+ from typing import Literal
3
+
4
+ import torch
5
+
6
+ from .tokenizer import MODALITY_TOKENS, FishTokenizer
7
+
8
+ CODEBOOK_PAD_TOKEN_ID = 0
9
+
10
+
11
+ @dataclass(kw_only=True)
12
+ class BasePart:
13
+ pass
14
+
15
+
16
+ @dataclass(kw_only=True)
17
+ class VQPart(BasePart):
18
+ codes: torch.Tensor
19
+
20
+
21
+ @dataclass(kw_only=True)
22
+ class TextPart(BasePart):
23
+ text: str
24
+
25
+
26
+ @dataclass(kw_only=True)
27
+ class EncodedMessage:
28
+ tokens: torch.Tensor
29
+ labels: torch.Tensor
30
+ vq_mask_tokens: torch.Tensor | None = None
31
+ vq_mask_labels: torch.Tensor | None = None
32
+ vq_parts: list[torch.Tensor]
33
+ vq_require_losses: torch.Tensor | None = None
34
+
35
+
36
+ @dataclass(kw_only=True)
37
+ class Message:
38
+ role: Literal["system", "user", "assistant"]
39
+ parts: list[VQPart | TextPart] = field(default_factory=list)
40
+ add_im_start: bool = True
41
+ add_im_end: bool = True
42
+ cal_loss: bool = False
43
+ modality: Literal["text", "voice", "interleave"] | None = None
44
+
45
+ # By default, ignore the loss of the auto-generated im_start token
46
+ ignore_im_start_loss: bool = True
47
+
48
+ def encode(
49
+ self: "Message",
50
+ tokenizer: FishTokenizer,
51
+ ) -> EncodedMessage:
52
+ all_tokens = []
53
+ all_labels = []
54
+
55
+ # Multi-modal tokens
56
+ vq_parts = []
57
+ vq_masks = []
58
+
59
+ parts = self.parts.copy()
60
+ if self.add_im_start:
61
+ modality_token = MODALITY_TOKENS[self.modality] if self.modality else ""
62
+ parts.insert(0, TextPart(text=f"<|im_start|>{self.role}\n{modality_token}"))
63
+
64
+ if self.add_im_end:
65
+ parts.append(TextPart(text="<|im_end|>"))
66
+
67
+ for part in parts:
68
+ if isinstance(part, TextPart):
69
+ tokens = torch.tensor(
70
+ tokenizer.encode(part.text),
71
+ dtype=torch.int,
72
+ )
73
+ elif isinstance(part, VQPart):
74
+ curr_codes = part.codes.clone()
75
+ tokens = torch.tensor(
76
+ [
77
+ tokenizer.semantic_id_to_token_id[i.item()]
78
+ for i in curr_codes[0].int()
79
+ ],
80
+ dtype=torch.int,
81
+ )
82
+ vq_parts.append(curr_codes)
83
+ else:
84
+ raise ValueError(f"Unsupported part type: {type(part)}")
85
+
86
+ all_tokens.append(tokens)
87
+ if isinstance(part, VQPart):
88
+ vq_masks.append(torch.ones_like(tokens, dtype=torch.bool))
89
+ else:
90
+ vq_masks.append(torch.zeros_like(tokens, dtype=torch.bool))
91
+
92
+ if self.cal_loss:
93
+ all_labels.append(tokens.clone())
94
+ else:
95
+ all_labels.append(torch.full_like(tokens, -100))
96
+
97
+ tokens = torch.cat(all_tokens, dim=0)
98
+ labels = torch.cat(all_labels, dim=0)
99
+ vq_masks = torch.cat(vq_masks, dim=0)
100
+
101
+ assert tokens.shape == labels.shape == vq_masks.shape
102
+
103
+ if self.ignore_im_start_loss and self.add_im_start:
104
+ labels[: len(all_tokens[0])] = -100
105
+
106
+ return EncodedMessage(
107
+ tokens=tokens,
108
+ labels=labels,
109
+ vq_parts=vq_parts,
110
+ vq_mask_tokens=vq_masks,
111
+ vq_mask_labels=vq_masks,
112
+ )
113
+
114
+
115
+ @dataclass
116
+ class Conversation:
117
+ messages: list[Message]
118
+
119
+ def __init__(self: "Conversation", messages: list[Message] | None = None):
120
+ self.messages = messages or []
121
+
122
+ def encode(
123
+ self: "Conversation",
124
+ tokenizer: FishTokenizer,
125
+ add_shift: bool = True,
126
+ ignore_loss_tokens: list[str] = [],
127
+ ) -> EncodedMessage:
128
+ # Build the input_ids and labels
129
+ tokens = []
130
+ labels = []
131
+ vq_parts = []
132
+ vq_mask_tokens = []
133
+ vq_mask_labels = []
134
+ vq_require_losses = []
135
+ ignore_loss_token_ids = [tokenizer.get_token_id(i) for i in ignore_loss_tokens]
136
+
137
+ for message in self.messages:
138
+ encoded = message.encode(
139
+ tokenizer,
140
+ )
141
+ tokens.append(encoded.tokens)
142
+ labels.append(encoded.labels)
143
+ vq_parts.extend(encoded.vq_parts)
144
+ vq_mask_tokens.append(encoded.vq_mask_tokens)
145
+ vq_mask_labels.append(encoded.vq_mask_labels)
146
+ vq_require_losses.extend([message.cal_loss] * len(encoded.vq_parts))
147
+
148
+ tokens = torch.cat(tokens, dim=0)
149
+ labels = torch.cat(labels, dim=0)
150
+ vq_mask_tokens = torch.cat(vq_mask_tokens, dim=0)
151
+ vq_mask_labels = torch.cat(vq_mask_labels, dim=0)
152
+ vq_require_losses = torch.tensor(vq_require_losses, dtype=torch.bool)
153
+
154
+ if add_shift:
155
+ tokens = tokens[:-1]
156
+ labels = labels[1:]
157
+ vq_mask_tokens = vq_mask_tokens[:-1]
158
+ vq_mask_labels = vq_mask_labels[1:]
159
+
160
+ for i in ignore_loss_token_ids:
161
+ assert i != -100 and i is not None
162
+ labels[labels == i] = -100
163
+
164
+ assert tokens.dtype in [
165
+ torch.int,
166
+ torch.long,
167
+ ], f"Invalid dtype: {tokens.dtype}, conv: {conversation}"
168
+
169
+ return EncodedMessage(
170
+ tokens=tokens,
171
+ labels=labels,
172
+ vq_parts=vq_parts,
173
+ vq_mask_tokens=vq_mask_tokens,
174
+ vq_mask_labels=vq_mask_labels,
175
+ vq_require_losses=vq_require_losses,
176
+ )
177
+
178
+ def encode_for_inference(
179
+ self: "Conversation",
180
+ tokenizer: FishTokenizer,
181
+ num_codebooks: int,
182
+ ) -> EncodedMessage:
183
+ # self.visualize(tokenizer)
184
+
185
+ encoded = self.encode(tokenizer, add_shift=False)
186
+ tokens = encoded.tokens
187
+ values = torch.zeros((num_codebooks + 1, len(tokens)), dtype=torch.int)
188
+ values[0] = tokens
189
+
190
+ if encoded.vq_parts is None or len(encoded.vq_parts) == 0:
191
+ return values
192
+
193
+ vq_parts = encoded.vq_parts
194
+ vq_parts = [part.to(values.device) for part in vq_parts]
195
+ vq_parts = torch.cat(vq_parts, dim=1)
196
+ values[0, encoded.vq_mask_tokens] = vq_parts[0] + tokenizer.semantic_begin_id
197
+ values[1:, encoded.vq_mask_tokens] = vq_parts
198
+
199
+ return values
200
+
201
+ def visualize(
202
+ self: "Conversation",
203
+ tokenizer: FishTokenizer,
204
+ ignore_loss_tokens: list[str] = [],
205
+ ):
206
+ encoded = self.encode(
207
+ tokenizer, add_shift=False, ignore_loss_tokens=ignore_loss_tokens
208
+ )
209
+
210
+ # Colors for alternating tokens
211
+ colors = {
212
+ "blue": "\033[94m", # Light blue
213
+ "cyan": "\033[96m", # Cyan
214
+ "green": "\033[92m", # Light green
215
+ "dark_green": "\033[32m", # Dark green
216
+ }
217
+ blue_idx = 0
218
+ green_idx = 0
219
+
220
+ def print_in_blue(x):
221
+ nonlocal blue_idx
222
+ color = colors["blue"] if blue_idx % 2 == 0 else colors["cyan"]
223
+ print(f"{color}{x}\033[0m", end="")
224
+ blue_idx += 1
225
+
226
+ def print_in_green(x):
227
+ nonlocal green_idx
228
+ color = colors["green"] if green_idx % 2 == 0 else colors["dark_green"]
229
+ print(f"{color}{x}\033[0m", end="")
230
+ green_idx += 1
231
+
232
+ for tok, lab in zip(encoded.tokens, encoded.labels):
233
+ val = tokenizer.decode([tok])
234
+
235
+ if lab == -100:
236
+ print_in_green(val)
237
+ else:
238
+ print_in_blue(val)
239
+
240
+ print()
241
+
242
+ def append(self: "Conversation", message: Message):
243
+ self.messages.append(message)
244
+
245
+
246
+ if __name__ == "__main__":
247
+ message0 = Message(
248
+ role="user",
249
+ parts=[
250
+ TextPart(text="Hello, how are you?"),
251
+ VQPart(codes=torch.zeros((4, 10))),
252
+ ],
253
+ cal_loss=False,
254
+ )
255
+
256
+ message1 = Message(
257
+ role="assistant",
258
+ parts=[TextPart(text="I'm fine, thank you.")],
259
+ cal_loss=True,
260
+ )
261
+ conversation = Conversation([message0, message1])
262
+ tokenizer = FishTokenizer.from_pretrained("checkpoints/Qwen2-1.5B-Instruct")
263
+ conversation.visualize(tokenizer)
264
+
265
+ encoded = conversation.encode(tokenizer)
266
+ print(encoded)
267
+ print(tokenizer.batch_decode(encoded.tokens))
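The new conversation module above replaces the old two-constant stub with a full prompt-building API. As a rough illustration of how the inference path fits together, the hedged sketch below builds a one-turn conversation and packs it with encode_for_inference; the checkpoint path and the num_codebooks value are placeholders for illustration, not values taken from this commit.

from fish_speech.conversation import Conversation, Message, TextPart
from fish_speech.tokenizer import FishTokenizer

# Placeholder path: point this at whichever tokenizer checkpoint you actually use.
tokenizer = FishTokenizer.from_pretrained("checkpoints/fish-speech-1.5")

conv = Conversation()
conv.append(Message(role="user", parts=[TextPart(text="Read this sentence aloud.")]))
# Open the assistant turn but leave it unterminated so the model can continue it.
conv.append(Message(role="assistant", parts=[], add_im_end=False))

# Returns a (num_codebooks + 1, seq_len) int tensor: row 0 carries text/semantic
# token ids, rows 1.. carry codebook values for any VQPart prompts.
prompt = conv.encode_for_inference(tokenizer, num_codebooks=8)  # 8 is illustrative
print(prompt.shape)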
fish_speech/datasets/concat_repeat.py CHANGED
@@ -1,53 +1,53 @@
1
- import bisect
2
- import random
3
- from typing import Iterable
4
-
5
- from torch.utils.data import Dataset, IterableDataset
6
-
7
-
8
- class ConcatRepeatDataset(Dataset):
9
- datasets: list[Dataset]
10
- cumulative_sizes: list[int]
11
- repeats: list[int]
12
-
13
- @staticmethod
14
- def cumsum(sequence, repeats):
15
- r, s = [], 0
16
- for dataset, repeat in zip(sequence, repeats):
17
- l = len(dataset) * repeat
18
- r.append(l + s)
19
- s += l
20
- return r
21
-
22
- def __init__(self, datasets: Iterable[Dataset], repeats: list[int]):
23
- super().__init__()
24
-
25
- self.datasets = list(datasets)
26
- self.repeats = repeats
27
-
28
- assert len(self.datasets) > 0, "datasets should not be an empty iterable"
29
- assert len(self.datasets) == len(
30
- repeats
31
- ), "datasets and repeats should have the same length"
32
-
33
- for d in self.datasets:
34
- assert not isinstance(
35
- d, IterableDataset
36
- ), "ConcatRepeatDataset does not support IterableDataset"
37
-
38
- self.cumulative_sizes = self.cumsum(self.datasets, self.repeats)
39
-
40
- def __len__(self):
41
- return self.cumulative_sizes[-1]
42
-
43
- def __getitem__(self, idx):
44
- dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
45
-
46
- if dataset_idx == 0:
47
- sample_idx = idx
48
- else:
49
- sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
50
-
51
- dataset = self.datasets[dataset_idx]
52
-
53
- return dataset[sample_idx % len(dataset)]
 
1
+ import bisect
2
+ import random
3
+ from typing import Iterable
4
+
5
+ from torch.utils.data import Dataset, IterableDataset
6
+
7
+
8
+ class ConcatRepeatDataset(Dataset):
9
+ datasets: list[Dataset]
10
+ cumulative_sizes: list[int]
11
+ repeats: list[int]
12
+
13
+ @staticmethod
14
+ def cumsum(sequence, repeats):
15
+ r, s = [], 0
16
+ for dataset, repeat in zip(sequence, repeats):
17
+ l = len(dataset) * repeat
18
+ r.append(l + s)
19
+ s += l
20
+ return r
21
+
22
+ def __init__(self, datasets: Iterable[Dataset], repeats: list[int]):
23
+ super().__init__()
24
+
25
+ self.datasets = list(datasets)
26
+ self.repeats = repeats
27
+
28
+ assert len(self.datasets) > 0, "datasets should not be an empty iterable"
29
+ assert len(self.datasets) == len(
30
+ repeats
31
+ ), "datasets and repeats should have the same length"
32
+
33
+ for d in self.datasets:
34
+ assert not isinstance(
35
+ d, IterableDataset
36
+ ), "ConcatRepeatDataset does not support IterableDataset"
37
+
38
+ self.cumulative_sizes = self.cumsum(self.datasets, self.repeats)
39
+
40
+ def __len__(self):
41
+ return self.cumulative_sizes[-1]
42
+
43
+ def __getitem__(self, idx):
44
+ dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
45
+
46
+ if dataset_idx == 0:
47
+ sample_idx = idx
48
+ else:
49
+ sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
50
+
51
+ dataset = self.datasets[dataset_idx]
52
+
53
+ return dataset[sample_idx % len(dataset)]
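The re-added ConcatRepeatDataset above is unchanged in content, but its repeat semantics are easy to misread, so here is a small usage sketch (the toy tensor datasets and repeat factors are chosen purely for illustration) showing how the virtual length and index mapping behave.

import torch
from torch.utils.data import TensorDataset

from fish_speech.datasets.concat_repeat import ConcatRepeatDataset

small = TensorDataset(torch.arange(10))    # 10 samples
large = TensorDataset(torch.arange(100))   # 100 samples

# `repeats` multiplies each dataset's virtual length, so the smaller set is
# walked through twice rather than oversampled at random.
ds = ConcatRepeatDataset([small, large], repeats=[2, 1])
assert len(ds) == 2 * 10 + 100             # 120 virtual samples
print(ds[5])    # -> small[5]
print(ds[15])   # -> small[15 % 10], i.e. small[5] again
print(ds[25])   # -> large[5]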
fish_speech/datasets/protos/text-data.proto CHANGED
@@ -1,24 +1,24 @@
1
- syntax = "proto3";
2
-
3
- package text_data;
4
-
5
- message Semantics {
6
- repeated uint32 values = 1;
7
- }
8
-
9
- message Sentence {
10
- repeated string texts = 1;
11
- repeated Semantics semantics = 3;
12
- }
13
-
14
- message TextData {
15
- string source = 1;
16
- string name = 2;
17
- repeated Sentence sentences = 4;
18
- }
19
-
20
- message SampledData {
21
- string source = 1;
22
- string name = 2;
23
- repeated Sentence samples = 3;
24
- }
 
1
+ syntax = "proto3";
2
+
3
+ package text_data;
4
+
5
+ message Semantics {
6
+ repeated uint32 values = 1;
7
+ }
8
+
9
+ message Sentence {
10
+ repeated string texts = 1;
11
+ repeated Semantics semantics = 3;
12
+ }
13
+
14
+ message TextData {
15
+ string source = 1;
16
+ string name = 2;
17
+ repeated Sentence sentences = 4;
18
+ }
19
+
20
+ message SampledData {
21
+ string source = 1;
22
+ string name = 2;
23
+ repeated Sentence samples = 3;
24
+ }
fish_speech/datasets/protos/text_data_pb2.py CHANGED
@@ -1,33 +1,33 @@
1
- # -*- coding: utf-8 -*-
2
- # Generated by the protocol buffer compiler. DO NOT EDIT!
3
- # source: text-data.proto
4
- # Protobuf Python Version: 4.25.1
5
- """Generated protocol buffer code."""
6
- from google.protobuf import descriptor as _descriptor
7
- from google.protobuf import descriptor_pool as _descriptor_pool
8
- from google.protobuf import symbol_database as _symbol_database
9
- from google.protobuf.internal import builder as _builder
10
-
11
- # @@protoc_insertion_point(imports)
12
-
13
- _sym_db = _symbol_database.Default()
14
-
15
-
16
- DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
17
- b'\n\x0ftext-data.proto\x12\ttext_data"\x1b\n\tSemantics\x12\x0e\n\x06values\x18\x01 \x03(\r"B\n\x08Sentence\x12\r\n\x05texts\x18\x01 \x03(\t\x12\'\n\tsemantics\x18\x03 \x03(\x0b\x32\x14.text_data.Semantics"P\n\x08TextData\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12&\n\tsentences\x18\x04 \x03(\x0b\x32\x13.text_data.Sentence"Q\n\x0bSampledData\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12$\n\x07samples\x18\x03 \x03(\x0b\x32\x13.text_data.Sentenceb\x06proto3'
18
- )
19
-
20
- _globals = globals()
21
- _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
22
- _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "text_data_pb2", _globals)
23
- if _descriptor._USE_C_DESCRIPTORS == False:
24
- DESCRIPTOR._options = None
25
- _globals["_SEMANTICS"]._serialized_start = 30
26
- _globals["_SEMANTICS"]._serialized_end = 57
27
- _globals["_SENTENCE"]._serialized_start = 59
28
- _globals["_SENTENCE"]._serialized_end = 125
29
- _globals["_TEXTDATA"]._serialized_start = 127
30
- _globals["_TEXTDATA"]._serialized_end = 207
31
- _globals["_SAMPLEDDATA"]._serialized_start = 209
32
- _globals["_SAMPLEDDATA"]._serialized_end = 290
33
- # @@protoc_insertion_point(module_scope)
 
1
+ # -*- coding: utf-8 -*-
2
+ # Generated by the protocol buffer compiler. DO NOT EDIT!
3
+ # source: text-data.proto
4
+ # Protobuf Python Version: 4.25.1
5
+ """Generated protocol buffer code."""
6
+ from google.protobuf import descriptor as _descriptor
7
+ from google.protobuf import descriptor_pool as _descriptor_pool
8
+ from google.protobuf import symbol_database as _symbol_database
9
+ from google.protobuf.internal import builder as _builder
10
+
11
+ # @@protoc_insertion_point(imports)
12
+
13
+ _sym_db = _symbol_database.Default()
14
+
15
+
16
+ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
17
+ b'\n\x0ftext-data.proto\x12\ttext_data"\x1b\n\tSemantics\x12\x0e\n\x06values\x18\x01 \x03(\r"B\n\x08Sentence\x12\r\n\x05texts\x18\x01 \x03(\t\x12\'\n\tsemantics\x18\x03 \x03(\x0b\x32\x14.text_data.Semantics"P\n\x08TextData\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12&\n\tsentences\x18\x04 \x03(\x0b\x32\x13.text_data.Sentence"Q\n\x0bSampledData\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12$\n\x07samples\x18\x03 \x03(\x0b\x32\x13.text_data.Sentenceb\x06proto3'
18
+ )
19
+
20
+ _globals = globals()
21
+ _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
22
+ _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "text_data_pb2", _globals)
23
+ if _descriptor._USE_C_DESCRIPTORS == False:
24
+ DESCRIPTOR._options = None
25
+ _globals["_SEMANTICS"]._serialized_start = 30
26
+ _globals["_SEMANTICS"]._serialized_end = 57
27
+ _globals["_SENTENCE"]._serialized_start = 59
28
+ _globals["_SENTENCE"]._serialized_end = 125
29
+ _globals["_TEXTDATA"]._serialized_start = 127
30
+ _globals["_TEXTDATA"]._serialized_end = 207
31
+ _globals["_SAMPLEDDATA"]._serialized_start = 209
32
+ _globals["_SAMPLEDDATA"]._serialized_end = 290
33
+ # @@protoc_insertion_point(module_scope)
fish_speech/datasets/protos/text_data_stream.py CHANGED
@@ -1,36 +1,36 @@
1
- import struct
2
-
3
- from .text_data_pb2 import TextData
4
-
5
-
6
- def read_pb_stream(f):
7
- while True:
8
- buf = f.read(4)
9
- if len(buf) == 0:
10
- break
11
- size = struct.unpack("I", buf)[0]
12
- buf = f.read(size)
13
- text_data = TextData()
14
- text_data.ParseFromString(buf)
15
- yield text_data
16
-
17
-
18
- def write_pb_stream(f, text_data):
19
- buf = text_data.SerializeToString()
20
- f.write(struct.pack("I", len(buf)))
21
- f.write(buf)
22
-
23
-
24
- def pack_pb_stream(text_data):
25
- buf = text_data.SerializeToString()
26
- return struct.pack("I", len(buf)) + buf
27
-
28
-
29
- def split_pb_stream(f):
30
- while True:
31
- head = f.read(4)
32
- if len(head) == 0:
33
- break
34
- size = struct.unpack("I", head)[0]
35
- buf = f.read(size)
36
- yield head + buf
 
1
+ import struct
2
+
3
+ from .text_data_pb2 import TextData
4
+
5
+
6
+ def read_pb_stream(f):
7
+ while True:
8
+ buf = f.read(4)
9
+ if len(buf) == 0:
10
+ break
11
+ size = struct.unpack("I", buf)[0]
12
+ buf = f.read(size)
13
+ text_data = TextData()
14
+ text_data.ParseFromString(buf)
15
+ yield text_data
16
+
17
+
18
+ def write_pb_stream(f, text_data):
19
+ buf = text_data.SerializeToString()
20
+ f.write(struct.pack("I", len(buf)))
21
+ f.write(buf)
22
+
23
+
24
+ def pack_pb_stream(text_data):
25
+ buf = text_data.SerializeToString()
26
+ return struct.pack("I", len(buf)) + buf
27
+
28
+
29
+ def split_pb_stream(f):
30
+ while True:
31
+ head = f.read(4)
32
+ if len(head) == 0:
33
+ break
34
+ size = struct.unpack("I", head)[0]
35
+ buf = f.read(size)
36
+ yield head + buf
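The stream helpers above frame each protobuf message with a 4-byte length prefix. A minimal round-trip sketch follows, assuming the repo root is on PYTHONPATH so the generated text_data_pb2 module is importable.

import io

from fish_speech.datasets.protos.text_data_pb2 import Sentence, TextData
from fish_speech.datasets.protos.text_data_stream import (
    read_pb_stream,
    write_pb_stream,
)

buf = io.BytesIO()
td = TextData(source="demo", name="speaker_0")
td.sentences.append(Sentence(texts=["hello world"]))

write_pb_stream(buf, td)              # writes <uint32 length><serialized TextData>
buf.seek(0)

for record in read_pb_stream(buf):    # yields TextData messages until EOF
    print(record.name, record.sentences[0].texts[0])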
fish_speech/datasets/semantic.py CHANGED
@@ -1,496 +1,496 @@
1
- import random
2
- from dataclasses import dataclass
3
- from itertools import chain
4
- from pathlib import Path
5
- from random import Random
6
- from typing import Optional, Union
7
-
8
- import numpy as np
9
- import pyarrow.parquet as pq
10
- import torch
11
- import torch.nn.functional as F
12
- from datasets.download.streaming_download_manager import xopen
13
- from huggingface_hub import HfApi
14
- from lightning import LightningDataModule
15
- from torch.distributed import get_rank, get_world_size, is_initialized
16
- from torch.utils.data import DataLoader, IterableDataset, get_worker_info
17
- from transformers import AutoTokenizer
18
-
19
- from fish_speech.conversation import CODEBOOK_PAD_TOKEN_ID
20
- from fish_speech.datasets.protos.text_data_pb2 import SampledData
21
- from fish_speech.datasets.protos.text_data_stream import read_pb_stream
22
- from fish_speech.text.clean import clean_text
23
- from fish_speech.utils import RankedLogger
24
- from fish_speech.utils.braceexpand import braceexpand
25
-
26
- log = RankedLogger(__name__, rank_zero_only=True)
27
-
28
-
29
- def split_by_rank_worker(files):
30
- # We need to know the total number of devices
31
- # to split the data properly
32
-
33
- total_devices = 1
34
- if is_initialized():
35
- total_devices = get_world_size()
36
-
37
- worker_info = get_worker_info()
38
- if worker_info is not None:
39
- total_devices *= worker_info.num_workers
40
-
41
- if len(files) < total_devices:
42
- # Repeat the files N times to match the number of devices
43
- files = files * (total_devices // len(files) + 1)
44
-
45
- # DDP
46
- if is_initialized():
47
- files = files[get_rank() :: get_world_size()]
48
-
49
- # Split by worker
50
- if worker_info is not None:
51
- files = files[worker_info.id :: worker_info.num_workers]
52
-
53
- return files
54
-
55
-
56
- class AutoTextSemanticInstructionDataset(IterableDataset):
57
- """
58
- Auto Augment Dataset by Speaker
59
-
60
- 1. Random concatenate multiple sentences from the same speaker to form a longer sentence
61
- 2. Automatically normalize the text
62
-
63
- For interactive mode, we use the following format (multiple sequences):
64
- <s> [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST] </s>
65
-
66
- For non-interactive mode, we use the following format (one long sequence):
67
- <s> [INST] text [/INST] ... </s>
68
- """
69
-
70
- def __init__(
71
- self,
72
- proto_files: list[str],
73
- seed: int = 42,
74
- interactive_prob: float = 0.5,
75
- max_length: int = 1024,
76
- tokenizer: AutoTokenizer = None,
77
- use_speaker: bool | float = True,
78
- causal: bool = True,
79
- num_codebooks: Optional[int] = None,
80
- skip_text_prob: float = 0.0,
81
- ):
82
- """
83
- Args:
84
- proto_files: proto buf files if using local data
85
- seed: random seed
86
- interactive_prob: probability to use interactive mode
87
- max_length: max length of the text
88
- tokenizer: tokenizer
89
- use_speaker: include speaker information in the prompt
90
- causal: use causal sampling when using local data, disable will lead to random sampling
91
- num_codebooks: number of codebooks, if None, it will be automatically detected
92
- skip_text_prob: probability to skip the text (audio only), this only applies to interactive mode
93
- """
94
-
95
- super().__init__()
96
-
97
- assert 0 <= interactive_prob <= 1, "interactive_prob must be in [0, 1]"
98
-
99
- self.seed = seed
100
- self.max_length = max_length
101
- self.tokenizer = tokenizer
102
- self.interactive_prob = interactive_prob
103
- self.use_speaker = use_speaker
104
- self.proto_files = proto_files
105
- self.causal = causal
106
- self.num_codebooks = num_codebooks
107
- self.skip_text_prob = skip_text_prob
108
-
109
- self.semantic_token_id = self.tokenizer.convert_tokens_to_ids("<|semantic|>")
110
- self.groups = None
111
-
112
- def init_mock_data_server(self):
113
- if self.groups is not None:
114
- return
115
-
116
- # Expand the proto files
117
- expanded_proto_files = []
118
- for filename in self.proto_files:
119
- for i in braceexpand(filename):
120
- i = Path(i)
121
- if i.is_file():
122
- expanded_proto_files.append(i)
123
- elif i.is_dir():
124
- expanded_proto_files.extend(i.rglob("*.proto"))
125
- expanded_proto_files.extend(i.rglob("*.protos"))
126
- else:
127
- raise ValueError(f"{i} is not a file or directory")
128
-
129
- expanded_proto_files = sorted(expanded_proto_files)
130
- Random(self.seed).shuffle(expanded_proto_files)
131
-
132
- self.groups = []
133
- shard_proto_files = split_by_rank_worker(expanded_proto_files)
134
- log.info(
135
- f"Reading {len(shard_proto_files)} / {len(expanded_proto_files)} files"
136
- )
137
-
138
- count = 0
139
- for filename in shard_proto_files:
140
- with open(filename, "rb") as f:
141
- for text_data in read_pb_stream(f):
142
- self.groups.append(text_data)
143
- count += 1
144
-
145
- log.info(f"Read total {count} groups of data")
146
-
147
- # Shuffle the lines
148
- Random(self.seed).shuffle(self.groups)
149
- self.group_weights = [len(i.sentences) for i in self.groups]
150
-
151
- def __iter__(self):
152
- while True:
153
- yield self.augment()
154
-
155
- def tokenize_sentence(self, sentence: str):
156
- sentence = clean_text(sentence)
157
- tokens = self.tokenizer.encode(
158
- f"{sentence}",
159
- max_length=10**6,
160
- add_special_tokens=False,
161
- truncation=False,
162
- )
163
- return sentence, len(tokens)
164
-
165
- def sample_data(self):
166
- if self.groups is None:
167
- self.init_mock_data_server()
168
-
169
- # Shuffle unique lines, estimate that each sample is at least 20 tokens
170
- num_samples = self.max_length // 20
171
-
172
- # choice group based on their number of samples
173
- group = random.choices(self.groups, weights=self.group_weights, k=1)[0]
174
-
175
- if self.causal:
176
- # Sample in order
177
- if num_samples >= len(group.sentences):
178
- samples = group.sentences
179
- else:
180
- begin = random.randint(0, len(group.sentences) - num_samples)
181
- samples = group.sentences[begin : begin + num_samples]
182
- else:
183
- samples = random.choices(
184
- group.sentences, k=min(num_samples, len(group.sentences))
185
- )
186
-
187
- return SampledData(
188
- source=group.source,
189
- name=group.name,
190
- samples=samples,
191
- )
192
-
193
- def augment(self):
194
- final_text, final_semantic = [], []
195
- response = self.sample_data()
196
- if len(response.samples) == 0:
197
- # Invalid group
198
- return None
199
-
200
- samples = list(response.samples)
201
- idx = 0
202
- use_interactive = random.random() < self.interactive_prob
203
-
204
- if use_interactive is False:
205
- # Random sample based on speaker using a truncated normal distribution
206
- a = torch.tensor([0], dtype=torch.float32)
207
- torch.nn.init.trunc_normal_(
208
- a,
209
- mean=self.max_length // 2,
210
- std=self.max_length // 4,
211
- a=10,
212
- b=self.max_length,
213
- )
214
- remaining_tokens = a.long().item() - 4
215
- else:
216
- remaining_tokens = self.max_length
217
-
218
- # Use speaker
219
- if isinstance(self.use_speaker, float):
220
- use_speaker = random.random() < self.use_speaker
221
- else:
222
- use_speaker = self.use_speaker
223
-
224
- all_tokens, all_labels = [], []
225
- while remaining_tokens > 0 and len(samples) > 0:
226
- sentence = samples.pop(0)
227
-
228
- text = random.choice(sentence.texts)
229
- text, length = self.tokenize_sentence(text)
230
- remaining_tokens -= length + len(sentence.semantics[0].values)
231
-
232
- if use_interactive is False:
233
- final_text.append(text)
234
- final_semantic.append(sentence.semantics)
235
- else:
236
- # For interactive mode, we only apply speaker for the first sentence
237
- # [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST]
238
- tokens, labels = self.pack_sentences(
239
- sentences=[text],
240
- semantics=[sentence.semantics],
241
- speaker=response.name if use_speaker else None,
242
- skip_text=random.random() < self.skip_text_prob,
243
- )
244
-
245
- all_tokens.append(tokens)
246
- all_labels.append(labels)
247
-
248
- idx += 1
249
-
250
- if use_interactive is False:
251
- tokens, labels = self.pack_sentences(
252
- final_text,
253
- semantics=final_semantic,
254
- speaker=response.name if use_speaker else None,
255
- )
256
- all_tokens.append(tokens)
257
- all_labels.append(labels)
258
-
259
- tokens = torch.cat(all_tokens, dim=1)
260
- labels = torch.cat(all_labels, dim=1)
261
-
262
- # Verify that the length is correct
263
- assert tokens.size(1) == labels.size(1), f"{tokens.size(1)} != {labels.size(1)}"
264
-
265
- data = {"tokens": tokens, "labels": labels}
266
-
267
- return data
268
-
269
- def pack_sentences(
270
- self,
271
- sentences: list[str],
272
- semantics: list,
273
- speaker: Optional[str] = None,
274
- skip_text: bool = False,
275
- ):
276
- if speaker is None:
277
- speaker = "assistant"
278
-
279
- cated_sentences = " ".join(sentences)
280
- if skip_text:
281
- cated_sentences = "<|skip_text|>"
282
-
283
- final_text = "<|im_start|>user\n" + cated_sentences + "<|im_end|>"
284
- final_text = final_text + f"<|im_start|>{speaker}\n"
285
-
286
- encoded = self.tokenizer.encode(
287
- final_text,
288
- add_special_tokens=False,
289
- truncation=False,
290
- max_length=10**6,
291
- )
292
- semantic_length = sum([len(i[0].values) for i in semantics])
293
- prompt_length = len(encoded)
294
- num_codebooks = (
295
- len(semantics[0]) if self.num_codebooks is None else self.num_codebooks
296
- )
297
-
298
- # Pack the tokens and semantics (add <s> and </s> to semantic tokens)
299
- tokens = (
300
- encoded
301
- + [self.semantic_token_id] * semantic_length
302
- + self.tokenizer.convert_tokens_to_ids(["<|im_end|>"])
303
- )
304
-
305
- # Codebook bos/padding: 0, eos: 1
306
- codes = [[CODEBOOK_PAD_TOKEN_ID] * prompt_length for _ in range(num_codebooks)]
307
- for segment in semantics:
308
- for book_idx, book in zip(range(num_codebooks), segment):
309
- for j in book.values:
310
- codes[book_idx].append(int(j) + 1)
311
-
312
- for book in codes:
313
- book.extend([CODEBOOK_PAD_TOKEN_ID] * 1)
314
-
315
- tokens = [tokens] + codes
316
-
317
- tokens = torch.tensor(tokens, dtype=torch.long)
318
- labels = tokens.clone()
319
-
320
- if skip_text:
321
- # If text is not provided, the sentence is used for condition only, all labels are -100
322
- torch.fill_(labels, -100)
323
- return tokens, labels
324
-
325
- # Mask out the <s> tokens for semantic, predict semantic tokens only
326
- # Since we don't mask out the input tokens, the language modeling still works
327
- labels[1:, :prompt_length] = -100
328
-
329
- tokens = tokens[:, :-1]
330
- labels = labels[:, 1:]
331
-
332
- # Verify the padding is correct, and the last token is eos
333
- assert (tokens[1:, :prompt_length] == CODEBOOK_PAD_TOKEN_ID).all()
334
- assert (labels[1:, -1:] == CODEBOOK_PAD_TOKEN_ID).all()
335
-
336
- return tokens, labels
337
-
338
-
339
- @dataclass
340
- class TextDataCollator:
341
- tokenizer: AutoTokenizer
342
- max_length: int = 1024
343
-
344
- def __call__(self, examples):
345
- if "negative_tokens" in examples:
346
- positive_examples = []
347
- negative_examples = []
348
-
349
- for i in examples:
350
- positive_examples.append(
351
- {
352
- "tokens": i["tokens"],
353
- "labels": i["labels"],
354
- }
355
- )
356
- negative_examples.append(
357
- {
358
- "tokens": i["negative_tokens"],
359
- "labels": i["negative_labels"],
360
- }
361
- )
362
-
363
- examples = positive_examples + negative_examples
364
-
365
- return self.batchify(examples)
366
-
367
- def batchify(self, examples, tokens_key="tokens", labels_key="labels"):
368
- tokens, attention_masks, labels = [], [], []
369
-
370
- # Calculate the max length
371
- max_tokens_length = 0
372
- for example in examples:
373
- max_tokens_length = max(max_tokens_length, example[tokens_key].size(1))
374
- max_tokens_length = min(max_tokens_length, self.max_length)
375
-
376
- for example in examples:
377
- _tokens = example[tokens_key][:, :max_tokens_length]
378
- _labels = example[labels_key][:, :max_tokens_length]
379
- _attention_mask = torch.ones((max_tokens_length,), dtype=torch.bool)
380
- tokens_length = _tokens.size(1)
381
- _attention_mask[:tokens_length] = False
382
-
383
- assert tokens_length == _labels.size(
384
- 1
385
- ), f"{tokens_length} != {_labels.size(1)}"
386
-
387
- if tokens_length < max_tokens_length:
388
- _tokens = F.pad(
389
- _tokens,
390
- (0, max_tokens_length - tokens_length),
391
- value=self.tokenizer.eos_token_id,
392
- )
393
- _tokens[1:, tokens_length:] = CODEBOOK_PAD_TOKEN_ID
394
- _labels = F.pad(
395
- _labels, (0, max_tokens_length - _labels.size(1)), value=-100
396
- )
397
-
398
- tokens.append(_tokens)
399
- attention_masks.append(_attention_mask)
400
- labels.append(_labels)
401
-
402
- tokens = torch.stack(tokens, dim=0)
403
- attention_masks = torch.stack(attention_masks, dim=0)
404
- labels = torch.stack(labels, dim=0)
405
-
406
- return {
407
- "inputs": tokens,
408
- "attention_masks": attention_masks,
409
- "labels": labels,
410
- }
411
-
412
-
413
- class InterleaveDataset(IterableDataset):
414
- def __init__(
415
- self,
416
- datasets: list[IterableDataset],
417
- probabilities: list[float],
418
- seed: int = 42,
419
- ):
420
- super().__init__()
421
-
422
- self.datasets = datasets
423
- self.probabilities = probabilities
424
- self.seed = seed
425
-
426
- def __iter__(self):
427
- rng = np.random.default_rng(self.seed)
428
- dataset_iterators = [iter(dataset) for dataset in self.datasets]
429
-
430
- while True:
431
- # Random choice one
432
- dataset_idx = rng.choice(len(self.datasets), p=self.probabilities)
433
- dataset_iterator = dataset_iterators[dataset_idx]
434
-
435
- try:
436
- yield next(dataset_iterator)
437
- except StopIteration:
438
- # Exhausted, create a new iterator
439
- dataset_iterators[dataset_idx] = iter(self.datasets[dataset_idx])
440
- yield next(dataset_iterators[dataset_idx])
441
-
442
-
443
- class SemanticDataModule(LightningDataModule):
444
- def __init__(
445
- self,
446
- train_dataset: Union[AutoTextSemanticInstructionDataset, InterleaveDataset],
447
- val_dataset: Union[AutoTextSemanticInstructionDataset, InterleaveDataset],
448
- batch_size: int = 32,
449
- tokenizer: AutoTokenizer = None,
450
- max_length: int = 1024,
451
- num_workers: int = 4,
452
- ):
453
- super().__init__()
454
-
455
- self.train_dataset = train_dataset
456
- self.val_dataset = val_dataset
457
- self.batch_size = batch_size
458
- self.tokenizer = tokenizer
459
- self.max_length = max_length
460
- self.num_workers = num_workers
461
-
462
- def train_dataloader(self):
463
- return DataLoader(
464
- self.train_dataset,
465
- batch_size=self.batch_size,
466
- collate_fn=TextDataCollator(self.tokenizer, self.max_length),
467
- num_workers=self.num_workers,
468
- persistent_workers=True,
469
- )
470
-
471
- def val_dataloader(self):
472
- return DataLoader(
473
- self.val_dataset,
474
- batch_size=self.batch_size,
475
- collate_fn=TextDataCollator(self.tokenizer, self.max_length),
476
- num_workers=self.num_workers,
477
- persistent_workers=True,
478
- )
479
-
480
-
481
- if __name__ == "__main__":
482
- from tqdm import tqdm
483
-
484
- ds = AutoTextSemanticInstructionDataset(
485
- ["data/protos"],
486
- tokenizer=AutoTokenizer.from_pretrained("fishaudio/fish-speech-1"),
487
- use_speaker=False,
488
- interactive_prob=1.0,
489
- skip_text_prob=0.5,
490
- )
491
-
492
- for i in ds:
493
- print(ds.tokenizer.decode(i["tokens"][0], skip_special_tokens=False))
494
- # i["labels"][0][i["labels"][0] == -100] = 0
495
- # print(ds.tokenizer.decode(i["labels"][0], skip_special_tokens=False))
496
- break
 
1
+ import random
2
+ from dataclasses import dataclass
3
+ from itertools import chain
4
+ from pathlib import Path
5
+ from random import Random
6
+ from typing import Optional, Union
7
+
8
+ import numpy as np
9
+ import pyarrow.parquet as pq
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from datasets.download.streaming_download_manager import xopen
13
+ from huggingface_hub import HfApi
14
+ from lightning import LightningDataModule
15
+ from torch.distributed import get_rank, get_world_size, is_initialized
16
+ from torch.utils.data import DataLoader, IterableDataset, get_worker_info
17
+ from transformers import AutoTokenizer
18
+
19
+ from fish_speech.conversation import CODEBOOK_PAD_TOKEN_ID
20
+ from fish_speech.datasets.protos.text_data_pb2 import SampledData
21
+ from fish_speech.datasets.protos.text_data_stream import read_pb_stream
22
+ from fish_speech.text.clean import clean_text
23
+ from fish_speech.utils import RankedLogger
24
+ from fish_speech.utils.braceexpand import braceexpand
25
+
26
+ log = RankedLogger(__name__, rank_zero_only=True)
27
+
28
+
29
+ def split_by_rank_worker(files):
30
+ # We need to know the total number of devices
31
+ # to split the data properly
32
+
33
+ total_devices = 1
34
+ if is_initialized():
35
+ total_devices = get_world_size()
36
+
37
+ worker_info = get_worker_info()
38
+ if worker_info is not None:
39
+ total_devices *= worker_info.num_workers
40
+
41
+ if len(files) < total_devices:
42
+ # Repeat the files N times to match the number of devices
43
+ files = files * (total_devices // len(files) + 1)
44
+
45
+ # DDP
46
+ if is_initialized():
47
+ files = files[get_rank() :: get_world_size()]
48
+
49
+ # Split by worker
50
+ if worker_info is not None:
51
+ files = files[worker_info.id :: worker_info.num_workers]
52
+
53
+ return files
54
+
55
+
56
+ class AutoTextSemanticInstructionDataset(IterableDataset):
57
+ """
58
+ Auto Augment Dataset by Speaker
59
+
60
+ 1. Random concatenate multiple sentences from the same speaker to form a longer sentence
61
+ 2. Automatically normalize the text
62
+
63
+ For interactive mode, we use the following format (multiple sequences):
64
+ <s> [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST] </s>
65
+
66
+ For non-interactive mode, we use the following format (one long sequence):
67
+ <s> [INST] text [/INST] ... </s>
68
+ """
69
+
70
+ def __init__(
71
+ self,
72
+ proto_files: list[str],
73
+ seed: int = 42,
74
+ interactive_prob: float = 0.5,
75
+ max_length: int = 1024,
76
+ tokenizer: AutoTokenizer = None,
77
+ use_speaker: bool | float = True,
78
+ causal: bool = True,
79
+ num_codebooks: Optional[int] = None,
80
+ skip_text_prob: float = 0.0,
81
+ ):
82
+ """
83
+ Args:
84
+ proto_files: proto buf files if using local data
85
+ seed: random seed
86
+ interactive_prob: probability to use interactive mode
87
+ max_length: max length of the text
88
+ tokenizer: tokenizer
89
+ use_speaker: include speaker information in the prompt
90
+ causal: use causal sampling when using local data, disable will lead to random sampling
91
+ num_codebooks: number of codebooks, if None, it will be automatically detected
92
+ skip_text_prob: probability to skip the text (audio only), this only applies to interactive mode
93
+ """
94
+
95
+ super().__init__()
96
+
97
+ assert 0 <= interactive_prob <= 1, "interactive_prob must be in [0, 1]"
98
+
99
+ self.seed = seed
100
+ self.max_length = max_length
101
+ self.tokenizer = tokenizer
102
+ self.interactive_prob = interactive_prob
103
+ self.use_speaker = use_speaker
104
+ self.proto_files = proto_files
105
+ self.causal = causal
106
+ self.num_codebooks = num_codebooks
107
+ self.skip_text_prob = skip_text_prob
108
+
109
+ self.semantic_token_id = self.tokenizer.convert_tokens_to_ids("<|semantic|>")
110
+ self.groups = None
111
+
112
+ def init_mock_data_server(self):
113
+ if self.groups is not None:
114
+ return
115
+
116
+ # Expand the proto files
117
+ expanded_proto_files = []
118
+ for filename in self.proto_files:
119
+ for i in braceexpand(filename):
120
+ i = Path(i)
121
+ if i.is_file():
122
+ expanded_proto_files.append(i)
123
+ elif i.is_dir():
124
+ expanded_proto_files.extend(i.rglob("*.proto"))
125
+ expanded_proto_files.extend(i.rglob("*.protos"))
126
+ else:
127
+ raise ValueError(f"{i} is not a file or directory")
128
+
129
+ expanded_proto_files = sorted(expanded_proto_files)
130
+ Random(self.seed).shuffle(expanded_proto_files)
131
+
132
+ self.groups = []
133
+ shard_proto_files = split_by_rank_worker(expanded_proto_files)
134
+ log.info(
135
+ f"Reading {len(shard_proto_files)} / {len(expanded_proto_files)} files"
136
+ )
137
+
138
+ count = 0
139
+ for filename in shard_proto_files:
140
+ with open(filename, "rb") as f:
141
+ for text_data in read_pb_stream(f):
142
+ self.groups.append(text_data)
143
+ count += 1
144
+
145
+ log.info(f"Read total {count} groups of data")
146
+
147
+ # Shuffle the lines
148
+ Random(self.seed).shuffle(self.groups)
149
+ self.group_weights = [len(i.sentences) for i in self.groups]
150
+
151
+ def __iter__(self):
152
+ while True:
153
+ yield self.augment()
154
+
155
+ def tokenize_sentence(self, sentence: str):
156
+ sentence = clean_text(sentence)
157
+ tokens = self.tokenizer.encode(
158
+ f"{sentence}",
159
+ max_length=10**6,
160
+ add_special_tokens=False,
161
+ truncation=False,
162
+ )
163
+ return sentence, len(tokens)
164
+
165
+ def sample_data(self):
166
+ if self.groups is None:
167
+ self.init_mock_data_server()
168
+
169
+ # Shuffle unique lines, estimate that each sample is at least 20 tokens
170
+ num_samples = self.max_length // 20
171
+
172
+ # choice group based on their number of samples
173
+ group = random.choices(self.groups, weights=self.group_weights, k=1)[0]
174
+
175
+ if self.causal:
176
+ # Sample in order
177
+ if num_samples >= len(group.sentences):
178
+ samples = group.sentences
179
+ else:
180
+ begin = random.randint(0, len(group.sentences) - num_samples)
181
+ samples = group.sentences[begin : begin + num_samples]
182
+ else:
183
+ samples = random.choices(
184
+ group.sentences, k=min(num_samples, len(group.sentences))
185
+ )
186
+
187
+ return SampledData(
188
+ source=group.source,
189
+ name=group.name,
190
+ samples=samples,
191
+ )
192
+
193
+ def augment(self):
194
+ final_text, final_semantic = [], []
195
+ response = self.sample_data()
196
+ if len(response.samples) == 0:
197
+ # Invalid group
198
+ return None
199
+
200
+ samples = list(response.samples)
201
+ idx = 0
202
+ use_interactive = random.random() < self.interactive_prob
203
+
204
+ if use_interactive is False:
205
+ # Random sample based on speaker using a truncated normal distribution
206
+ a = torch.tensor([0], dtype=torch.float32)
207
+ torch.nn.init.trunc_normal_(
208
+ a,
209
+ mean=self.max_length // 2,
210
+ std=self.max_length // 4,
211
+ a=10,
212
+ b=self.max_length,
213
+ )
214
+ remaining_tokens = a.long().item() - 4
215
+ else:
216
+ remaining_tokens = self.max_length
217
+
218
+ # Use speaker
219
+ if isinstance(self.use_speaker, float):
220
+ use_speaker = random.random() < self.use_speaker
221
+ else:
222
+ use_speaker = self.use_speaker
223
+
224
+ all_tokens, all_labels = [], []
225
+ while remaining_tokens > 0 and len(samples) > 0:
226
+ sentence = samples.pop(0)
227
+
228
+ text = random.choice(sentence.texts)
229
+ text, length = self.tokenize_sentence(text)
230
+ remaining_tokens -= length + len(sentence.semantics[0].values)
231
+
232
+ if use_interactive is False:
233
+ final_text.append(text)
234
+ final_semantic.append(sentence.semantics)
235
+ else:
236
+ # For interactive mode, we only apply speaker for the first sentence
237
+ # [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST]
238
+ tokens, labels = self.pack_sentences(
239
+ sentences=[text],
240
+ semantics=[sentence.semantics],
241
+ speaker=response.name if use_speaker else None,
242
+ skip_text=random.random() < self.skip_text_prob,
243
+ )
244
+
245
+ all_tokens.append(tokens)
246
+ all_labels.append(labels)
247
+
248
+ idx += 1
249
+
250
+ if use_interactive is False:
251
+ tokens, labels = self.pack_sentences(
252
+ final_text,
253
+ semantics=final_semantic,
254
+ speaker=response.name if use_speaker else None,
255
+ )
256
+ all_tokens.append(tokens)
257
+ all_labels.append(labels)
258
+
259
+ tokens = torch.cat(all_tokens, dim=1)
260
+ labels = torch.cat(all_labels, dim=1)
261
+
262
+ # Verify that the length is correct
263
+ assert tokens.size(1) == labels.size(1), f"{tokens.size(1)} != {labels.size(1)}"
264
+
265
+ data = {"tokens": tokens, "labels": labels}
266
+
267
+ return data
268
+
269
+ def pack_sentences(
270
+ self,
271
+ sentences: list[str],
272
+ semantics: list,
273
+ speaker: Optional[str] = None,
274
+ skip_text: bool = False,
275
+ ):
276
+ if speaker is None:
277
+ speaker = "assistant"
278
+
279
+ cated_sentences = " ".join(sentences)
280
+ if skip_text:
281
+ cated_sentences = "<|skip_text|>"
282
+
283
+ final_text = "<|im_start|>user\n" + cated_sentences + "<|im_end|>"
284
+ final_text = final_text + f"<|im_start|>{speaker}\n"
285
+
286
+ encoded = self.tokenizer.encode(
287
+ final_text,
288
+ add_special_tokens=False,
289
+ truncation=False,
290
+ max_length=10**6,
291
+ )
292
+ semantic_length = sum([len(i[0].values) for i in semantics])
293
+ prompt_length = len(encoded)
294
+ num_codebooks = (
295
+ len(semantics[0]) if self.num_codebooks is None else self.num_codebooks
296
+ )
297
+
298
+ # Pack the tokens and semantics (add <s> and </s> to semantic tokens)
299
+ tokens = (
300
+ encoded
301
+ + [self.semantic_token_id] * semantic_length
302
+ + self.tokenizer.convert_tokens_to_ids(["<|im_end|>"])
303
+ )
304
+
305
+ # Codebook bos/padding: 0, eos: 1
306
+ codes = [[CODEBOOK_PAD_TOKEN_ID] * prompt_length for _ in range(num_codebooks)]
307
+ for segment in semantics:
308
+ for book_idx, book in zip(range(num_codebooks), segment):
309
+ for j in book.values:
310
+ codes[book_idx].append(int(j) + 1)
311
+
312
+ for book in codes:
313
+ book.extend([CODEBOOK_PAD_TOKEN_ID] * 1)
314
+
315
+ tokens = [tokens] + codes
316
+
317
+ tokens = torch.tensor(tokens, dtype=torch.long)
318
+ labels = tokens.clone()
319
+
320
+ if skip_text:
321
+ # If text is not provided, the sentence is used for condition only, all labels are -100
322
+ torch.fill_(labels, -100)
323
+ return tokens, labels
324
+
325
+ # Mask out the <s> tokens for semantic, predict semantic tokens only
326
+ # Since we don't mask out the input tokens, the language modeling still works
327
+ labels[1:, :prompt_length] = -100
328
+
329
+ tokens = tokens[:, :-1]
330
+ labels = labels[:, 1:]
331
+
332
+ # Verify the padding is correct, and the last token is eos
333
+ assert (tokens[1:, :prompt_length] == CODEBOOK_PAD_TOKEN_ID).all()
334
+ assert (labels[1:, -1:] == CODEBOOK_PAD_TOKEN_ID).all()
335
+
336
+ return tokens, labels
337
+
338
+
339
+ @dataclass
340
+ class TextDataCollator:
341
+ tokenizer: AutoTokenizer
342
+ max_length: int = 1024
343
+
344
+ def __call__(self, examples):
345
+ if "negative_tokens" in examples:
346
+ positive_examples = []
347
+ negative_examples = []
348
+
349
+ for i in examples:
350
+ positive_examples.append(
351
+ {
352
+ "tokens": i["tokens"],
353
+ "labels": i["labels"],
354
+ }
355
+ )
356
+ negative_examples.append(
357
+ {
358
+ "tokens": i["negative_tokens"],
359
+ "labels": i["negative_labels"],
360
+ }
361
+ )
362
+
363
+ examples = positive_examples + negative_examples
364
+
365
+ return self.batchify(examples)
366
+
367
+ def batchify(self, examples, tokens_key="tokens", labels_key="labels"):
368
+ tokens, attention_masks, labels = [], [], []
369
+
370
+ # Calculate the max length
371
+ max_tokens_length = 0
372
+ for example in examples:
373
+ max_tokens_length = max(max_tokens_length, example[tokens_key].size(1))
374
+ max_tokens_length = min(max_tokens_length, self.max_length)
375
+
376
+ for example in examples:
377
+ _tokens = example[tokens_key][:, :max_tokens_length]
378
+ _labels = example[labels_key][:, :max_tokens_length]
379
+ _attention_mask = torch.ones((max_tokens_length,), dtype=torch.bool)
380
+ tokens_length = _tokens.size(1)
381
+ _attention_mask[:tokens_length] = False
382
+
383
+ assert tokens_length == _labels.size(
384
+ 1
385
+ ), f"{tokens_length} != {_labels.size(1)}"
386
+
387
+ if tokens_length < max_tokens_length:
388
+ _tokens = F.pad(
389
+ _tokens,
390
+ (0, max_tokens_length - tokens_length),
391
+ value=self.tokenizer.eos_token_id,
392
+ )
393
+ _tokens[1:, tokens_length:] = CODEBOOK_PAD_TOKEN_ID
394
+ _labels = F.pad(
395
+ _labels, (0, max_tokens_length - _labels.size(1)), value=-100
396
+ )
397
+
398
+ tokens.append(_tokens)
399
+ attention_masks.append(_attention_mask)
400
+ labels.append(_labels)
401
+
402
+ tokens = torch.stack(tokens, dim=0)
403
+ attention_masks = torch.stack(attention_masks, dim=0)
404
+ labels = torch.stack(labels, dim=0)
405
+
406
+ return {
407
+ "inputs": tokens,
408
+ "attention_masks": attention_masks,
409
+ "labels": labels,
410
+ }
411
+
412
+
413
+ class InterleaveDataset(IterableDataset):
414
+ def __init__(
415
+ self,
416
+ datasets: list[IterableDataset],
417
+ probabilities: list[float],
418
+ seed: int = 42,
419
+ ):
420
+ super().__init__()
421
+
422
+ self.datasets = datasets
423
+ self.probabilities = probabilities
424
+ self.seed = seed
425
+
426
+ def __iter__(self):
427
+ rng = np.random.default_rng(self.seed)
428
+ dataset_iterators = [iter(dataset) for dataset in self.datasets]
429
+
430
+ while True:
431
+ # Random choice one
432
+ dataset_idx = rng.choice(len(self.datasets), p=self.probabilities)
433
+ dataset_iterator = dataset_iterators[dataset_idx]
434
+
435
+ try:
436
+ yield next(dataset_iterator)
437
+ except StopIteration:
438
+ # Exhausted, create a new iterator
439
+ dataset_iterators[dataset_idx] = iter(self.datasets[dataset_idx])
440
+ yield next(dataset_iterators[dataset_idx])
441
+
442
+
443
+ class SemanticDataModule(LightningDataModule):
444
+ def __init__(
445
+ self,
446
+ train_dataset: Union[AutoTextSemanticInstructionDataset, InterleaveDataset],
447
+ val_dataset: Union[AutoTextSemanticInstructionDataset, InterleaveDataset],
448
+ batch_size: int = 32,
449
+ tokenizer: AutoTokenizer = None,
450
+ max_length: int = 1024,
451
+ num_workers: int = 4,
452
+ ):
453
+ super().__init__()
454
+
455
+ self.train_dataset = train_dataset
456
+ self.val_dataset = val_dataset
457
+ self.batch_size = batch_size
458
+ self.tokenizer = tokenizer
459
+ self.max_length = max_length
460
+ self.num_workers = num_workers
461
+
462
+ def train_dataloader(self):
463
+ return DataLoader(
464
+ self.train_dataset,
465
+ batch_size=self.batch_size,
466
+ collate_fn=TextDataCollator(self.tokenizer, self.max_length),
467
+ num_workers=self.num_workers,
468
+ persistent_workers=True,
469
+ )
470
+
471
+ def val_dataloader(self):
472
+ return DataLoader(
473
+ self.val_dataset,
474
+ batch_size=self.batch_size,
475
+ collate_fn=TextDataCollator(self.tokenizer, self.max_length),
476
+ num_workers=self.num_workers,
477
+ persistent_workers=True,
478
+ )
479
+
480
+
481
+ if __name__ == "__main__":
482
+ from tqdm import tqdm
483
+
484
+ ds = AutoTextSemanticInstructionDataset(
485
+ ["data/protos"],
486
+ tokenizer=AutoTokenizer.from_pretrained("fishaudio/fish-speech-1"),
487
+ use_speaker=False,
488
+ interactive_prob=1.0,
489
+ skip_text_prob=0.5,
490
+ )
491
+
492
+ for i in ds:
493
+ print(ds.tokenizer.decode(i["tokens"][0], skip_special_tokens=False))
494
+ # i["labels"][0][i["labels"][0] == -100] = 0
495
+ # print(ds.tokenizer.decode(i["labels"][0], skip_special_tokens=False))
496
+ break
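
The `AutoTextSemanticInstructionDataset`, `TextDataCollator`, and `SemanticDataModule` pieces above are meant to be wired together through Lightning. The following is a minimal editorial sketch of that wiring, not part of the commit; the module path `fish_speech.datasets.semantic`, the tokenizer checkpoint, and the `data/protos` directory are assumptions taken from the `__main__` block above.

```python
from transformers import AutoTokenizer

# Assumed module path for the dataset code shown above.
from fish_speech.datasets.semantic import (
    AutoTextSemanticInstructionDataset,
    SemanticDataModule,
)

tokenizer = AutoTokenizer.from_pretrained("fishaudio/fish-speech-1")


def make_dataset():
    # Same arguments as the __main__ example above; "data/protos" is a placeholder.
    return AutoTextSemanticInstructionDataset(
        ["data/protos"],
        tokenizer=tokenizer,
        use_speaker=False,
        interactive_prob=1.0,
        skip_text_prob=0.5,
    )


dm = SemanticDataModule(
    train_dataset=make_dataset(),
    val_dataset=make_dataset(),
    batch_size=4,
    tokenizer=tokenizer,
    max_length=1024,
    num_workers=2,
)

# Batches come out of TextDataCollator: row 0 of "inputs" is text, the remaining
# rows are codebooks, and "attention_masks" is False at real positions, True at padding.
batch = next(iter(dm.train_dataloader()))
print(batch["inputs"].shape, batch["attention_masks"].shape, batch["labels"].shape)
```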
fish_speech/datasets/text.py DELETED
@@ -1,661 +0,0 @@
1
- import random
2
- from dataclasses import dataclass
3
- from itertools import chain
4
- from pathlib import Path
5
- from random import Random
6
- from typing import Optional, Union
7
-
8
- import grpc
9
- import numpy as np
10
- import pyarrow.parquet as pq
11
- import torch
12
- import torch.nn.functional as F
13
- from datasets.download.streaming_download_manager import xopen
14
- from huggingface_hub import HfApi
15
- from lightning import LightningDataModule
16
- from torch.distributed import get_rank, get_world_size, is_initialized
17
- from torch.utils.data import DataLoader, IterableDataset, get_worker_info
18
- from transformers import AutoTokenizer
19
-
20
- from fish_speech.datasets.protos.text_data_pb2 import SampledData
21
- from fish_speech.datasets.protos.text_data_stream import read_pb_stream
22
- from fish_speech.text.clean import clean_text
23
- from fish_speech.utils import RankedLogger
24
- from fish_speech.utils.braceexpand import braceexpand
25
-
26
- log = RankedLogger(__name__, rank_zero_only=True)
27
-
28
- CODEBOOK_PAD_TOKEN_ID = 0
29
- CODEBOOK_EOS_TOKEN_ID = 1
30
-
31
-
32
- def split_by_rank_worker(files):
33
- # We need to know the total number of devices
34
- # to split the data properly
35
-
36
- total_devices = 1
37
- if is_initialized():
38
- total_devices = get_world_size()
39
-
40
- worker_info = get_worker_info()
41
- if worker_info is not None:
42
- total_devices *= worker_info.num_workers
43
-
44
- if len(files) < total_devices:
45
- # Repeat the files N times to match the number of devices
46
- files = files * (total_devices // len(files) + 1)
47
-
48
- # DDP
49
- if is_initialized():
50
- files = files[get_rank() :: get_world_size()]
51
-
52
- # Split by worker
53
- if worker_info is not None:
54
- files = files[worker_info.id :: worker_info.num_workers]
55
-
56
- return files
57
-
58
-
59
- class StreamTextDataset(IterableDataset):
60
- def __init__(
61
- self,
62
- files: Optional[Union[list[str], str]] = None,
63
- prefix: Optional[str] = None,
64
- seed: int = 42,
65
- parquet_batch_size: int = 10000,
66
- repo: str = "uonlp/CulturaX",
67
- max_length: int = 1024,
68
- tokenizer: AutoTokenizer = None,
69
- ):
70
- super().__init__()
71
-
72
- self.seed = seed
73
- self.parquet_batch_size = parquet_batch_size
74
- self.repo = repo
75
- self.max_length = max_length
76
- self.tokenizer = tokenizer
77
-
78
- if files is None and prefix is None:
79
- raise ValueError("Either files or prefix must be specified")
80
-
81
- if prefix is not None:
82
- files = HfApi().list_repo_files(repo, repo_type="dataset")
83
- files = [
84
- f for f in files if f.startswith(prefix) and f.endswith(".parquet")
85
- ]
86
- log.info(f"Found {len(files)} files in {repo} with prefix {prefix}")
87
- else:
88
- if isinstance(files, str):
89
- files = [files]
90
-
91
- files = list(chain.from_iterable(map(braceexpand, files)))
92
- log.info(f"Expanded {len(files)} files in {repo}")
93
-
94
- # Get sharded files
95
- self.files = sorted(files)
96
- Random(seed).shuffle(self.files)
97
-
98
- def __iter__(self):
99
- files = split_by_rank_worker(self.files)
100
- random.shuffle(files)
101
-
102
- for filename in files:
103
- try:
104
- yield from self.parse_data(filename)
105
- except Exception as e:
106
- log.exception(f"Failed to parse {filename}: {e}")
107
-
108
- def parse_data(self, filename: str):
109
- for data in self.parse_data_internal(filename):
110
- text = data["text"]
111
-
112
- # encode
113
- tokens = self.tokenizer.encode(
114
- text,
115
- add_special_tokens=False,
116
- truncation=False,
117
- max_length=10**6,
118
- )
119
-
120
- # Random choice self.max_length
121
- if len(tokens) > self.max_length:
122
- start = random.randint(0, len(tokens) - self.max_length)
123
- tokens = tokens[start : start + self.max_length - 1]
124
-
125
- tokens = (
126
- [self.tokenizer.bos_token_id] + tokens + [self.tokenizer.eos_token_id]
127
- )
128
- # Pad dims
129
- placeholder_multi_codebook = torch.zeros((4, len(tokens)), dtype=torch.long)
130
-
131
- tokens = torch.concat(
132
- [
133
- torch.tensor([tokens], dtype=torch.long),
134
- placeholder_multi_codebook,
135
- ],
136
- dim=0,
137
- )
138
- labels = tokens.clone()
139
- tokens = tokens[:, :-1]
140
- labels = labels[:, 1:]
141
- labels[1:] = -100 # remove all placeholders
142
-
143
- yield {"tokens": tokens, "labels": labels}
144
-
145
- def parse_data_internal(self, filename: str):
146
- url = f"https://huggingface.co/datasets/{self.repo}/resolve/main/{filename}"
147
-
148
- with xopen(url, mode="rb") as stream:
149
- parquet_file = pq.ParquetFile(stream)
150
-
151
- for batch in parquet_file.iter_batches(
152
- batch_size=self.parquet_batch_size, columns=["text"]
153
- ):
154
- # In-batch shuffling
155
- texts = [{"text": text.as_py()} for text in batch["text"]]
156
- random.shuffle(texts)
157
- yield from texts
158
-
159
-
160
- class AutoAugTextDataset(IterableDataset):
161
- """
162
- Auto Augment Dataset by Speaker
163
-
164
- 1. Random concatenate multiple sentences from the same speaker to form a longer sentence
165
- 2. Automatically normalize the text
166
-
167
- For interactive mode, we use the following format (multiple sequences):
168
- <s> [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST] </s>
169
-
170
- For non-interactive mode, we use the following format (one long sequence):
171
- <s> [INST] text [/INST] ... </s>
172
- """
173
-
174
- def __init__(
175
- self,
176
- proto_files: list[str],
177
- seed: int = 42,
178
- interactive_prob: float = 0.5,
179
- max_length: int = 1024,
180
- tokenizer: AutoTokenizer = None,
181
- use_speaker: bool = True,
182
- causual: bool = True,
183
- use_negative_samples: bool = False,
184
- num_codebooks: Optional[int] = None,
185
- ):
186
- """
187
- Args:
188
- proto_files: proto buf files if using local data
189
- seed: random seed
190
- interactive_prob: probability to use interactive mode
191
- max_length: max length of the text
192
- tokenizer: tokenizer
193
- use_speaker: include speaker information in the prompt
194
- causual: use causual sampling when using local data, disable will lead to random sampling
195
- use_negative_samples: generate negative samples
196
- num_codebooks: number of codebooks, if None, it will be automatically detected
197
- """
198
-
199
- super().__init__()
200
-
201
- assert 0 <= interactive_prob <= 1, "interactive_prob must be in [0, 1]"
202
-
203
- self.seed = seed
204
- self.max_length = max_length
205
- self.tokenizer = tokenizer
206
- self.interactive_prob = interactive_prob
207
- self.use_speaker = use_speaker
208
- self.proto_files = proto_files
209
- self.causual = causual
210
- self.use_negative_samples = use_negative_samples
211
- self.num_codebooks = num_codebooks
212
-
213
- self.semantic_token_id = self.tokenizer.convert_tokens_to_ids("<|semantic|>")
214
- self.groups = None
215
-
216
- def init_mock_data_server(self):
217
- if self.groups is not None:
218
- return
219
-
220
- # Expand the proto files
221
- expanded_proto_files = []
222
- for filename in self.proto_files:
223
- for i in braceexpand(filename):
224
- i = Path(i)
225
- if i.is_file():
226
- expanded_proto_files.append(i)
227
- elif i.is_dir():
228
- expanded_proto_files.extend(i.rglob("*.proto"))
229
- expanded_proto_files.extend(i.rglob("*.protos"))
230
- else:
231
- raise ValueError(f"{i} is not a file or directory")
232
-
233
- expanded_proto_files = sorted(expanded_proto_files)
234
- Random(self.seed).shuffle(expanded_proto_files)
235
-
236
- self.groups = []
237
- shard_proto_files = split_by_rank_worker(expanded_proto_files)
238
- log.info(
239
- f"Reading {len(shard_proto_files)} / {len(expanded_proto_files)} files"
240
- )
241
-
242
- count = 0
243
- for filename in shard_proto_files:
244
- with open(filename, "rb") as f:
245
- for text_data in read_pb_stream(f):
246
- self.groups.append(text_data)
247
- count += 1
248
-
249
- log.info(f"Read total {count} groups of data")
250
-
251
- # Shuffle the lines
252
- Random(self.seed).shuffle(self.groups)
253
- self.group_weights = [len(i.sentences) for i in self.groups]
254
-
255
- def __iter__(self):
256
- while True:
257
- yield self.augment()
258
-
259
- def tokenize_sentence(self, sentence: str):
260
- sentence = clean_text(sentence)
261
- tokens = self.tokenizer.encode(
262
- f"{sentence}",
263
- max_length=10**6,
264
- add_special_tokens=False,
265
- truncation=False,
266
- )
267
- return sentence, len(tokens)
268
-
269
- def sample_data(self):
270
- if self.groups is None:
271
- self.init_mock_data_server()
272
-
273
- # Shuffle unique lines, estimate that each sample is at least 20 tokens
274
- num_samples = self.max_length // 20
275
-
276
- # choice group based on their number of samples
277
- group = random.choices(self.groups, weights=self.group_weights, k=1)[0]
278
-
279
- if self.causual:
280
- # Sample in order
281
- if num_samples >= len(group.sentences):
282
- samples = group.sentences
283
- else:
284
- begin = random.randint(0, len(group.sentences) - num_samples)
285
- samples = group.sentences[begin : begin + num_samples]
286
- else:
287
- samples = random.choices(
288
- group.sentences, k=min(num_samples, len(group.sentences))
289
- )
290
-
291
- return SampledData(
292
- source=group.source,
293
- name=group.name,
294
- samples=samples,
295
- )
296
-
297
- def augment(self):
298
- # Random sample based on speaker using a truncated normal distribution
299
- a = torch.tensor([0], dtype=torch.float32)
300
- torch.nn.init.trunc_normal_(
301
- a,
302
- mean=self.max_length // 2,
303
- std=self.max_length // 4,
304
- a=10,
305
- b=self.max_length,
306
- )
307
- remaining_tokens = a.long().item() - 4
308
-
309
- final_text, final_semantic = [], []
310
- response = self.sample_data()
311
- if len(response.samples) == 0:
312
- # Invalid group
313
- return None
314
-
315
- samples = list(response.samples)
316
- idx = 0
317
- use_interactive = random.random() < self.interactive_prob
318
-
319
- all_tokens, all_labels = [], []
320
- while remaining_tokens > 0 and len(samples) > 0:
321
- sentence = samples.pop(0)
322
-
323
- text = random.choice(sentence.texts)
324
- text, length = self.tokenize_sentence(text)
325
- remaining_tokens -= length + len(sentence.semantics[0].values)
326
-
327
- if use_interactive is False:
328
- final_text.append(text)
329
- final_semantic.append(sentence.semantics)
330
- else:
331
- # For interactive mode, we only apply speaker for the first sentence
332
- # [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST]
333
- tokens, labels = self.pack_sentences(
334
- sentences=[text],
335
- semantics=[sentence.semantics],
336
- speaker=response.name if (self.use_speaker and idx == 0) else None,
337
- add_bos=idx == 0,
338
- )
339
-
340
- all_tokens.append(tokens)
341
- all_labels.append(labels)
342
-
343
- idx += 1
344
-
345
- if use_interactive is False:
346
- tokens, labels = self.pack_sentences(
347
- final_text,
348
- semantics=final_semantic,
349
- speaker=response.name if self.use_speaker else None,
350
- add_bos=True,
351
- )
352
- all_tokens.append(tokens)
353
- all_labels.append(labels)
354
-
355
- tokens = torch.cat(all_tokens, dim=1)
356
- labels = torch.cat(all_labels, dim=1)
357
-
358
- # Verify that the length is correct
359
- assert tokens.size(1) == labels.size(1), f"{tokens.size(1)} != {labels.size(1)}"
360
-
361
- # Verify bos token
362
- assert tokens[0, 0] == self.tokenizer.bos_token_id
363
-
364
- data = {"tokens": tokens, "labels": labels}
365
-
366
- if self.use_negative_samples:
367
- negative_samples = self.generate_negative_samples(all_tokens, all_labels)
368
- data.update(negative_samples)
369
-
370
- return data
371
-
372
- def generate_negative_samples(self, all_tokens, all_labels):
373
- new_tokens, new_labels = [], []
374
-
375
- for tokens, labels in zip(all_tokens, all_labels):
376
- # If all codebooks are not -100, we find where it starts
377
- start = torch.where(labels[1:].sum(0) != -100 * (labels.size(0) - 1))[0][0]
378
- assert (labels[1:, start:] != -100).all() # This shouldn't happen
379
-
380
- mode = random.choice(["repeat", "lost", "noise"])
381
- begin = random.randint(start, labels.size(1) - 1)
382
- end = random.randint(begin, labels.size(1) - 1)
383
-
384
- if mode == "repeat":
385
- tokens = torch.cat(
386
- [
387
- tokens[:, :begin],
388
- tokens[:, begin:end],
389
- tokens[:, begin:end],
390
- tokens[:, end:],
391
- ],
392
- dim=1,
393
- )
394
- labels = torch.cat(
395
- [
396
- labels[:, :begin],
397
- labels[:, begin:end],
398
- labels[:, begin:end],
399
- labels[:, end:],
400
- ],
401
- dim=1,
402
- )
403
- elif mode == "lost":
404
- tokens = torch.cat([tokens[:, :begin], tokens[:, end:]], dim=1)
405
- labels = torch.cat([labels[:, :begin], labels[:, end:]], dim=1)
406
- elif mode == "noise":
407
- middle_tokens, middle_labels = (
408
- tokens[:, begin:end],
409
- labels[:, begin:end],
410
- )
411
- random_order0 = torch.randperm(middle_tokens.size(1))
412
- random_order1 = torch.randperm(middle_tokens.size(1))
413
- middle_tokens = middle_tokens[:, random_order0]
414
- middle_labels = middle_labels[:, random_order1]
415
- tokens = torch.cat(
416
- [tokens[:, :begin], middle_tokens, tokens[:, end:]], dim=1
417
- )
418
- labels = torch.cat(
419
- [labels[:, :begin], middle_labels, labels[:, end:]], dim=1
420
- )
421
-
422
- new_tokens.append(tokens)
423
- new_labels.append(labels)
424
-
425
- tokens = torch.cat(new_tokens, dim=1)
426
- labels = torch.cat(new_labels, dim=1)
427
-
428
- # Verify that the length is correct
429
- assert tokens.size(1) == labels.size(1), f"{tokens.size(1)} != {labels.size(1)}"
430
-
431
- return {"negative_tokens": tokens, "negative_labels": labels}
432
-
433
- def pack_sentences(
434
- self,
435
- sentences: list[str],
436
- semantics=list,
437
- speaker: Optional[str] = None,
438
- add_bos: bool = True,
439
- ):
440
- if speaker is not None:
441
- sentences = [f"[SPK: {speaker}]"] + sentences
442
-
443
- final_text = "<|im_start|>user<|im_sep|>" + " ".join(sentences) + "<|im_end|>"
444
- final_text = final_text + "<|im_start|>assistant<|im_sep|>"
445
-
446
- encoded = self.tokenizer.encode(
447
- final_text,
448
- add_special_tokens=False,
449
- truncation=False,
450
- max_length=10**6,
451
- )
452
- semantic_length = sum([len(i[0].values) for i in semantics])
453
- prompt_length = len(encoded)
454
- num_codebooks = (
455
- len(semantics[0]) if self.num_codebooks is None else self.num_codebooks
456
- )
457
-
458
- bos_bias = 1 if add_bos else 0
459
-
460
- # Pack the tokens and semantics (add <s> and </s> to semantic tokens)
461
- tokens = (
462
- encoded
463
- + [self.semantic_token_id] * semantic_length
464
- + self.tokenizer.convert_tokens_to_ids(
465
- ["<|im_end|>", "<|end_of_sequence|>"]
466
- )
467
- )
468
-
469
- if add_bos:
470
- tokens = [self.tokenizer.bos_token_id] + tokens
471
-
472
- # Codebook bos/padding: 0, eos: 1
473
- codes = [
474
- [CODEBOOK_PAD_TOKEN_ID] * (prompt_length + bos_bias)
475
- for _ in range(num_codebooks)
476
- ]
477
- for segment in semantics:
478
- for book_idx, book in zip(range(num_codebooks), segment):
479
- for j in book.values:
480
- codes[book_idx].append(int(j) + 2)
481
-
482
- for book in codes:
483
- book.extend([CODEBOOK_EOS_TOKEN_ID] * 2)
484
-
485
- tokens = [tokens] + codes
486
-
487
- tokens = torch.tensor(tokens, dtype=torch.long)
488
- labels = tokens.clone()
489
-
490
- # Mask out the <s> tokens for semantic, predict semantic tokens only
491
- # Since we don't mask out the input tokens, the language modeling still works
492
- labels[1:, : (prompt_length + bos_bias)] = -100
493
-
494
- tokens = tokens[:, :-1]
495
- labels = labels[:, 1:]
496
-
497
- # Verify the padding is correct, and the last token is eos
498
- assert add_bos is False or tokens[0, 0] == self.tokenizer.bos_token_id
499
- assert (tokens[1:, : prompt_length + bos_bias] == CODEBOOK_PAD_TOKEN_ID).all()
500
- assert labels[0, -1] == self.tokenizer.eos_token_id
501
- assert (labels[1:, -2:] == CODEBOOK_EOS_TOKEN_ID).all()
502
-
503
- return tokens, labels
504
-
505
-
506
- @dataclass
507
- class TextDataCollator:
508
- tokenizer: AutoTokenizer
509
- max_length: int = 1024
510
-
511
- def __call__(self, examples):
512
- if "negative_tokens" in examples:
513
- positive_examples = []
514
- negative_examples = []
515
-
516
- for i in examples:
517
- positive_examples.append(
518
- {
519
- "tokens": i["tokens"],
520
- "labels": i["labels"],
521
- }
522
- )
523
- negative_examples.append(
524
- {
525
- "tokens": i["negative_tokens"],
526
- "labels": i["negative_labels"],
527
- }
528
- )
529
-
530
- examples = positive_examples + negative_examples
531
-
532
- return self.batchify(examples)
533
-
534
- def batchify(self, examples, tokens_key="tokens", labels_key="labels"):
535
- tokens, attention_masks, labels = [], [], []
536
-
537
- # Calculate the max length
538
- max_tokens_length = 0
539
- for example in examples:
540
- max_tokens_length = max(max_tokens_length, example[tokens_key].size(1))
541
- max_tokens_length = min(max_tokens_length, self.max_length)
542
-
543
- for example in examples:
544
- _tokens = example[tokens_key][:, :max_tokens_length]
545
- _labels = example[labels_key][:, :max_tokens_length]
546
- _attention_mask = torch.ones((max_tokens_length,), dtype=torch.bool)
547
- tokens_length = _tokens.size(1)
548
- _attention_mask[:tokens_length] = False
549
-
550
- assert tokens_length == _labels.size(
551
- 1
552
- ), f"{tokens_length} != {_labels.size(1)}"
553
-
554
- if tokens_length < max_tokens_length:
555
- _tokens = F.pad(
556
- _tokens,
557
- (0, max_tokens_length - tokens_length),
558
- value=self.tokenizer.eos_token_id,
559
- )
560
- _tokens[1:, tokens_length:] = CODEBOOK_PAD_TOKEN_ID
561
- _labels = F.pad(
562
- _labels, (0, max_tokens_length - _labels.size(1)), value=-100
563
- )
564
-
565
- tokens.append(_tokens)
566
- attention_masks.append(_attention_mask)
567
- labels.append(_labels)
568
-
569
- tokens = torch.stack(tokens, dim=0)
570
- attention_masks = torch.stack(attention_masks, dim=0)
571
- labels = torch.stack(labels, dim=0)
572
-
573
- return {
574
- "inputs": tokens,
575
- "attention_masks": attention_masks,
576
- "labels": labels,
577
- }
578
-
579
-
580
- class InterleaveDataset(IterableDataset):
581
- def __init__(
582
- self,
583
- datasets: list[IterableDataset],
584
- probabilities: list[float],
585
- seed: int = 42,
586
- ):
587
- super().__init__()
588
-
589
- self.datasets = datasets
590
- self.probabilities = probabilities
591
- self.seed = seed
592
-
593
- def __iter__(self):
594
- rng = np.random.default_rng(self.seed)
595
- dataset_iterators = [iter(dataset) for dataset in self.datasets]
596
-
597
- while True:
598
- # Random choice one
599
- dataset_idx = rng.choice(len(self.datasets), p=self.probabilities)
600
- dataset_iterator = dataset_iterators[dataset_idx]
601
-
602
- try:
603
- yield next(dataset_iterator)
604
- except StopIteration:
605
- # Exhausted, create a new iterator
606
- dataset_iterators[dataset_idx] = iter(self.datasets[dataset_idx])
607
- yield next(dataset_iterators[dataset_idx])
608
-
609
-
610
- class TextDataModule(LightningDataModule):
611
- def __init__(
612
- self,
613
- train_dataset: Union[StreamTextDataset, AutoAugTextDataset, InterleaveDataset],
614
- val_dataset: Union[StreamTextDataset, AutoAugTextDataset, InterleaveDataset],
615
- batch_size: int = 32,
616
- tokenizer: AutoTokenizer = None,
617
- max_length: int = 1024,
618
- num_workers: int = 4,
619
- ):
620
- super().__init__()
621
-
622
- self.train_dataset = train_dataset
623
- self.val_dataset = val_dataset
624
- self.batch_size = batch_size
625
- self.tokenizer = tokenizer
626
- self.max_length = max_length
627
- self.num_workers = num_workers
628
-
629
- def train_dataloader(self):
630
- return DataLoader(
631
- self.train_dataset,
632
- batch_size=self.batch_size,
633
- collate_fn=TextDataCollator(self.tokenizer, self.max_length),
634
- num_workers=self.num_workers,
635
- )
636
-
637
- def val_dataloader(self):
638
- return DataLoader(
639
- self.val_dataset,
640
- batch_size=self.batch_size,
641
- collate_fn=TextDataCollator(self.tokenizer, self.max_length),
642
- num_workers=self.num_workers,
643
- )
644
-
645
-
646
- if __name__ == "__main__":
647
- from tqdm import tqdm
648
-
649
- ds = AutoAugTextDataset(
650
- ["data/protos"],
651
- tokenizer=AutoTokenizer.from_pretrained("fishaudio/fish-speech-1"),
652
- use_speaker=False,
653
- interactive_prob=1.0,
654
- use_negative_samples=False,
655
- )
656
-
657
- for i in ds:
658
- print(ds.tokenizer.decode(i["tokens"][0], skip_special_tokens=False))
659
- # i["labels"][0][i["labels"][0] == -100] = 0
660
- # print(ds.tokenizer.decode(i["labels"][0], skip_special_tokens=False))
661
- break
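
The `split_by_rank_worker` helper in the file removed above shards the expanded file list first across DDP ranks and then across dataloader workers with strided slicing. Below is a self-contained editorial version of that slicing with the rank and worker context passed in explicitly, so the resulting partition can be inspected without a distributed setup; the function and file names here are made up for illustration.

```python
# Standalone sketch of the rank/worker sharding used by split_by_rank_worker above.
def shard_files(files, rank, world_size, worker_id, num_workers):
    total_slots = world_size * num_workers
    if len(files) < total_slots:
        # Repeat the list so every (rank, worker) slot receives at least one file.
        files = files * (total_slots // len(files) + 1)
    files = files[rank::world_size]        # split across DDP ranks
    return files[worker_id::num_workers]   # then across dataloader workers


files = [f"shard-{i:03d}.protos" for i in range(10)]
for rank in range(2):
    for worker in range(2):
        print(rank, worker, shard_files(files, rank, 2, worker, 2))
# When len(files) >= world_size * num_workers, every file lands in exactly one slot.
```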
fish_speech/datasets/vqgan.py CHANGED
@@ -1,147 +1,147 @@
1
- from dataclasses import dataclass
2
- from pathlib import Path
3
- from typing import Optional
4
-
5
- import librosa
6
- import numpy as np
7
- import torch
8
- from lightning import LightningDataModule
9
- from torch.utils.data import DataLoader, Dataset
10
-
11
- from fish_speech.utils import RankedLogger
12
-
13
- logger = RankedLogger(__name__, rank_zero_only=False)
14
-
15
-
16
- class VQGANDataset(Dataset):
17
- def __init__(
18
- self,
19
- filelist: str,
20
- sample_rate: int = 32000,
21
- hop_length: int = 640,
22
- slice_frames: Optional[int] = None,
23
- ):
24
- super().__init__()
25
-
26
- filelist = Path(filelist)
27
- root = filelist.parent
28
-
29
- self.files = [
30
- root / line.strip()
31
- for line in filelist.read_text(encoding="utf-8").splitlines()
32
- if line.strip()
33
- ]
34
- self.sample_rate = sample_rate
35
- self.hop_length = hop_length
36
- self.slice_frames = slice_frames
37
-
38
- def __len__(self):
39
- return len(self.files)
40
-
41
- def get_item(self, idx):
42
- file = self.files[idx]
43
-
44
- audio, _ = librosa.load(file, sr=self.sample_rate, mono=True)
45
-
46
- # Slice audio and features
47
- if (
48
- self.slice_frames is not None
49
- and audio.shape[0] > self.slice_frames * self.hop_length
50
- ):
51
- start = np.random.randint(
52
- 0, audio.shape[0] - self.slice_frames * self.hop_length
53
- )
54
- audio = audio[start : start + self.slice_frames * self.hop_length]
55
-
56
- if len(audio) == 0:
57
- return None
58
-
59
- max_value = np.abs(audio).max()
60
- if max_value > 1.0:
61
- audio = audio / max_value
62
-
63
- return {
64
- "audio": torch.from_numpy(audio),
65
- }
66
-
67
- def __getitem__(self, idx):
68
- try:
69
- return self.get_item(idx)
70
- except Exception as e:
71
- import traceback
72
-
73
- traceback.print_exc()
74
- logger.error(f"Error loading {self.files[idx]}: {e}")
75
- return None
76
-
77
-
78
- @dataclass
79
- class VQGANCollator:
80
- def __call__(self, batch):
81
- batch = [x for x in batch if x is not None]
82
-
83
- audio_lengths = torch.tensor([len(x["audio"]) for x in batch])
84
- audio_maxlen = audio_lengths.max()
85
-
86
- # Rounds up to nearest multiple of 2 (audio_lengths)
87
- audios = []
88
- for x in batch:
89
- audios.append(
90
- torch.nn.functional.pad(x["audio"], (0, audio_maxlen - len(x["audio"])))
91
- )
92
-
93
- return {
94
- "audios": torch.stack(audios),
95
- "audio_lengths": audio_lengths,
96
- }
97
-
98
-
99
- class VQGANDataModule(LightningDataModule):
100
- def __init__(
101
- self,
102
- train_dataset: VQGANDataset,
103
- val_dataset: VQGANDataset,
104
- batch_size: int = 32,
105
- num_workers: int = 4,
106
- val_batch_size: Optional[int] = None,
107
- ):
108
- super().__init__()
109
-
110
- self.train_dataset = train_dataset
111
- self.val_dataset = val_dataset
112
- self.batch_size = batch_size
113
- self.val_batch_size = val_batch_size or batch_size
114
- self.num_workers = num_workers
115
-
116
- def train_dataloader(self):
117
- return DataLoader(
118
- self.train_dataset,
119
- batch_size=self.batch_size,
120
- collate_fn=VQGANCollator(),
121
- num_workers=self.num_workers,
122
- shuffle=True,
123
- persistent_workers=True,
124
- )
125
-
126
- def val_dataloader(self):
127
- return DataLoader(
128
- self.val_dataset,
129
- batch_size=self.val_batch_size,
130
- collate_fn=VQGANCollator(),
131
- num_workers=self.num_workers,
132
- persistent_workers=True,
133
- )
134
-
135
-
136
- if __name__ == "__main__":
137
- dataset = VQGANDataset("data/LibriTTS_R/vq_train_filelist.txt")
138
- dataloader = DataLoader(
139
- dataset, batch_size=4, shuffle=False, collate_fn=VQGANCollator()
140
- )
141
-
142
- for batch in dataloader:
143
- print(batch["audios"].shape)
144
- print(batch["features"].shape)
145
- print(batch["audio_lengths"])
146
- print(batch["feature_lengths"])
147
- break
 
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+ from typing import Optional
4
+
5
+ import librosa
6
+ import numpy as np
7
+ import torch
8
+ from lightning import LightningDataModule
9
+ from torch.utils.data import DataLoader, Dataset
10
+
11
+ from fish_speech.utils import RankedLogger
12
+
13
+ logger = RankedLogger(__name__, rank_zero_only=False)
14
+
15
+
16
+ class VQGANDataset(Dataset):
17
+ def __init__(
18
+ self,
19
+ filelist: str,
20
+ sample_rate: int = 32000,
21
+ hop_length: int = 640,
22
+ slice_frames: Optional[int] = None,
23
+ ):
24
+ super().__init__()
25
+
26
+ filelist = Path(filelist)
27
+ root = filelist.parent
28
+
29
+ self.files = [
30
+ root / line.strip()
31
+ for line in filelist.read_text(encoding="utf-8").splitlines()
32
+ if line.strip()
33
+ ]
34
+ self.sample_rate = sample_rate
35
+ self.hop_length = hop_length
36
+ self.slice_frames = slice_frames
37
+
38
+ def __len__(self):
39
+ return len(self.files)
40
+
41
+ def get_item(self, idx):
42
+ file = self.files[idx]
43
+
44
+ audio, _ = librosa.load(file, sr=self.sample_rate, mono=True)
45
+
46
+ # Slice audio and features
47
+ if (
48
+ self.slice_frames is not None
49
+ and audio.shape[0] > self.slice_frames * self.hop_length
50
+ ):
51
+ start = np.random.randint(
52
+ 0, audio.shape[0] - self.slice_frames * self.hop_length
53
+ )
54
+ audio = audio[start : start + self.slice_frames * self.hop_length]
55
+
56
+ if len(audio) == 0:
57
+ return None
58
+
59
+ max_value = np.abs(audio).max()
60
+ if max_value > 1.0:
61
+ audio = audio / max_value
62
+
63
+ return {
64
+ "audio": torch.from_numpy(audio),
65
+ }
66
+
67
+ def __getitem__(self, idx):
68
+ try:
69
+ return self.get_item(idx)
70
+ except Exception as e:
71
+ import traceback
72
+
73
+ traceback.print_exc()
74
+ logger.error(f"Error loading {self.files[idx]}: {e}")
75
+ return None
76
+
77
+
78
+ @dataclass
79
+ class VQGANCollator:
80
+ def __call__(self, batch):
81
+ batch = [x for x in batch if x is not None]
82
+
83
+ audio_lengths = torch.tensor([len(x["audio"]) for x in batch])
84
+ audio_maxlen = audio_lengths.max()
85
+
86
+ # Rounds up to nearest multiple of 2 (audio_lengths)
87
+ audios = []
88
+ for x in batch:
89
+ audios.append(
90
+ torch.nn.functional.pad(x["audio"], (0, audio_maxlen - len(x["audio"])))
91
+ )
92
+
93
+ return {
94
+ "audios": torch.stack(audios),
95
+ "audio_lengths": audio_lengths,
96
+ }
97
+
98
+
99
+ class VQGANDataModule(LightningDataModule):
100
+ def __init__(
101
+ self,
102
+ train_dataset: VQGANDataset,
103
+ val_dataset: VQGANDataset,
104
+ batch_size: int = 32,
105
+ num_workers: int = 4,
106
+ val_batch_size: Optional[int] = None,
107
+ ):
108
+ super().__init__()
109
+
110
+ self.train_dataset = train_dataset
111
+ self.val_dataset = val_dataset
112
+ self.batch_size = batch_size
113
+ self.val_batch_size = val_batch_size or batch_size
114
+ self.num_workers = num_workers
115
+
116
+ def train_dataloader(self):
117
+ return DataLoader(
118
+ self.train_dataset,
119
+ batch_size=self.batch_size,
120
+ collate_fn=VQGANCollator(),
121
+ num_workers=self.num_workers,
122
+ shuffle=True,
123
+ persistent_workers=True,
124
+ )
125
+
126
+ def val_dataloader(self):
127
+ return DataLoader(
128
+ self.val_dataset,
129
+ batch_size=self.val_batch_size,
130
+ collate_fn=VQGANCollator(),
131
+ num_workers=self.num_workers,
132
+ persistent_workers=True,
133
+ )
134
+
135
+
136
+ if __name__ == "__main__":
137
+ dataset = VQGANDataset("data/LibriTTS_R/vq_train_filelist.txt")
138
+ dataloader = DataLoader(
139
+ dataset, batch_size=4, shuffle=False, collate_fn=VQGANCollator()
140
+ )
141
+
142
+ for batch in dataloader:
143
+ print(batch["audios"].shape)
144
+ print(batch["features"].shape)
145
+ print(batch["audio_lengths"])
146
+ print(batch["feature_lengths"])
147
+ break
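
In the `vqgan.py` diff above, the removed and re-added blocks are line-for-line identical. Worth noting for anyone running the file: `VQGANCollator` only returns `audios` and `audio_lengths`, so the `batch["features"]` and `batch["feature_lengths"]` prints in the `__main__` block would fail with a `KeyError` as written. A self-contained check of the collator's padding, using synthetic audio so no filelist is needed, could look like this editorial sketch:

```python
import torch

from fish_speech.datasets.vqgan import VQGANCollator  # path taken from the file above

collator = VQGANCollator()
batch = collator(
    [
        {"audio": torch.randn(32000)},  # 1 s at the default 32 kHz sample rate
        {"audio": torch.randn(16000)},  # 0.5 s, zero-padded up to the batch maximum
        None,                           # failed samples are dropped by the collator
    ]
)
print(batch["audios"].shape)   # torch.Size([2, 32000])
print(batch["audio_lengths"])  # tensor([32000, 16000])
```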
fish_speech/i18n/README.md CHANGED
@@ -1,27 +1,27 @@
1
- ## i18n Folder Attribution
2
-
3
- The `i18n` folder within the `fish_speech` directory contains files initially sourced from the RVC project. In compliance with the MIT license under which these files were released, we acknowledge the original authors and sources below:
4
-
5
- ### fish_speech/i18n/core.py
6
-
7
- **Related code from RVC:**
8
- [https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py)
9
-
10
- **Initial commit:**
11
- add localization(添加本地化) [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#35](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/35)
12
-
13
- **Initial author:**
14
- [@L4Ph](https://github.com/L4Ph)
15
-
16
- ### fish_speech/i18n/scan.py
17
-
18
- **Related code from RVC:**
19
- [https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py)
20
-
21
- **Initial commit:**
22
- File for detecting i18n missing keys [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#1058](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/1058)
23
-
24
- **Initial author:**
25
- [@towzeur](https://github.com/towzeur)
26
-
27
- We appreciate the contributions of the RVC project and its authors.
 
1
+ ## i18n Folder Attribution
2
+
3
+ The `i18n` folder within the `fish_speech` directory contains files initially sourced from the RVC project. In compliance with the MIT license under which these files were released, we acknowledge the original authors and sources below:
4
+
5
+ ### fish_speech/i18n/core.py
6
+
7
+ **Related code from RVC:**
8
+ [https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py)
9
+
10
+ **Initial commit:**
11
+ add localization(添加本地化) [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#35](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/35)
12
+
13
+ **Initial author:**
14
+ [@L4Ph](https://github.com/L4Ph)
15
+
16
+ ### fish_speech/i18n/scan.py
17
+
18
+ **Related code from RVC:**
19
+ [https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py)
20
+
21
+ **Initial commit:**
22
+ File for detecting i18n missing keys [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#1058](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/1058)
23
+
24
+ **Initial author:**
25
+ [@towzeur](https://github.com/towzeur)
26
+
27
+ We appreciate the contributions of the RVC project and its authors.
fish_speech/i18n/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
- from .core import i18n
2
-
3
- __all__ = ["i18n"]
 
1
+ from .core import i18n
2
+
3
+ __all__ = ["i18n"]
fish_speech/i18n/core.py CHANGED
@@ -1,40 +1,40 @@
1
- import json
2
- import locale
3
- from pathlib import Path
4
-
5
- I18N_FILE_PATH = Path(__file__).parent / "locale"
6
- DEFAULT_LANGUAGE = "en_US"
7
-
8
-
9
- def load_language_list(language):
10
- with open(I18N_FILE_PATH / f"{language}.json", "r", encoding="utf-8") as f:
11
- language_list = json.load(f)
12
-
13
- return language_list
14
-
15
-
16
- class I18nAuto:
17
- def __init__(self):
18
- i18n_file = Path(".locale")
19
-
20
- if i18n_file.exists():
21
- with open(i18n_file, "r", encoding="utf-8") as f:
22
- language = f.read().strip()
23
- else:
24
- # getlocale can't identify the system's language ((None, None))
25
- language = locale.getdefaultlocale()[0]
26
-
27
- if (I18N_FILE_PATH / f"{language}.json").exists() is False:
28
- language = DEFAULT_LANGUAGE
29
-
30
- self.language = language
31
- self.language_map = load_language_list(language)
32
-
33
- def __call__(self, key):
34
- return self.language_map.get(key, key)
35
-
36
- def __repr__(self):
37
- return "Use Language: " + self.language
38
-
39
-
40
- i18n = I18nAuto()
 
1
+ import json
2
+ import locale
3
+ from pathlib import Path
4
+
5
+ I18N_FILE_PATH = Path(__file__).parent / "locale"
6
+ DEFAULT_LANGUAGE = "en_US"
7
+
8
+
9
+ def load_language_list(language):
10
+ with open(I18N_FILE_PATH / f"{language}.json", "r", encoding="utf-8") as f:
11
+ language_list = json.load(f)
12
+
13
+ return language_list
14
+
15
+
16
+ class I18nAuto:
17
+ def __init__(self):
18
+ i18n_file = Path(".locale")
19
+
20
+ if i18n_file.exists():
21
+ with open(i18n_file, "r", encoding="utf-8") as f:
22
+ language = f.read().strip()
23
+ else:
24
+ # getlocale can't identify the system's language ((None, None))
25
+ language = locale.getdefaultlocale()[0]
26
+
27
+ if (I18N_FILE_PATH / f"{language}.json").exists() is False:
28
+ language = DEFAULT_LANGUAGE
29
+
30
+ self.language = language
31
+ self.language_map = load_language_list(language)
32
+
33
+ def __call__(self, key):
34
+ return self.language_map.get(key, key)
35
+
36
+ def __repr__(self):
37
+ return "Use Language: " + self.language
38
+
39
+
40
+ i18n = I18nAuto()
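
The `I18nAuto` singleton above resolves the UI language from an optional `.locale` file in the working directory, then from the system locale, and finally falls back to `en_US` when no matching JSON exists; lookups return the key itself when a translation is missing. A minimal editorial usage sketch, with the import path taken from the `__init__.py` shown above:

```python
from fish_speech.i18n import i18n  # re-exported from fish_speech/i18n/core.py

# To pin the language, write a language code such as "es_ES" into a ".locale"
# file in the working directory before this import runs.

print(i18n)                     # e.g. "Use Language: en_US"
print(i18n("Generate"))         # translated label from the active locale file
print(i18n("not a known key"))  # unknown keys fall back to the key unchanged
```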
fish_speech/i18n/locale/en_US.json CHANGED
@@ -1,122 +1,123 @@
1
- {
2
- "16-mixed is recommended for 10+ series GPU": "16-mixed is recommended for 10+ series GPU",
3
- "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 to 10 seconds of reference audio, useful for specifying speaker.",
4
- "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).",
5
- "Accumulate Gradient Batches": "Accumulate Gradient Batches",
6
- "Add to Processing Area": "Add to Processing Area",
7
- "Added path successfully!": "Added path successfully!",
8
- "Advanced Config": "Advanced Config",
9
- "Base LLAMA Model": "Base LLAMA Model",
10
- "Batch Inference": "Batch Inference",
11
- "Batch Size": "Batch Size",
12
- "Changing with the Model Path": "Changing with the Model Path",
13
- "Chinese": "Chinese",
14
- "Compile Model": "Compile Model",
15
- "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compile the model can significantly reduce the inference time, but will increase cold start time",
16
- "Copy": "Copy",
17
- "Data Preprocessing": "Data Preprocessing",
18
- "Data Preprocessing Path": "Data Preprocessing Path",
19
- "Data Source": "Data Source",
20
- "Decoder Model Config": "Decoder Model Config",
21
- "Decoder Model Path": "Decoder Model Path",
22
- "Disabled": "Disabled",
23
- "Enable Reference Audio": "Enable Reference Audio",
24
- "English": "English",
25
- "Error Message": "Error Message",
26
- "File Preprocessing": "File Preprocessing",
27
- "Generate": "Generate",
28
- "Generated Audio": "Generated Audio",
29
- "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format",
30
- "Infer interface is closed": "Infer interface is closed",
31
- "Inference Configuration": "Inference Configuration",
32
- "Inference Server Configuration": "Inference Server Configuration",
33
- "Inference Server Error": "Inference Server Error",
34
- "Inferring interface is launched at {}": "Inferring interface is launched at {}",
35
- "Initial Learning Rate": "Initial Learning Rate",
36
- "Input Audio & Source Path for Transcription": "Input Audio & Source Path for Transcription",
37
- "Input Text": "Input Text",
38
- "Invalid path: {}": "Invalid path: {}",
39
- "It is recommended to use CUDA, if you have low configuration, use CPU": "It is recommended to use CUDA, if you have low configuration, use CPU",
40
- "Iterative Prompt Length, 0 means off": "Iterative Prompt Length, 0 means off",
41
- "Japanese": "Japanese",
42
- "LLAMA Configuration": "LLAMA Configuration",
43
- "LLAMA Model Config": "LLAMA Model Config",
44
- "LLAMA Model Path": "LLAMA Model Path",
45
- "Labeling Device": "Labeling Device",
46
- "LoRA Model to be merged": "LoRA Model to be merged",
47
- "Maximum Audio Duration": "Maximum Audio Duration",
48
- "Maximum Length per Sample": "Maximum Length per Sample",
49
- "Maximum Training Steps": "Maximum Training Steps",
50
- "Maximum tokens per batch, 0 means no limit": "Maximum tokens per batch, 0 means no limit",
51
- "Merge": "Merge",
52
- "Merge LoRA": "Merge LoRA",
53
- "Merge successfully": "Merge successfully",
54
- "Minimum Audio Duration": "Minimum Audio Duration",
55
- "Model Output Path": "Model Output Path",
56
- "Model Size": "Model Size",
57
- "Move": "Move",
58
- "Move files successfully": "Move files successfully",
59
- "No audio generated, please check the input text.": "No audio generated, please check the input text.",
60
- "No selected options": "No selected options",
61
- "Number of Workers": "Number of Workers",
62
- "Open Inference Server": "Open Inference Server",
63
- "Open Labeler WebUI": "Open Labeler WebUI",
64
- "Open Tensorboard": "Open Tensorboard",
65
- "Opened labeler in browser": "Opened labeler in browser",
66
- "Optional Label Language": "Optional Label Language",
67
- "Optional online ver": "Optional online ver",
68
- "Output Path": "Output Path",
69
- "Path error, please check the model file exists in the corresponding path": "Path error, please check the model file exists in the corresponding path",
70
- "Precision": "Precision",
71
- "Probability of applying Speaker Condition": "Probability of applying Speaker Condition",
72
- "Put your text here.": "Put your text here.",
73
- "Reference Audio": "Reference Audio",
74
- "Reference Text": "Reference Text",
75
- "Related code and weights are released under CC BY-NC-SA 4.0 License.": "Related code and weights are released under CC BY-NC-SA 4.0 License.",
76
- "Remove Selected Data": "Remove Selected Data",
77
- "Removed path successfully!": "Removed path successfully!",
78
- "Repetition Penalty": "Repetition Penalty",
79
- "Save model every n steps": "Save model every n steps",
80
- "Select LLAMA ckpt": "Select LLAMA ckpt",
81
- "Select VITS ckpt": "Select VITS ckpt",
82
- "Select VQGAN ckpt": "Select VQGAN ckpt",
83
- "Select source file processing method": "Select source file processing method",
84
- "Select the model to be trained (Depending on the Tab page you are on)": "Select the model to be trained (Depending on the Tab page you are on)",
85
- "Selected: {}": "Selected: {}",
86
- "Speaker": "Speaker",
87
- "Speaker is identified by the folder name": "Speaker is identified by the folder name",
88
- "Start Training": "Start Training",
89
- "Streaming Audio": "Streaming Audio",
90
- "Streaming Generate": "Streaming Generate",
91
- "Tensorboard Host": "Tensorboard Host",
92
- "Tensorboard Log Path": "Tensorboard Log Path",
93
- "Tensorboard Port": "Tensorboard Port",
94
- "Tensorboard interface is closed": "Tensorboard interface is closed",
95
- "Tensorboard interface is launched at {}": "Tensorboard interface is launched at {}",
96
- "Text is too long, please keep it under {} characters.": "Text is too long, please keep it under {} characters.",
97
- "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.",
98
- "Training Configuration": "Training Configuration",
99
- "Training Error": "Training Error",
100
- "Training stopped": "Training stopped",
101
- "Type name of the speaker": "Type name of the speaker",
102
- "Type the path or select from the dropdown": "Type the path or select from the dropdown",
103
- "Use LoRA": "Use LoRA",
104
- "Use LoRA can save GPU memory, but may reduce the quality of the model": "Use LoRA can save GPU memory, but may reduce the quality of the model",
105
- "Use filelist": "Use filelist",
106
- "Use large for 10G+ GPU, medium for 5G, small for 2G": "Use large for 10G+ GPU, medium for 5G, small for 2G",
107
- "VITS Configuration": "VITS Configuration",
108
- "VQGAN Configuration": "VQGAN Configuration",
109
- "Validation Batch Size": "Validation Batch Size",
110
- "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "View the status of the preprocessing folder (use the slider to control the depth of the tree)",
111
- "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.",
112
- "WebUI Host": "WebUI Host",
113
- "WebUI Port": "WebUI Port",
114
- "Whisper Model": "Whisper Model",
115
- "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).",
116
- "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU",
117
- "latest": "latest",
118
- "new": "new",
119
- "Realtime Transform Text": "Realtime Transform Text",
120
- "Normalization Result Preview (Currently Only Chinese)": "Normalization Result Preview (Currently Only Chinese)",
121
- "Text Normalization": "Text Normalization"
122
- }
 
1
+ {
2
+ "16-mixed is recommended for 10+ series GPU": "16-mixed is recommended for 10+ series GPU",
3
+ "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 to 10 seconds of reference audio, useful for specifying speaker.",
4
+ "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).",
5
+ "Accumulate Gradient Batches": "Accumulate Gradient Batches",
6
+ "Add to Processing Area": "Add to Processing Area",
7
+ "Added path successfully!": "Added path successfully!",
8
+ "Advanced Config": "Advanced Config",
9
+ "Base LLAMA Model": "Base LLAMA Model",
10
+ "Batch Inference": "Batch Inference",
11
+ "Batch Size": "Batch Size",
12
+ "Changing with the Model Path": "Changing with the Model Path",
13
+ "Chinese": "Chinese",
14
+ "Compile Model": "Compile Model",
15
+ "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compile the model can significantly reduce the inference time, but will increase cold start time",
16
+ "Copy": "Copy",
17
+ "Data Preprocessing": "Data Preprocessing",
18
+ "Data Preprocessing Path": "Data Preprocessing Path",
19
+ "Data Source": "Data Source",
20
+ "Decoder Model Config": "Decoder Model Config",
21
+ "Decoder Model Path": "Decoder Model Path",
22
+ "Disabled": "Disabled",
23
+ "Enable Reference Audio": "Enable Reference Audio",
24
+ "English": "English",
25
+ "Error Message": "Error Message",
26
+ "File Preprocessing": "File Preprocessing",
27
+ "Generate": "Generate",
28
+ "Generated Audio": "Generated Audio",
29
+ "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format",
30
+ "Infer interface is closed": "Infer interface is closed",
31
+ "Inference Configuration": "Inference Configuration",
32
+ "Inference Server Configuration": "Inference Server Configuration",
33
+ "Inference Server Error": "Inference Server Error",
34
+ "Inferring interface is launched at {}": "Inferring interface is launched at {}",
35
+ "Initial Learning Rate": "Initial Learning Rate",
36
+ "Input Audio & Source Path for Transcription": "Input Audio & Source Path for Transcription",
37
+ "Input Text": "Input Text",
38
+ "Invalid path: {}": "Invalid path: {}",
39
+ "It is recommended to use CUDA, if you have low configuration, use CPU": "It is recommended to use CUDA, if you have low configuration, use CPU",
40
+ "Iterative Prompt Length, 0 means off": "Iterative Prompt Length, 0 means off",
41
+ "Japanese": "Japanese",
42
+ "LLAMA Configuration": "LLAMA Configuration",
43
+ "LLAMA Model Config": "LLAMA Model Config",
44
+ "LLAMA Model Path": "LLAMA Model Path",
45
+ "Labeling Device": "Labeling Device",
46
+ "LoRA Model to be merged": "LoRA Model to be merged",
47
+ "Maximum Audio Duration": "Maximum Audio Duration",
48
+ "Maximum Length per Sample": "Maximum Length per Sample",
49
+ "Maximum Training Steps": "Maximum Training Steps",
50
+ "Maximum tokens per batch, 0 means no limit": "Maximum tokens per batch, 0 means no limit",
51
+ "Merge": "Merge",
52
+ "Merge LoRA": "Merge LoRA",
53
+ "Merge successfully": "Merge successfully",
54
+ "Minimum Audio Duration": "Minimum Audio Duration",
55
+ "Model Output Path": "Model Output Path",
56
+ "Model Size": "Model Size",
57
+ "Move": "Move",
58
+ "Move files successfully": "Move files successfully",
59
+ "No audio generated, please check the input text.": "No audio generated, please check the input text.",
60
+ "No selected options": "No selected options",
61
+ "Number of Workers": "Number of Workers",
62
+ "Open Inference Server": "Open Inference Server",
63
+ "Open Labeler WebUI": "Open Labeler WebUI",
64
+ "Open Tensorboard": "Open Tensorboard",
65
+ "Opened labeler in browser": "Opened labeler in browser",
66
+ "Optional Label Language": "Optional Label Language",
67
+ "Optional online ver": "Optional online ver",
68
+ "Output Path": "Output Path",
69
+ "Path error, please check the model file exists in the corresponding path": "Path error, please check the model file exists in the corresponding path",
70
+ "Precision": "Precision",
71
+ "Probability of applying Speaker Condition": "Probability of applying Speaker Condition",
72
+ "Put your text here.": "Put your text here.",
73
+ "Reference Audio": "Reference Audio",
74
+ "Reference Text": "Reference Text",
75
+ "Related code and weights are released under CC BY-NC-SA 4.0 License.": "Related code and weights are released under CC BY-NC-SA 4.0 License.",
76
+ "Remove Selected Data": "Remove Selected Data",
77
+ "Removed path successfully!": "Removed path successfully!",
78
+ "Repetition Penalty": "Repetition Penalty",
79
+ "Save model every n steps": "Save model every n steps",
80
+ "Select LLAMA ckpt": "Select LLAMA ckpt",
81
+ "Select VITS ckpt": "Select VITS ckpt",
82
+ "Select VQGAN ckpt": "Select VQGAN ckpt",
83
+ "Select source file processing method": "Select source file processing method",
84
+ "Select the model to be trained (Depending on the Tab page you are on)": "Select the model to be trained (Depending on the Tab page you are on)",
85
+ "Selected: {}": "Selected: {}",
86
+ "Speaker": "Speaker",
87
+ "Speaker is identified by the folder name": "Speaker is identified by the folder name",
88
+ "Start Training": "Start Training",
89
+ "Streaming Audio": "Streaming Audio",
90
+ "Streaming Generate": "Streaming Generate",
91
+ "Tensorboard Host": "Tensorboard Host",
92
+ "Tensorboard Log Path": "Tensorboard Log Path",
93
+ "Tensorboard Port": "Tensorboard Port",
94
+ "Tensorboard interface is closed": "Tensorboard interface is closed",
95
+ "Tensorboard interface is launched at {}": "Tensorboard interface is launched at {}",
96
+ "Text is too long, please keep it under {} characters.": "Text is too long, please keep it under {} characters.",
97
+ "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.",
98
+ "Training Configuration": "Training Configuration",
99
+ "Training Error": "Training Error",
100
+ "Training stopped": "Training stopped",
101
+ "Type name of the speaker": "Type name of the speaker",
102
+ "Type the path or select from the dropdown": "Type the path or select from the dropdown",
103
+ "Use LoRA": "Use LoRA",
104
+ "Use LoRA can save GPU memory, but may reduce the quality of the model": "Use LoRA can save GPU memory, but may reduce the quality of the model",
105
+ "Use filelist": "Use filelist",
106
+ "Use large for 10G+ GPU, medium for 5G, small for 2G": "Use large for 10G+ GPU, medium for 5G, small for 2G",
107
+ "VITS Configuration": "VITS Configuration",
108
+ "VQGAN Configuration": "VQGAN Configuration",
109
+ "Validation Batch Size": "Validation Batch Size",
110
+ "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "View the status of the preprocessing folder (use the slider to control the depth of the tree)",
111
+ "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.",
112
+ "WebUI Host": "WebUI Host",
113
+ "WebUI Port": "WebUI Port",
114
+ "Whisper Model": "Whisper Model",
115
+ "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).",
116
+ "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU",
117
+ "latest": "latest",
118
+ "new": "new",
119
+ "Realtime Transform Text": "Realtime Transform Text",
120
+ "Normalization Result Preview (Currently Only Chinese)": "Normalization Result Preview (Currently Only Chinese)",
121
+ "Text Normalization": "Text Normalization",
122
+ "Select Example Audio": "Select Example Audio"
123
+ }
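
The updated `en_US.json` above gains a new `"Select Example Audio"` key, which is exactly the kind of drift across locale files that the `scan.py` helper mentioned in the i18n README is meant to catch. In the same spirit, a throwaway editorial check that compares every locale file against the English reference might look like this:

```python
import json
from pathlib import Path

locale_dir = Path("fish_speech/i18n/locale")
reference = json.loads((locale_dir / "en_US.json").read_text(encoding="utf-8"))

for path in sorted(locale_dir.glob("*.json")):
    keys = set(json.loads(path.read_text(encoding="utf-8")))
    missing = set(reference) - keys
    extra = keys - set(reference)
    print(f"{path.name}: {len(missing)} missing, {len(extra)} extra")
```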
fish_speech/i18n/locale/es_ES.json CHANGED
@@ -1,122 +1,123 @@
1
- {
2
- "16-mixed is recommended for 10+ series GPU": "se recomienda 16-mixed para GPU de la serie 10+",
3
- "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 a 10 segundos de audio de referencia, útil para especificar el hablante.",
4
- "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "Un modelo de texto a voz basado en VQ-GAN y Llama desarrollado por [Fish Audio](https://fish.audio).",
5
- "Accumulate Gradient Batches": "Acumular lotes de gradientes",
6
- "Add to Processing Area": "Agregar al Área de Procesamiento",
7
- "Added path successfully!": "¡Ruta agregada exitosamente!",
8
- "Advanced Config": "Configuración Avanzada",
9
- "Base LLAMA Model": "Modelo Base LLAMA",
10
- "Batch Inference": "Inferencia por Lote",
11
- "Batch Size": "Tamaño del Lote",
12
- "Changing with the Model Path": "Cambiando con la Ruta del Modelo",
13
- "Chinese": "Chino",
14
- "Compile Model": "Compilar Modelo",
15
- "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compilar el modelo puede reducir significativamente el tiempo de inferencia, pero aumentará el tiempo de inicio en frío",
16
- "Copy": "Copiar",
17
- "Data Preprocessing": "Preprocesamiento de Datos",
18
- "Data Preprocessing Path": "Ruta de Preprocesamiento de Datos",
19
- "Data Source": "Fuente de Datos",
20
- "Decoder Model Config": "Configuración del modelo decodificador",
21
- "Decoder Model Path": "Ruta del modelo decodificador",
22
- "Disabled": "Desactivado",
23
- "Enable Reference Audio": "Habilitar Audio de Referencia",
24
- "English": "Inglés",
25
- "Error Message": "Mensaje de Error",
26
- "File Preprocessing": "Preprocesamiento de Archivos",
27
- "Generate": "Generar",
28
- "Generated Audio": "Audio Generado",
29
- "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "Si no hay texto correspondiente para el audio, aplique ASR para asistencia, soporte para formato .txt o .lab",
30
- "Infer interface is closed": "La interfaz de inferencia está cerrada",
31
- "Inference Configuration": "Configuración de Inferencia",
32
- "Inference Server Configuration": "Configuración del Servidor de Inferencia",
33
- "Inference Server Error": "Error del Servidor de Inferencia",
34
- "Inferring interface is launched at {}": "La interfaz de inferencia se ha lanzado en {}",
35
- "Initial Learning Rate": "Tasa de Aprendizaje Inicial",
36
- "Input Audio & Source Path for Transcription": "Audio de Entrada y Ruta de Origen para Transcripción",
37
- "Input Text": "Texto de Entrada",
38
- "Invalid path: {}": "Ruta inválida: {}",
39
- "It is recommended to use CUDA, if you have low configuration, use CPU": "Se recomienda usar CUDA, si tiene una configuración baja, use CPU",
40
- "Iterative Prompt Length, 0 means off": "Longitud de la Indicación Iterativa, 0 significa apagado",
41
- "Japanese": "Japonés",
42
- "LLAMA Configuration": "Configuración de LLAMA",
43
- "LLAMA Model Config": "Configuración del Modelo LLAMA",
44
- "LLAMA Model Path": "Ruta del Modelo LLAMA",
45
- "Labeling Device": "Dispositivo de Etiquetado",
46
- "LoRA Model to be merged": "Modelo LoRA a fusionar",
47
- "Maximum Audio Duration": "Duración máxima de audio",
48
- "Maximum Length per Sample": "Longitud Máxima por Muestra",
49
- "Maximum Training Steps": "Pasos Máximos de Entrenamiento",
50
- "Maximum tokens per batch, 0 means no limit": "Máximo de tokens por lote, 0 significa sin límite",
51
- "Merge": "Fusionar",
52
- "Merge LoRA": "Fusionar LoRA",
53
- "Merge successfully": "Fusionado exitosamente",
54
- "Minimum Audio Duration": "Duración mínima de audio",
55
- "Model Output Path": "Ruta de Salida del Modelo",
56
- "Model Size": "Tamaño del Modelo",
57
- "Move": "Mover",
58
- "Move files successfully": "Archivos movidos exitosamente",
59
- "No audio generated, please check the input text.": "No se generó audio, por favor verifique el texto de entrada.",
60
- "No selected options": "No hay opciones seleccionadas",
61
- "Number of Workers": "Número de Trabajadores",
62
- "Open Inference Server": "Abrir Servidor de Inferencia",
63
- "Open Labeler WebUI": "Abrir Interfaz Web del Etiquetador",
64
- "Open Tensorboard": "Abrir Tensorboard",
65
- "Opened labeler in browser": "Se abrió el etiquetador en el navegador",
66
- "Optional Label Language": "Idioma de Etiquetado Opcional",
67
- "Optional online ver": "Ver en línea opcional",
68
- "Output Path": "Ruta de Salida",
69
- "Path error, please check the model file exists in the corresponding path": "Error de ruta, por favor verifique que el archivo del modelo exista en la ruta correspondiente",
70
- "Precision": "Precisión",
71
- "Probability of applying Speaker Condition": "Probabilidad de aplicar Condición de Hablante",
72
- "Put your text here.": "Ponga su texto aquí.",
73
- "Reference Audio": "Audio de Referencia",
74
- "Reference Text": "Texto de Referencia",
75
- "Related code and weights are released under CC BY-NC-SA 4.0 License.": "El código relacionado y los pesos se publican bajo la Licencia CC BY-NC-SA 4.0.",
76
- "Remove Selected Data": "Eliminar Datos Seleccionados",
77
- "Removed path successfully!": "¡Ruta eliminada exitosamente!",
78
- "Repetition Penalty": "Penalización por Repetición",
79
- "Save model every n steps": "Guardar modelo cada n pasos",
80
- "Select LLAMA ckpt": "Seleccionar punto de control LLAMA",
81
- "Select VITS ckpt": "Seleccionar punto de control VITS",
82
- "Select VQGAN ckpt": "Seleccionar punto de control VQGAN",
83
- "Select source file processing method": "Seleccione el método de procesamiento de archivos fuente",
84
- "Select the model to be trained (Depending on the Tab page you are on)": "Seleccione el modelo a entrenar (Dependiendo de la pestaña en la que se encuentre)",
85
- "Selected: {}": "Seleccionado: {}",
86
- "Speaker": "Hablante",
87
- "Speaker is identified by the folder name": "El hablante se identifica por el nombre de la carpeta",
88
- "Start Training": "Iniciar Entrenamiento",
89
- "Streaming Audio": "transmisión de audio",
90
- "Streaming Generate": "síntesis en flujo",
91
- "Tensorboard Host": "Host de Tensorboard",
92
- "Tensorboard Log Path": "Ruta de Registro de Tensorboard",
93
- "Tensorboard Port": "Puerto de Tensorboard",
94
- "Tensorboard interface is closed": "La interfaz de Tensorboard está cerrada",
95
- "Tensorboard interface is launched at {}": "La interfaz de Tensorboard se ha lanzado en {}",
96
- "Text is too long, please keep it under {} characters.": "El texto es demasiado largo, por favor manténgalo por debajo de {} caracteres.",
97
- "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "La ruta de la carpeta de entrada a la izquierda o la lista de archivos. Ya sea que esté marcado o no, se utilizará para el entrenamiento posterior en esta lista.",
98
- "Training Configuration": "Configuración de Entrenamiento",
99
- "Training Error": "Error de Entrenamiento",
100
- "Training stopped": "Entrenamiento detenido",
101
- "Type name of the speaker": "Escriba el nombre del hablante",
102
- "Type the path or select from the dropdown": "Escriba la ruta o seleccione de la lista desplegable",
103
- "Use LoRA": "Usar LoRA",
104
- "Use LoRA can save GPU memory, but may reduce the quality of the model": "Usar LoRA puede ahorrar memoria GPU, pero puede reducir la calidad del modelo",
105
- "Use filelist": "Usar lista de archivos",
106
- "Use large for 10G+ GPU, medium for 5G, small for 2G": "Use grande para GPU de 10G+, mediano para 5G, pequeño para 2G",
107
- "VITS Configuration": "Configuración de VITS",
108
- "VQGAN Configuration": "Configuración de VQGAN",
109
- "Validation Batch Size": "Tamaño del Lote de Validación",
110
- "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "Vea el estado de la carpeta de preprocesamiento (use el control deslizante para controlar la profundidad del árbol)",
111
- "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "No somos responsables de ningún mal uso del modelo, por favor considere sus leyes y regulaciones locales antes de usarlo.",
112
- "WebUI Host": "Host de WebUI",
113
- "WebUI Port": "Puerto de WebUI",
114
- "Whisper Model": "Modelo Whisper",
115
- "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "Puede encontrar el código fuente [aquí](https://github.com/fishaudio/fish-speech) y los modelos [aquí](https://huggingface.co/fishaudio/fish-speech-1).",
116
- "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "Se recomienda bf16-true para GPU de la serie 30+, se recomienda 16-mixed para GPU de la serie 10+",
117
- "latest": "más reciente",
118
- "new": "nuevo",
119
- "Realtime Transform Text": "Transformación de Texto en Tiempo Real",
120
- "Normalization Result Preview (Currently Only Chinese)": "Vista Previa del Resultado de Normalización (Actualmente Solo Chino)",
121
- "Text Normalization": "Normalización de Texto"
122
- }
 
 
1
+ {
2
+ "16-mixed is recommended for 10+ series GPU": "se recomienda 16-mixed para GPU de la serie 10+",
3
+ "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 a 10 segundos de audio de referencia, útil para especificar el hablante.",
4
+ "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "Un modelo de texto a voz basado en VQ-GAN y Llama desarrollado por [Fish Audio](https://fish.audio).",
5
+ "Accumulate Gradient Batches": "Acumular lotes de gradientes",
6
+ "Add to Processing Area": "Agregar al Área de Procesamiento",
7
+ "Added path successfully!": "¡Ruta agregada exitosamente!",
8
+ "Advanced Config": "Configuración Avanzada",
9
+ "Base LLAMA Model": "Modelo Base LLAMA",
10
+ "Batch Inference": "Inferencia por Lote",
11
+ "Batch Size": "Tamaño del Lote",
12
+ "Changing with the Model Path": "Cambiando con la Ruta del Modelo",
13
+ "Chinese": "Chino",
14
+ "Compile Model": "Compilar Modelo",
15
+ "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compilar el modelo puede reducir significativamente el tiempo de inferencia, pero aumentará el tiempo de inicio en frío",
16
+ "Copy": "Copiar",
17
+ "Data Preprocessing": "Preprocesamiento de Datos",
18
+ "Data Preprocessing Path": "Ruta de Preprocesamiento de Datos",
19
+ "Data Source": "Fuente de Datos",
20
+ "Decoder Model Config": "Configuración del modelo decodificador",
21
+ "Decoder Model Path": "Ruta del modelo decodificador",
22
+ "Disabled": "Desactivado",
23
+ "Enable Reference Audio": "Habilitar Audio de Referencia",
24
+ "English": "Inglés",
25
+ "Error Message": "Mensaje de Error",
26
+ "File Preprocessing": "Preprocesamiento de Archivos",
27
+ "Generate": "Generar",
28
+ "Generated Audio": "Audio Generado",
29
+ "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "Si no hay texto correspondiente para el audio, aplique ASR para asistencia, soporte para formato .txt o .lab",
30
+ "Infer interface is closed": "La interfaz de inferencia está cerrada",
31
+ "Inference Configuration": "Configuración de Inferencia",
32
+ "Inference Server Configuration": "Configuración del Servidor de Inferencia",
33
+ "Inference Server Error": "Error del Servidor de Inferencia",
34
+ "Inferring interface is launched at {}": "La interfaz de inferencia se ha lanzado en {}",
35
+ "Initial Learning Rate": "Tasa de Aprendizaje Inicial",
36
+ "Input Audio & Source Path for Transcription": "Audio de Entrada y Ruta de Origen para Transcripción",
37
+ "Input Text": "Texto de Entrada",
38
+ "Invalid path: {}": "Ruta inválida: {}",
39
+ "It is recommended to use CUDA, if you have low configuration, use CPU": "Se recomienda usar CUDA, si tiene una configuración baja, use CPU",
40
+ "Iterative Prompt Length, 0 means off": "Longitud de la Indicación Iterativa, 0 significa apagado",
41
+ "Japanese": "Japonés",
42
+ "LLAMA Configuration": "Configuración de LLAMA",
43
+ "LLAMA Model Config": "Configuración del Modelo LLAMA",
44
+ "LLAMA Model Path": "Ruta del Modelo LLAMA",
45
+ "Labeling Device": "Dispositivo de Etiquetado",
46
+ "LoRA Model to be merged": "Modelo LoRA a fusionar",
47
+ "Maximum Audio Duration": "Duración máxima de audio",
48
+ "Maximum Length per Sample": "Longitud Máxima por Muestra",
49
+ "Maximum Training Steps": "Pasos Máximos de Entrenamiento",
50
+ "Maximum tokens per batch, 0 means no limit": "Máximo de tokens por lote, 0 significa sin límite",
51
+ "Merge": "Fusionar",
52
+ "Merge LoRA": "Fusionar LoRA",
53
+ "Merge successfully": "Fusionado exitosamente",
54
+ "Minimum Audio Duration": "Duración mínima de audio",
55
+ "Model Output Path": "Ruta de Salida del Modelo",
56
+ "Model Size": "Tamaño del Modelo",
57
+ "Move": "Mover",
58
+ "Move files successfully": "Archivos movidos exitosamente",
59
+ "No audio generated, please check the input text.": "No se generó audio, por favor verifique el texto de entrada.",
60
+ "No selected options": "No hay opciones seleccionadas",
61
+ "Number of Workers": "Número de Trabajadores",
62
+ "Open Inference Server": "Abrir Servidor de Inferencia",
63
+ "Open Labeler WebUI": "Abrir Interfaz Web del Etiquetador",
64
+ "Open Tensorboard": "Abrir Tensorboard",
65
+ "Opened labeler in browser": "Se abrió el etiquetador en el navegador",
66
+ "Optional Label Language": "Idioma de Etiquetado Opcional",
67
+ "Optional online ver": "Ver en línea opcional",
68
+ "Output Path": "Ruta de Salida",
69
+ "Path error, please check the model file exists in the corresponding path": "Error de ruta, por favor verifique que el archivo del modelo exista en la ruta correspondiente",
70
+ "Precision": "Precisión",
71
+ "Probability of applying Speaker Condition": "Probabilidad de aplicar Condición de Hablante",
72
+ "Put your text here.": "Ponga su texto aquí.",
73
+ "Reference Audio": "Audio de Referencia",
74
+ "Reference Text": "Texto de Referencia",
75
+ "Related code and weights are released under CC BY-NC-SA 4.0 License.": "El código relacionado y los pesos se publican bajo la Licencia CC BY-NC-SA 4.0.",
76
+ "Remove Selected Data": "Eliminar Datos Seleccionados",
77
+ "Removed path successfully!": "¡Ruta eliminada exitosamente!",
78
+ "Repetition Penalty": "Penalización por Repetición",
79
+ "Save model every n steps": "Guardar modelo cada n pasos",
80
+ "Select LLAMA ckpt": "Seleccionar punto de control LLAMA",
81
+ "Select VITS ckpt": "Seleccionar punto de control VITS",
82
+ "Select VQGAN ckpt": "Seleccionar punto de control VQGAN",
83
+ "Select source file processing method": "Seleccione el método de procesamiento de archivos fuente",
84
+ "Select the model to be trained (Depending on the Tab page you are on)": "Seleccione el modelo a entrenar (Dependiendo de la pestaña en la que se encuentre)",
85
+ "Selected: {}": "Seleccionado: {}",
86
+ "Speaker": "Hablante",
87
+ "Speaker is identified by the folder name": "El hablante se identifica por el nombre de la carpeta",
88
+ "Start Training": "Iniciar Entrenamiento",
89
+ "Streaming Audio": "transmisión de audio",
90
+ "Streaming Generate": "síntesis en flujo",
91
+ "Tensorboard Host": "Host de Tensorboard",
92
+ "Tensorboard Log Path": "Ruta de Registro de Tensorboard",
93
+ "Tensorboard Port": "Puerto de Tensorboard",
94
+ "Tensorboard interface is closed": "La interfaz de Tensorboard está cerrada",
95
+ "Tensorboard interface is launched at {}": "La interfaz de Tensorboard se ha lanzado en {}",
96
+ "Text is too long, please keep it under {} characters.": "El texto es demasiado largo, por favor manténgalo por debajo de {} caracteres.",
97
+ "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "La ruta de la carpeta de entrada a la izquierda o la lista de archivos. Ya sea que esté marcado o no, se utilizará para el entrenamiento posterior en esta lista.",
98
+ "Training Configuration": "Configuración de Entrenamiento",
99
+ "Training Error": "Error de Entrenamiento",
100
+ "Training stopped": "Entrenamiento detenido",
101
+ "Type name of the speaker": "Escriba el nombre del hablante",
102
+ "Type the path or select from the dropdown": "Escriba la ruta o seleccione de la lista desplegable",
103
+ "Use LoRA": "Usar LoRA",
104
+ "Use LoRA can save GPU memory, but may reduce the quality of the model": "Usar LoRA puede ahorrar memoria GPU, pero puede reducir la calidad del modelo",
105
+ "Use filelist": "Usar lista de archivos",
106
+ "Use large for 10G+ GPU, medium for 5G, small for 2G": "Use grande para GPU de 10G+, mediano para 5G, pequeño para 2G",
107
+ "VITS Configuration": "Configuración de VITS",
108
+ "VQGAN Configuration": "Configuración de VQGAN",
109
+ "Validation Batch Size": "Tamaño del Lote de Validación",
110
+ "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "Vea el estado de la carpeta de preprocesamiento (use el control deslizante para controlar la profundidad del árbol)",
111
+ "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "No somos responsables de ningún mal uso del modelo, por favor considere sus leyes y regulaciones locales antes de usarlo.",
112
+ "WebUI Host": "Host de WebUI",
113
+ "WebUI Port": "Puerto de WebUI",
114
+ "Whisper Model": "Modelo Whisper",
115
+ "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "Puede encontrar el código fuente [aquí](https://github.com/fishaudio/fish-speech) y los modelos [aquí](https://huggingface.co/fishaudio/fish-speech-1).",
116
+ "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "Se recomienda bf16-true para GPU de la serie 30+, se recomienda 16-mixed para GPU de la serie 10+",
117
+ "latest": "más reciente",
118
+ "new": "nuevo",
119
+ "Realtime Transform Text": "Transformación de Texto en Tiempo Real",
120
+ "Normalization Result Preview (Currently Only Chinese)": "Vista Previa del Resultado de Normalización (Actualmente Solo Chino)",
121
+ "Text Normalization": "Normalización de Texto",
122
+ "Select Example Audio": "Selecionar áudio de exemplo"
123
+ }
fish_speech/i18n/locale/ja_JP.json CHANGED
@@ -1,123 +1,123 @@
1
- {
2
- "16-mixed is recommended for 10+ series GPU": "10シリーズ以降のGPUには16-mixedをお勧めします",
3
- "5 to 10 seconds of reference audio, useful for specifying speaker.": "話者を指定するのに役立つ、5~10秒のリファレンスオーディオ。",
4
- "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "[Fish Audio](https://fish.audio)が開発したVQ-GANとLlamaに基づくテキスト音声合成モデル。",
5
- "Accumulate Gradient Batches": "勾配バッチの累積",
6
- "Add to Processing Area": "処理エリアに追加",
7
- "Added path successfully!": "パスの追加に成功しました!",
8
- "Advanced Config": "詳細設定",
9
- "Base LLAMA Model": "基本LLAMAモデル",
10
- "Batch Inference": "バッチ推論",
11
- "Batch Size": "バッチサイズ",
12
- "Changing with the Model Path": "モデルのパスに伴って変化する",
13
- "Chinese": "中国語",
14
- "Compile Model": "モデルのコンパイル",
15
- "Compile the model can significantly reduce the inference time, but will increase cold start time": "モデルをコンパイルすると推論時間を大幅に短縮できますが、コールドスタート時間が長くなります",
16
- "Copy": "コピー",
17
- "Data Preprocessing": "データ前処理",
18
- "Data Preprocessing Path": "データ前処理パス",
19
- "Data Source": "データソース",
20
- "Decoder Model Config": "デコーダーモデルの構成",
21
- "Decoder Model Path": "デコーダーモデルのパス",
22
- "Disabled": "無効",
23
- "Enable Reference Audio": "リファレンスオーディオを有効にする",
24
- "English": "英語",
25
- "Error Message": "エラーメッセージ",
26
- "File Preprocessing": "文書前处理",
27
- "Generate": "生成",
28
- "Generated Audio": "生成されたオーディオ",
29
- "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "音声に対応するテキストがない場合は、ASRを適用してサポートします。.txtまたは.lab形式をサポートしています",
30
- "Infer interface is closed": "推論インターフェースが閉じられています",
31
- "Inference Configuration": "推論設定",
32
- "Inference Server Configuration": "推論サーバー設定",
33
- "Inference Server Error": "推論サーバーエラー",
34
- "Inferring interface is launched at {}": "推論インターフェースが{}で起動しました",
35
- "Initial Learning Rate": "初期学習率",
36
- "Input Audio & Source Path for Transcription": "入力オーディオと文字起こしのソースパス",
37
- "Input Text": "入力テキスト",
38
- "Invalid path: {}": "無効なパス: {}",
39
- "It is recommended to use CUDA, if you have low configuration, use CPU": "CUDAの使用をお勧めします。低い構成の場合はCPUを使用してください",
40
- "Iterative Prompt Length, 0 means off": "反復プロンプト長。0はオフを意味します",
41
- "Japanese": "日本語",
42
- "LLAMA Configuration": "LLAMA設定",
43
- "LLAMA Model Config": "LLAMAモデル設定",
44
- "LLAMA Model Path": "LLAMAモデルパス",
45
- "Labeling Device": "ラベリングデバイス",
46
- "LoRA Model to be merged": "マージするLoRAモデル",
47
- "Maximum Audio Duration": "最大オーディオの長さ",
48
- "Maximum Length per Sample": "サンプルあたりの最大長",
49
- "Maximum Training Steps": "最大トレーニングステップ数",
50
- "Maximum tokens per batch, 0 means no limit": "バッチあたりの最大トークン数。0は制限なしを意味します",
51
- "Merge": "マージ",
52
- "Merge LoRA": "LoRAのマージ",
53
- "Merge successfully": "マージに成功しました",
54
- "Minimum Audio Duration": "最小オーディオの長さ",
55
- "Model Output Path": "モデル出力パス",
56
- "Model Size": "モデルサイズ",
57
- "Move": "移動",
58
- "Move files successfully": "ファイルの移動に成功しました",
59
- "No audio generated, please check the input text.": "オーディオが生成されていません。入力テキストを確認してください。",
60
- "No selected options": "選択されたオプションはありません",
61
- "Number of Workers": "ワーカー数",
62
- "Open Inference Server": "推論サーバーを開く",
63
- "Open Labeler WebUI": "ラベラーWebUIを開く",
64
- "Open Tensorboard": "Tensorboardを開く",
65
- "Opened labeler in browser": "ブラウザでラベラーを開きました",
66
- "Optional Label Language": "オプションのラベル言語",
67
- "Optional online ver": "オプションのオンラインバージョン",
68
- "Output Path": "出力パス",
69
- "Path error, please check the model file exists in the corresponding path": "パスエラー。対応するパスにモデルファイルが存在するか確認してください",
70
- "Precision": "精度",
71
- "Probability of applying Speaker Condition": "話者条件を適用する確率",
72
- "Put your text here.": "ここにテキストを入力してください。",
73
- "Reference Audio": "リファレン���オーディオ",
74
- "Reference Text": "リファレンステキスト",
75
- "Related code and weights are released under CC BY-NC-SA 4.0 License.": "関連コードと重みはCC BY-NC-SA 4.0ライセンスの下でリリースされます。",
76
- "Remove Selected Data": "選択したデータを削除",
77
- "Removed path successfully!": "パスの削除に成功しました!",
78
- "Repetition Penalty": "反復ペナルティ",
79
- "Save model every n steps": "nステップごとにモデルを保存",
80
- "Select LLAMA ckpt": " LLAMA チェックポイントを選択",
81
- "Select VITS ckpt": "VITS チェックポイントを選択",
82
- "Select VQGAN ckpt": "VQGAN チェックポイントを選択",
83
- "Select source file processing method": "ソースファイルの処理方法を選択",
84
- "Select the model to be trained (Depending on the Tab page you are on)": "タブページに応じてトレーニングするモデルを選択してください",
85
- "Selected: {}": "選択済み: {}",
86
- "Speaker": "話者",
87
- "Speaker is identified by the folder name": "話者はフォルダ名で識別されます",
88
- "Start Training": "トレーニング開始",
89
- "Streaming Audio": "ストリーミングオーディオ",
90
- "Streaming Generate": "ストリーミング合成",
91
- "Tensorboard Host": "Tensorboardホスト",
92
- "Tensorboard Log Path": "Tensorboardログパス",
93
- "Tensorboard Port": "Tensorboardポート",
94
- "Tensorboard interface is closed": "Tensorboardインターフェースが閉じられています",
95
- "Tensorboard interface is launched at {}": "Tensorboardインターフェースが{}で起動されました",
96
- "Text is too long, please keep it under {} characters.": "テキストが長すぎます。{}文字以内に抑えてください。",
97
- "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "左側の入力フォルダまたはファイルリストのパス。チェックの有無にかかわらず、このリストの後続のトレーニングに使用されます。",
98
- "Training Configuration": "トレーニング設定",
99
- "Training Error": "トレーニングエラー",
100
- "Training stopped": "トレーニングが停止しました",
101
- "Type name of the speaker": "話者の名前を入力",
102
- "Type the path or select from the dropdown": "パスを入力するか、ドロップダウンから選択してください",
103
- "Use LoRA": "LoRAを使用",
104
- "Use LoRA can save GPU memory, but may reduce the quality of the model": "LoRAを使用するとGPUメモリを節約できますが、モデルの品質が低下する可能性があります",
105
- "Use filelist": "ファイルリストを使用",
106
- "Use large for 10G+ GPU, medium for 5G, small for 2G": "10G以上のGPUには大、5Gには中、2Gには小を使用してください",
107
- "VITS Configuration": "VITS の構成",
108
- "VQGAN Configuration": "VQGAN の構成",
109
- "Validation Batch Size": "検証バッチサイズ",
110
- "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "前処理フォルダの状態を表示(スライダーを使用してツリーの深さを制御)",
111
- "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "モデルの誤用については一切責任を負いません。使用する前に、現地の法律と規制を考慮してください。",
112
- "WebUI Host": "WebUIホスト",
113
- "WebUI Port": "WebUIポート",
114
- "Whisper Model": "Whisperモデル",
115
- "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "ソースコードは[こちら](https://github.com/fishaudio/fish-speech)、モデルは[こちら](https://huggingface.co/fishaudio/fish-speech-1)にあります。",
116
- "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30シリーズ以降のGPUにはbf16-trueを、10シリーズ以降のGPUには16-mixedをお勧めします",
117
- "latest": "最新",
118
- "new": "新規",
119
- "Realtime Transform Text": "リアルタイム変換テキスト",
120
- "Normalization Result Preview (Currently Only Chinese)": "正規化結果プレビュー(現在は中国語のみ)",
121
- "Text Normalization": "テキスト正規化"
122
-
123
- }
 
1
+ {
2
+ "16-mixed is recommended for 10+ series GPU": "10シリーズ以降のGPUには16-mixedをお勧めします",
3
+ "5 to 10 seconds of reference audio, useful for specifying speaker.": "話者を指定するのに役立つ、5~10秒のリファレンスオーディオ。",
4
+ "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "[Fish Audio](https://fish.audio)が開発したVQ-GANとLlamaに基づくテキスト音声合成モデル。",
5
+ "Accumulate Gradient Batches": "勾配バッチの累積",
6
+ "Add to Processing Area": "処理エリアに追加",
7
+ "Added path successfully!": "パスの追加に成功しました!",
8
+ "Advanced Config": "詳細設定",
9
+ "Base LLAMA Model": "基本LLAMAモデル",
10
+ "Batch Inference": "バッチ推論",
11
+ "Batch Size": "バッチサイズ",
12
+ "Changing with the Model Path": "モデルのパスに伴って変化する",
13
+ "Chinese": "中国語",
14
+ "Compile Model": "モデルのコンパイル",
15
+ "Compile the model can significantly reduce the inference time, but will increase cold start time": "モデルをコンパイルすると推論時間を大幅に短縮できますが、コールドスタート時間が長くなります",
16
+ "Copy": "コピー",
17
+ "Data Preprocessing": "データ前処理",
18
+ "Data Preprocessing Path": "データ前処理パス",
19
+ "Data Source": "データソース",
20
+ "Decoder Model Config": "デコーダーモデルの構成",
21
+ "Decoder Model Path": "デコーダーモデルのパス",
22
+ "Disabled": "無効",
23
+ "Enable Reference Audio": "リファレンスオーディオを有効にする",
24
+ "English": "英語",
25
+ "Error Message": "エラーメッセージ",
26
+ "File Preprocessing": "文書前处理",
27
+ "Generate": "生成",
28
+ "Generated Audio": "生成されたオーディオ",
29
+ "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "音声に対応するテキストがない場合は、ASRを適用してサポートします。.txtまたは.lab形式をサポートしています",
30
+ "Infer interface is closed": "推論インターフェースが閉じられています",
31
+ "Inference Configuration": "推論設定",
32
+ "Inference Server Configuration": "推論サーバー設定",
33
+ "Inference Server Error": "推論サーバーエラー",
34
+ "Inferring interface is launched at {}": "推論インターフェースが{}で起動しました",
35
+ "Initial Learning Rate": "初期学習率",
36
+ "Input Audio & Source Path for Transcription": "入力オーディオと文字起こしのソースパス",
37
+ "Input Text": "入力テキスト",
38
+ "Invalid path: {}": "無効なパス: {}",
39
+ "It is recommended to use CUDA, if you have low configuration, use CPU": "CUDAの使用をお勧めします。低い構成の場合はCPUを使用してください",
40
+ "Iterative Prompt Length, 0 means off": "反復プロンプト長。0はオフを意味します",
41
+ "Japanese": "日本語",
42
+ "LLAMA Configuration": "LLAMA設定",
43
+ "LLAMA Model Config": "LLAMAモデル設定",
44
+ "LLAMA Model Path": "LLAMAモデルパス",
45
+ "Labeling Device": "ラベリングデバイス",
46
+ "LoRA Model to be merged": "マージするLoRAモデル",
47
+ "Maximum Audio Duration": "最大オーディオの長さ",
48
+ "Maximum Length per Sample": "サンプルあたりの最大長",
49
+ "Maximum Training Steps": "最大トレーニングステップ数",
50
+ "Maximum tokens per batch, 0 means no limit": "バッチあたりの最大トークン数。0は制限なしを意味します",
51
+ "Merge": "マージ",
52
+ "Merge LoRA": "LoRAのマージ",
53
+ "Merge successfully": "マージに成功しました",
54
+ "Minimum Audio Duration": "最小オーディオの長さ",
55
+ "Model Output Path": "モデル出力パス",
56
+ "Model Size": "モデルサイズ",
57
+ "Move": "移動",
58
+ "Move files successfully": "ファイルの移動に成功しました",
59
+ "No audio generated, please check the input text.": "オーディオが生成されていません。入力テキストを確認してください。",
60
+ "No selected options": "選択されたオプションはありません",
61
+ "Number of Workers": "ワーカー数",
62
+ "Open Inference Server": "推論サーバーを開く",
63
+ "Open Labeler WebUI": "ラベラーWebUIを開く",
64
+ "Open Tensorboard": "Tensorboardを開く",
65
+ "Opened labeler in browser": "ブラウザでラベラーを開きました",
66
+ "Optional Label Language": "オプションのラベル言語",
67
+ "Optional online ver": "オプションのオンラインバージョン",
68
+ "Output Path": "出力パス",
69
+ "Path error, please check the model file exists in the corresponding path": "パスエラー。対応するパスにモデルファイルが存在するか確認してください",
70
+ "Precision": "精度",
71
+ "Probability of applying Speaker Condition": "話者条件を適用する確率",
72
+ "Put your text here.": "ここにテキストを入力してください。",
73
+ "Reference Audio": "リファレンスオーディオ",
74
+ "Reference Text": "リファレンステキスト",
75
+ "Related code and weights are released under CC BY-NC-SA 4.0 License.": "関連コードと重みはCC BY-NC-SA 4.0ライセンスの下でリリースされます。",
76
+ "Remove Selected Data": "選択したデータを削除",
77
+ "Removed path successfully!": "パスの削除に成功しました!",
78
+ "Repetition Penalty": "反復ペナルティ",
79
+ "Save model every n steps": "nステップごとにモデルを保存",
80
+ "Select LLAMA ckpt": " LLAMA チェックポイントを選択",
81
+ "Select VITS ckpt": "VITS チェックポイントを選択",
82
+ "Select VQGAN ckpt": "VQGAN チェックポイントを選択",
83
+ "Select source file processing method": "ソースファイルの処理方法を選択",
84
+ "Select the model to be trained (Depending on the Tab page you are on)": "タブページに応じてトレーニングするモデルを選択してください",
85
+ "Selected: {}": "選択済み: {}",
86
+ "Speaker": "話者",
87
+ "Speaker is identified by the folder name": "話者はフォルダ名で識別されます",
88
+ "Start Training": "トレーニング開始",
89
+ "Streaming Audio": "ストリーミングオーディオ",
90
+ "Streaming Generate": "ストリーミング合成",
91
+ "Tensorboard Host": "Tensorboardホスト",
92
+ "Tensorboard Log Path": "Tensorboardログパス",
93
+ "Tensorboard Port": "Tensorboardポート",
94
+ "Tensorboard interface is closed": "Tensorboardインターフェースが閉じられています",
95
+ "Tensorboard interface is launched at {}": "Tensorboardインターフェースが{}で起動されました",
96
+ "Text is too long, please keep it under {} characters.": "テキストが長すぎます。{}文字以内に抑えてください。",
97
+ "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "左側の入力フォルダまたはファイルリストのパス。チェックの有無にかかわらず、このリストの後続のトレーニングに使用されます。",
98
+ "Training Configuration": "トレーニング設定",
99
+ "Training Error": "トレーニングエラー",
100
+ "Training stopped": "トレーニングが停止しました",
101
+ "Type name of the speaker": "話者の名前を入力",
102
+ "Type the path or select from the dropdown": "パスを入力するか、ドロップダウンから選択してください",
103
+ "Use LoRA": "LoRAを使用",
104
+ "Use LoRA can save GPU memory, but may reduce the quality of the model": "LoRAを使用するとGPUメモリを節約できますが、モデルの品質が低下する可能性があります",
105
+ "Use filelist": "ファイルリストを使用",
106
+ "Use large for 10G+ GPU, medium for 5G, small for 2G": "10G以上のGPUには大、5Gには中、2Gには小を使用してください",
107
+ "VITS Configuration": "VITS の構成",
108
+ "VQGAN Configuration": "VQGAN の構成",
109
+ "Validation Batch Size": "検証バッチサイズ",
110
+ "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "前処理フォルダの状態を表示(スライダーを使用してツリーの深さを制御)",
111
+ "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "モデルの誤用については一切責任を負いません。使用する前に、現地の法律と規制を考慮してください。",
112
+ "WebUI Host": "WebUIホスト",
113
+ "WebUI Port": "WebUIポート",
114
+ "Whisper Model": "Whisperモデル",
115
+ "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "ソースコードは[こちら](https://github.com/fishaudio/fish-speech)、モデルは[こちら](https://huggingface.co/fishaudio/fish-speech-1)にあります。",
116
+ "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30シリーズ以降のGPUにはbf16-trueを、10シリーズ以降のGPUには16-mixedをお勧めします",
117
+ "latest": "最新",
118
+ "new": "新規",
119
+ "Realtime Transform Text": "リアルタイム変換テキスト",
120
+ "Normalization Result Preview (Currently Only Chinese)": "正規化結果プレビュー(現在は中国語のみ)",
121
+ "Text Normalization": "テキスト正規化",
122
+ "Select Example Audio": "サンプル音声を選択"
123
+ }
fish_speech/i18n/locale/ko_KR.json ADDED
@@ -0,0 +1,123 @@
1
+ {
2
+ "16-mixed is recommended for 10+ series GPU": "10+ 시리즈 GPU에는 16-mixed를 권장합니다.",
3
+ "5 to 10 seconds of reference audio, useful for specifying speaker.": "화자를 특정하는 데 유의미한 5~10초의 길이의 참조 오디오 데이터.",
4
+ "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "[Fish Audio](https://fish.audio)에서 개발한 VQ-GAN 및 Llama 기반의 텍스트 음성 변환 모델.",
5
+ "Accumulate Gradient Batches": "그라디언트 배치 누적",
6
+ "Add to Processing Area": "처리 영역에 추가",
7
+ "Added path successfully!": "경로가 성공적으로 추가되었습니다!",
8
+ "Advanced Config": "고급 설정",
9
+ "Base LLAMA Model": "기본 LLAMA 모델",
10
+ "Batch Inference": "배치 추론",
11
+ "Batch Size": "배치 크기",
12
+ "Changing with the Model Path": "모델 경로에 따라 변경 중",
13
+ "Chinese": "중국어",
14
+ "Compile Model": "모델 컴파일",
15
+ "Compile the model can significantly reduce the inference time, but will increase cold start time": "모델을 컴파일하면 추론 시간이 크게 줄어들지만, 초기 시작 시간이 길어집니다.",
16
+ "Copy": "복사",
17
+ "Data Preprocessing": "데이터 전처리",
18
+ "Data Preprocessing Path": "데이터 전처리 경로",
19
+ "Data Source": "데이터 소스",
20
+ "Decoder Model Config": "디코더 모델 설정",
21
+ "Decoder Model Path": "디코더 모델 경로",
22
+ "Disabled": "비활성화 됨",
23
+ "Enable Reference Audio": "참고 음성 활성화",
24
+ "English": "영어",
25
+ "Error Message": "오류 메시지",
26
+ "File Preprocessing": "파일 전처리",
27
+ "Generate": "생성",
28
+ "Generated Audio": "생성된 오디오",
29
+ "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "오디오애 대응하는 텍스트가 없을 경우, ASR을 적용해 지원하며, .txt 또는 .lab 형식을 지원합니다.",
30
+ "Infer interface is closed": "추론 인터페이스가 닫혔습니다.",
31
+ "Inference Configuration": "추론 설정",
32
+ "Inference Server Configuration": "추론 서버 설정",
33
+ "Inference Server Error": "추론 서버 오류",
34
+ "Inferring interface is launched at {}": "추론 인터페이스가 {}에서 시작되었습니다.",
35
+ "Initial Learning Rate": "초기 학습률",
36
+ "Input Audio & Source Path for Transcription": "전사할 입력 오디오 및 소스 경로",
37
+ "Input Text": "입력 텍스트",
38
+ "Invalid path: {}": "유효하지 않은 경로: {}",
39
+ "It is recommended to use CUDA, if you have low configuration, use CPU": "CUDA 사용을 권장하며, 낮은 사양일 경우 CPU를 사용하는 것을 권장합니다.",
40
+ "Iterative Prompt Length, 0 means off": "반복 프롬프트 길이. (0:비활성화)",
41
+ "Japanese": "일본어",
42
+ "LLAMA Configuration": "LLAMA 설정",
43
+ "LLAMA Model Config": "LLAMA 모델 설정",
44
+ "LLAMA Model Path": "LLAMA 모델 경로",
45
+ "Labeling Device": "라벨링 장치",
46
+ "LoRA Model to be merged": "병합할 LoRA 모델",
47
+ "Maximum Audio Duration": "최대 오디오 길이",
48
+ "Maximum Length per Sample": "샘플당 최대 길이",
49
+ "Maximum Training Steps": "최대 학습 단계",
50
+ "Maximum tokens per batch, 0 means no limit": "배치당 최대 토큰 수(0:제한 없음)",
51
+ "Merge": "병합",
52
+ "Merge LoRA": "LoRA 병합",
53
+ "Merge successfully": "성공적으로 병합 되었습니다.",
54
+ "Minimum Audio Duration": "최소 오디오 길이",
55
+ "Model Output Path": "모델 출력 경로",
56
+ "Model Size": "모델 크기",
57
+ "Move": "이동",
58
+ "Move files successfully": "파일이 성공적으로 이동되었습니다.",
59
+ "No audio generated, please check the input text.": "생성된 오디오가 없습니다. 입력된 텍스트를 확인하세요.",
60
+ "No selected options": "옵션이 선택되지 않았습니다.",
61
+ "Number of Workers": "작업자 수",
62
+ "Open Inference Server": "추론 서버 열기",
63
+ "Open Labeler WebUI": "라벨러 WebUI 열기",
64
+ "Open Tensorboard": "Tensorboard 열기",
65
+ "Opened labeler in browser": "브라우저에서 라벨러가 열렸습니다.",
66
+ "Optional Label Language": "선택적 라벨 언어",
67
+ "Optional online ver": "온라인 버전 선택",
68
+ "Output Path": "출력 경로",
69
+ "Path error, please check the model file exists in the corresponding path": "경로 오류, 해당 경로에 모델 파일이 있는지 확인하십시오.",
70
+ "Precision": "정밀도",
71
+ "Probability of applying Speaker Condition": "화자 조건 적용 확률",
72
+ "Put your text here.": "여기에 텍스트를 입력하세요.",
73
+ "Reference Audio": "참고 오디오",
74
+ "Reference Text": "참고 텍스트",
75
+ "Related code and weights are released under CC BY-NC-SA 4.0 License.": "관련 코드 및 가중치는 CC BY-NC-SA 4.0 라이선스 하에 배포됩니다.",
76
+ "Remove Selected Data": "선택한 데이터 제거",
77
+ "Removed path successfully!": "��로가 성공적으로 제거되었습니다!",
78
+ "Repetition Penalty": "반복 패널티",
79
+ "Save model every n steps": "n 단계마다 모델 저장",
80
+ "Select LLAMA ckpt": "LLAMA ckpt 선택",
81
+ "Select VITS ckpt": "VITS ckpt 선택",
82
+ "Select VQGAN ckpt": "VQGAN ckpt 선택",
83
+ "Select source file processing method": "소스 파일 처리 방법 선택",
84
+ "Select the model to be trained (Depending on the Tab page you are on)": "학습할 모델 선택(탭 페이지에 따라 다름)",
85
+ "Selected: {}": "선택됨: {}",
86
+ "Speaker": "화자",
87
+ "Speaker is identified by the folder name": "화자는 폴더 이름으로 식별됩니다",
88
+ "Start Training": "학습 시작",
89
+ "Streaming Audio": "스트리밍 오디오",
90
+ "Streaming Generate": "스트리밍 생성",
91
+ "Tensorboard Host": "Tensorboard 호스트",
92
+ "Tensorboard Log Path": "Tensorboard 로그 경로",
93
+ "Tensorboard Port": "Tensorboard 포트",
94
+ "Tensorboard interface is closed": "Tensorboard 인터페이스가 닫혔습니다",
95
+ "Tensorboard interface is launched at {}": "Tensorboard 인터페이스가 {}에서 시작되었습니다.",
96
+ "Text is too long, please keep it under {} characters.": "텍스트가 너무 깁니다. {}자 이하로 입력해주세요.",
97
+ "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "왼쪽의 입력 폴더 경로 또는 파일 목록의 경로. 체크 여부에 관계없이 이 목록에서 후속 학습에 사용됩니다.",
98
+ "Training Configuration": "학습 설정",
99
+ "Training Error": "학습 오류",
100
+ "Training stopped": "학습이 중지되었습니다.",
101
+ "Type name of the speaker": "화자의 이름을 입력하세요.",
102
+ "Type the path or select from the dropdown": "경로를 입력하거나 드롭다운에서 선택하세요.",
103
+ "Use LoRA": "LoRA 사용",
104
+ "Use LoRA can save GPU memory, but may reduce the quality of the model": "LoRA를 사용하면 GPU 메모리를 절약할 수 있지만, 모델의 품질이 저하될 수 있습니다.",
105
+ "Use filelist": "파일 목록 사용",
106
+ "Use large for 10G+ GPU, medium for 5G, small for 2G": "10G+ GPU 환경에선 large, 5G에선 medium, 2G에선 small을 사용할 것을 권장합니다.",
107
+ "VITS Configuration": "VITS 설정",
108
+ "VQGAN Configuration": "VQGAN 설정",
109
+ "Validation Batch Size": "검증 배치 크기",
110
+ "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "전처리 폴더의 상태를 확인합니다(슬라이더를 사용하여 트리의 깊이를 조절합니다)",
111
+ "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "모델의 오용에 대해 책임지지 않습니다. 사용하기 전에 현지 법률과 규정을 고려하시길 바랍니다.",
112
+ "WebUI Host": "WebUI 호스트",
113
+ "WebUI Port": "WebUI 포트",
114
+ "Whisper Model": "Whisper 모델",
115
+ "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "소스 코드는 [이곳](https://github.com/fishaudio/fish-speech)에서, 모델은 [이곳](https://huggingface.co/fishaudio/fish-speech-1)에서 확인하실 수 있습니다.",
116
+ "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30+ 시리즈 GPU에는 bf16-true를, 10+ 시리즈 GPU에는 16-mixed를 권장합니다",
117
+ "latest": "최신",
118
+ "new": "새로운",
119
+ "Realtime Transform Text": "실시간 텍스트 변환",
120
+ "Normalization Result Preview (Currently Only Chinese)": "정규화 결과 미리보기(현재 중국어만 지원)",
121
+ "Text Normalization": "텍스트 정규화",
122
+ "Select Example Audio": "예시 오디오 선택"
123
+ }
fish_speech/i18n/locale/pt_BR.json CHANGED
@@ -1,133 +1,133 @@
1
- {
2
- "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 a 10 segundos de áudio de referência, útil para especificar o orador.",
3
- "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "Um modelo de texto para fala baseado em VQ-GAN e Llama desenvolvido por [Fish Audio](https://fish.audio).",
4
- "Accumulate Gradient Batches": "Acumular Lotes de Gradiente",
5
- "Add to Processing Area": "Adicionar à Área de Processamento",
6
- "Added path successfully!": "Caminho adicionado com sucesso!",
7
- "Advanced Config": "Configuração Avançada",
8
- "Base LLAMA Model": "Modelo LLAMA Base",
9
- "Batch Inference": "Inferência em Lote",
10
- "Batch Size": "Tamanho do Lote",
11
- "Changing with the Model Path": "Alterando com o Caminho do Modelo",
12
-
13
- "Compile Model": "Compilar Modelo",
14
- "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compilar o modelo pode reduzir significativamente o tempo de inferência, mas aumentará a latência inicial",
15
- "Copy": "Copiar",
16
- "Data Preprocessing": "Pré-processamento de Dados",
17
- "Data Preprocessing Path": "Caminho de Pré-processamento de Dados",
18
- "Data Source": "Fonte de Dados",
19
- "Decoder Model Config": "Configuração do Modelo Decodificador",
20
- "Decoder Model Path": "Caminho do Modelo Decodificador",
21
- "Disabled": "Desativado",
22
- "Enable Initial Prompt": "Habilitar Prompt Inicial",
23
- "Enable Reference Audio": "Habilitar Áudio de Referência",
24
- "English": "Inglês",
25
- "Japanese": "Japonês",
26
- "Chinese": "Chinês",
27
- "Portuguese": "Português",
28
- "Spanish": "Espanhol",
29
- "Error Message": "Mensagem de Erro",
30
- "Faster Whisper, Up to 5g GPU memory usage": "Faster Whisper (Usa até 5 GB de vRAM)",
31
- "File Preprocessing": "Pré-processamento de Arquivos",
32
- "Generate": "Gerar",
33
- "Generated Audio": "Áudio Gerado",
34
- "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "Se não houver texto correspondente ao áudio, utilize o ASR para assistência (formatos .txt ou .lab)",
35
- "Infer interface is closed": "A interface de inferência foi fechada",
36
- "Inference Configuration": "Configuração de Inferência",
37
- "Inference Server Configuration": "Configuração do Servidor de Inferência",
38
- "Inference Server Error": "Erro do Servidor de Inferência",
39
- "Inferring interface is launched at {}": "A interface de inferência foi iniciada em {}",
40
- "Initial Learning Rate": "Taxa de Aprendizagem Inicial",
41
- "Initial Prompt": "Prompt Inicial",
42
- "Initial prompt can provide contextual or vocabulary-specific guidance to the model.": "O prompt inicial pode fornecer orientação contextual ou específica de vocabulário para o modelo.",
43
- "Input Audio & Source Path for Transcription": "Entrada de Áudio/Caminho de Origem para Transcrição",
44
- "Input Text": "Texto de Entrada",
45
- "Invalid path: {}": "Caminho inválido: {}",
46
- "It is recommended to use CUDA, if you have low configuration, use CPU": "Para GPUs Nvidia é recomendado usar CUDA. Se não tiver uma GPU Nvidia, use CPU",
47
- "Iterative Prompt Length, 0 means off": "Comprimento do Prompt Iterativo (0 = desativado)",
48
- "LLAMA Configuration": "Configuração do LLAMA",
49
- "LLAMA Model Config": "Configuração do Modelo LLAMA",
50
- "LLAMA Model Path": "Caminho do Modelo LLAMA",
51
- "Labeling Device": "Dispositivo de Rotulagem",
52
- "LoRA Model to be merged": "Modelo LoRA para mesclagem",
53
- "Maximum Length per Sample": "Comprimento Máximo por Amostra",
54
- "Maximum Training Steps": "Etapas Máximas de Treinamento",
55
- "Maximum tokens per batch, 0 means no limit": "Número máximo de tokens por lote, 0 significa sem limite",
56
- "Merge": "Mesclar",
57
- "Merge LoRA": "Mesclar LoRA",
58
- "Merge successfully": "Mesclado com sucesso",
59
- "Model Output Path": "Caminho de Saída do Modelo",
60
- "Model Quantization": "Quantização do Modelo",
61
- "Model Size": "Tamanho do Modelo",
62
- "Move": "Mover",
63
- "Move files successfully": "Arquivos movidos com sucesso",
64
- "No audio generated, please check the input text.": "Nenhum áudio gerado, verifique o texto de entrada.",
65
- "No selected options": "Nenhuma opção selecionada",
66
- "Normalization Result Preview (Currently Only Chinese)": "Pré-visualização do Resultado da Normalização (Atualmente Apenas Chinês)",
67
- "Number of Workers": "Número de Processos",
68
- "Open Inference Server": "Abrir Servidor de Inferência",
69
- "Open Labeler WebUI": "Abrir WebUI de Rotulagem",
70
- "Open Tensorboard": "Abrir Tensorboard",
71
- "Opened labeler in browser": "WebUI de rotulagem aberta no navegador",
72
- "Optional Label Language": "Idioma do Rótulo (Opcional)",
73
- "Optional online ver": "Versão online (opcional)",
74
- "Output Path": "Caminho de Saída",
75
- "Path error, please check the model file exists in the corresponding path": "Erro de caminho, verifique se o arquivo do modelo existe no caminho correspondente",
76
- "Post-quantification Precision": "Precisão Pós-quantização",
77
- "Precision": "Precisão",
78
- "Probability of applying Speaker Condition": "Probabilidade de Aplicar Condição de Orador",
79
- "Put your text here.": "Insira seu texto aqui.",
80
- "Quantify": "Quantizar",
81
- "Quantify successfully": "Quantizado com sucesso",
82
- "Realtime Transform Text": "Transformar Texto em Tempo Real",
83
- "Reference Audio": "Áudio de Referência",
84
- "Reference Text": "Texto de Referência",
85
- "warning": "Aviso",
86
- "Pre-processing begins...": "O pré-processamento começou!",
87
- "Related code and weights are released under CC BY-NC-SA 4.0 License.": "O código relacionado e os pesos são licenciados sob a Licença CC BY-NC-SA 4.0.",
88
- "Remove Selected Data": "Remover Dados Selecionados",
89
- "Removed path successfully!": "Caminho removido com sucesso!",
90
- "Repetition Penalty": "Penalidade de Repetição",
91
- "Save model every n steps": "Salvar modelo a cada n etapas",
92
- "Select LLAMA ckpt": "Selecionar .ckpt do LLAMA",
93
- "Select source file processing method": "Escolha como processar o arquivo de origem",
94
- "Select the model to be trained (Depending on the Tab page you are on)": "Selecione o modelo para o treinamento (dependendo da aba em que você está)",
95
- "Selected: {}": "Selecionado: {}",
96
- "Speaker is identified by the folder name": "O orador é identificado pelo nome da pasta",
97
- "Start Training": "Iniciar Treinamento",
98
- "Streaming Audio": "Áudio em Streaming",
99
- "Streaming Generate": "Geração em Streaming",
100
- "Tensorboard Host": "Host do Tensorboard",
101
- "Tensorboard Log Path": "Caminho de Log do Tensorboard",
102
- "Tensorboard Port": "Porta do Tensorboard",
103
- "Tensorboard interface is closed": "A interface do Tensorboard está fechada",
104
- "Tensorboard interface is launched at {}": "A interface do Tensorboard foi iniciada em {}",
105
- "Text Normalization": "Normalização de Texto",
106
- "Text is too long, please keep it under {} characters.": "O texto é muito longo. Mantenha-o com menos de {} caracteres.",
107
- "The lower the quantitative precision, the more the effectiveness may decrease, but the greater the efficiency will increase": "Quanto menor a precisão quantitativa, mais a eficácia pode diminuir, mas maior será o aumento da eficiência",
108
- "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "O caminho da pasta de entrada à esquerda ou a lista de arquivos. Independentemente de estar marcada ou não, ela será utilizada para o treinamento subsequente nesta lista.",
109
- "Training Configuration": "Configuração de Treinamento",
110
- "Training Error": "Erro de Treinamento",
111
- "Training stopped": "Treinamento interrompido!",
112
- "Type the path or select from the dropdown": "Digite o caminho ou selecione no menu suspenso",
113
- "Use LoRA": "Usar LoRA",
114
- "Use LoRA can save GPU memory, but may reduce the quality of the model": "O uso de LoRAs pode economizar memória da GPU, mas também pode reduzir a qualidade",
115
- "Use filelist": "Usar lista de arquivos",
116
- "VQGAN Configuration": "Configuração do VQGAN",
117
- "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "Visualizar o status da pasta de pré-processamento (use o controle deslizante para controlar a profundidade da árvore)",
118
- "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "Não nos responsabilizamos por qualquer uso indevido do modelo. Por favor, considere as leis e regulamentações locais antes de usá-lo.",
119
- "WebUI Host": "Host da WebUI",
120
- "WebUI Port": "Porta da WebUI",
121
- "Whisper Model": "Modelo Whisper",
122
- "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "Você pode encontrar o código fonte [aqui](https://github.com/fishaudio/fish-speech) e os modelos [aqui](https://huggingface.co/fishaudio/fish-speech-1).",
123
- "auto": "automático",
124
- "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "bf16-true é recomendado para GPUs da série 30+, 16-mixed é recomendado para GPUs da série 10+",
125
- "latest": "mais recente",
126
- "new": "novo",
127
- "This audio introduces the basic concepts and applications of artificial intelligence and machine learning.": "Este áudio introduz os conceitos básicos e aplicações de inteligência artificial e aprendizado de máquina.",
128
- "You don't need to train this model!": "Não é necessário treinar este modelo!",
129
- "Yes": "Sim",
130
- "No": "Não",
131
- "version:": "versão:",
132
- "author:": "autor:"
133
- }
 
1
+ {
2
+ "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 a 10 segundos de áudio de referência, útil para especificar o orador.",
3
+ "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "Um modelo de texto para fala baseado em VQ-GAN e Llama desenvolvido por [Fish Audio](https://fish.audio).",
4
+ "Accumulate Gradient Batches": "Acumular Lotes de Gradiente",
5
+ "Add to Processing Area": "Adicionar à Área de Processamento",
6
+ "Added path successfully!": "Caminho adicionado com sucesso!",
7
+ "Advanced Config": "Configuração Avançada",
8
+ "Base LLAMA Model": "Modelo LLAMA Base",
9
+ "Batch Inference": "Inferência em Lote",
10
+ "Batch Size": "Tamanho do Lote",
11
+ "Changing with the Model Path": "Alterando com o Caminho do Modelo",
12
+
13
+ "Compile Model": "Compilar Modelo",
14
+ "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compilar o modelo pode reduzir significativamente o tempo de inferência, mas aumentará a latência inicial",
15
+ "Copy": "Copiar",
16
+ "Data Preprocessing": "Pré-processamento de Dados",
17
+ "Data Preprocessing Path": "Caminho de Pré-processamento de Dados",
18
+ "Data Source": "Fonte de Dados",
19
+ "Decoder Model Config": "Configuração do Modelo Decodificador",
20
+ "Decoder Model Path": "Caminho do Modelo Decodificador",
21
+ "Disabled": "Desativado",
22
+ "Enable Initial Prompt": "Habilitar Prompt Inicial",
23
+ "Enable Reference Audio": "Habilitar Áudio de Referência",
24
+ "English": "Inglês",
25
+ "Japanese": "Japonês",
26
+ "Chinese": "Chinês",
27
+ "Portuguese": "Português",
28
+ "Spanish": "Espanhol",
29
+ "Error Message": "Mensagem de Erro",
30
+ "Faster Whisper, Up to 5g GPU memory usage": "Faster Whisper (Usa até 5 GB de vRAM)",
31
+ "File Preprocessing": "Pré-processamento de Arquivos",
32
+ "Generate": "Gerar",
33
+ "Generated Audio": "Áudio Gerado",
34
+ "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "Se não houver texto correspondente ao áudio, utilize o ASR para assistência (formatos .txt ou .lab)",
35
+ "Infer interface is closed": "A interface de inferência foi fechada",
36
+ "Inference Configuration": "Configuração de Inferência",
37
+ "Inference Server Configuration": "Configuração do Servidor de Inferência",
38
+ "Inference Server Error": "Erro do Servidor de Inferência",
39
+ "Inferring interface is launched at {}": "A interface de inferência foi iniciada em {}",
40
+ "Initial Learning Rate": "Taxa de Aprendizagem Inicial",
41
+ "Initial Prompt": "Prompt Inicial",
42
+ "Initial prompt can provide contextual or vocabulary-specific guidance to the model.": "O prompt inicial pode fornecer orientação contextual ou específica de vocabulário para o modelo.",
43
+ "Input Audio & Source Path for Transcription": "Entrada de Áudio/Caminho de Origem para Transcrição",
44
+ "Input Text": "Texto de Entrada",
45
+ "Invalid path: {}": "Caminho inválido: {}",
46
+ "It is recommended to use CUDA, if you have low configuration, use CPU": "Para GPUs Nvidia é recomendado usar CUDA. Se não tiver uma GPU Nvidia, use CPU",
47
+ "Iterative Prompt Length, 0 means off": "Comprimento do Prompt Iterativo (0 = desativado)",
48
+ "LLAMA Configuration": "Configuração do LLAMA",
49
+ "LLAMA Model Config": "Configuração do Modelo LLAMA",
50
+ "LLAMA Model Path": "Caminho do Modelo LLAMA",
51
+ "Labeling Device": "Dispositivo de Rotulagem",
52
+ "LoRA Model to be merged": "Modelo LoRA para mesclagem",
53
+ "Maximum Length per Sample": "Comprimento Máximo por Amostra",
54
+ "Maximum Training Steps": "Etapas Máximas de Treinamento",
55
+ "Maximum tokens per batch, 0 means no limit": "Número máximo de tokens por lote, 0 significa sem limite",
56
+ "Merge": "Mesclar",
57
+ "Merge LoRA": "Mesclar LoRA",
58
+ "Merge successfully": "Mesclado com sucesso",
59
+ "Model Output Path": "Caminho de Saída do Modelo",
60
+ "Model Quantization": "Quantização do Modelo",
61
+ "Model Size": "Tamanho do Modelo",
62
+ "Move": "Mover",
63
+ "Move files successfully": "Arquivos movidos com sucesso",
64
+ "No audio generated, please check the input text.": "Nenhum áudio gerado, verifique o texto de entrada.",
65
+ "No selected options": "Nenhuma opção selecionada",
66
+ "Normalization Result Preview (Currently Only Chinese)": "Pré-visualização do Resultado da Normalização (Atualmente Apenas Chinês)",
67
+ "Number of Workers": "Número de Processos",
68
+ "Open Inference Server": "Abrir Servidor de Inferência",
69
+ "Open Labeler WebUI": "Abrir WebUI de Rotulagem",
70
+ "Open Tensorboard": "Abrir Tensorboard",
71
+ "Opened labeler in browser": "WebUI de rotulagem aberta no navegador",
72
+ "Optional Label Language": "Idioma do Rótulo (Opcional)",
73
+ "Optional online ver": "Versão online (opcional)",
74
+ "Output Path": "Caminho de Saída",
75
+ "Path error, please check the model file exists in the corresponding path": "Erro de caminho, verifique se o arquivo do modelo existe no caminho correspondente",
76
+ "Post-quantification Precision": "Precisão Pós-quantização",
77
+ "Precision": "Precisão",
78
+ "Probability of applying Speaker Condition": "Probabilidade de Aplicar Condição de Orador",
79
+ "Put your text here.": "Insira seu texto aqui.",
80
+ "Quantify": "Quantizar",
81
+ "Quantify successfully": "Quantizado com sucesso",
82
+ "Realtime Transform Text": "Transformar Texto em Tempo Real",
83
+ "Reference Audio": "Áudio de Referência",
84
+ "Reference Text": "Texto de Referência",
85
+ "warning": "Aviso",
86
+ "Pre-processing begins...": "O pré-processamento começou!",
87
+ "Related code and weights are released under CC BY-NC-SA 4.0 License.": "O código relacionado e os pesos são licenciados sob a Licença CC BY-NC-SA 4.0.",
88
+ "Remove Selected Data": "Remover Dados Selecionados",
89
+ "Removed path successfully!": "Caminho removido com sucesso!",
90
+ "Repetition Penalty": "Penalidade de Repetição",
91
+ "Save model every n steps": "Salvar modelo a cada n etapas",
92
+ "Select LLAMA ckpt": "Selecionar .ckpt do LLAMA",
93
+ "Select source file processing method": "Escolha como processar o arquivo de origem",
94
+ "Select the model to be trained (Depending on the Tab page you are on)": "Selecione o modelo para o treinamento (dependendo da aba em que você está)",
95
+ "Selected: {}": "Selecionado: {}",
96
+ "Speaker is identified by the folder name": "O orador é identificado pelo nome da pasta",
97
+ "Start Training": "Iniciar Treinamento",
98
+ "Streaming Audio": "Áudio em Streaming",
99
+ "Streaming Generate": "Geração em Streaming",
100
+ "Tensorboard Host": "Host do Tensorboard",
101
+ "Tensorboard Log Path": "Caminho de Log do Tensorboard",
102
+ "Tensorboard Port": "Porta do Tensorboard",
103
+ "Tensorboard interface is closed": "A interface do Tensorboard está fechada",
104
+ "Tensorboard interface is launched at {}": "A interface do Tensorboard foi iniciada em {}",
105
+ "Text Normalization": "Normalização de Texto",
106
+ "Text is too long, please keep it under {} characters.": "O texto é muito longo. Mantenha-o com menos de {} caracteres.",
107
+ "The lower the quantitative precision, the more the effectiveness may decrease, but the greater the efficiency will increase": "Quanto menor a precisão quantitativa, mais a eficácia pode diminuir, mas maior será o aumento da eficiência",
108
+ "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "O caminho da pasta de entrada à esquerda ou a lista de arquivos. Independentemente de estar marcada ou não, ela será utilizada para o treinamento subsequente nesta lista.",
109
+ "Training Configuration": "Configuração de Treinamento",
110
+ "Training Error": "Erro de Treinamento",
111
+ "Training stopped": "Treinamento interrompido!",
112
+ "Type the path or select from the dropdown": "Digite o caminho ou selecione no menu suspenso",
113
+ "Use LoRA": "Usar LoRA",
114
+ "Use LoRA can save GPU memory, but may reduce the quality of the model": "O uso de LoRAs pode economizar memória da GPU, mas também pode reduzir a qualidade",
115
+ "Use filelist": "Usar lista de arquivos",
116
+ "VQGAN Configuration": "Configuração do VQGAN",
117
+ "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "Visualizar o status da pasta de pré-processamento (use o controle deslizante para controlar a profundidade da árvore)",
118
+ "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "Não nos responsabilizamos por qualquer uso indevido do modelo. Por favor, considere as leis e regulamentações locais antes de usá-lo.",
119
+ "WebUI Host": "Host da WebUI",
120
+ "WebUI Port": "Porta da WebUI",
121
+ "Whisper Model": "Modelo Whisper",
122
+ "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "Você pode encontrar o código fonte [aqui](https://github.com/fishaudio/fish-speech) e os modelos [aqui](https://huggingface.co/fishaudio/fish-speech-1).",
123
+ "auto": "automático",
124
+ "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "bf16-true é recomendado para GPUs da série 30+, 16-mixed é recomendado para GPUs da série 10+",
125
+ "latest": "mais recente",
126
+ "new": "novo",
127
+ "This audio introduces the basic concepts and applications of artificial intelligence and machine learning.": "Este áudio introduz os conceitos básicos e aplicações de inteligência artificial e aprendizado de máquina.",
128
+ "You don't need to train this model!": "Não é necessário treinar este modelo!",
129
+ "Yes": "Sim",
130
+ "No": "Não",
131
+ "version:": "versão:",
132
+ "author:": "autor:"
133
+ }
fish_speech/i18n/locale/zh_CN.json CHANGED
@@ -1,122 +1,123 @@
1
- {
2
- "16-mixed is recommended for 10+ series GPU": "10+ 系列 GPU 建议使用 16-mixed",
3
- "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 到 10 秒的参考音频,适用于指定音色。",
4
- "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成.",
5
- "Accumulate Gradient Batches": "梯度累积批次",
6
- "Add to Processing Area": "加入处理区",
7
- "Added path successfully!": "添加路径成功!",
8
- "Advanced Config": "高级参数",
9
- "Base LLAMA Model": "基础 LLAMA 模型",
10
- "Batch Inference": "批量推理",
11
- "Batch Size": "批次大小",
12
- "Changing with the Model Path": "随模型路径变化",
13
- "Chinese": "中文",
14
- "Compile Model": "编译模型",
15
- "Compile the model can significantly reduce the inference time, but will increase cold start time": "编译模型可以显著减少推理时间,但会增加冷启动时间",
16
- "Copy": "复制",
17
- "Data Preprocessing": "数据预处理",
18
- "Data Preprocessing Path": "数据预处理路径",
19
- "Data Source": "数据源",
20
- "Decoder Model Config": "解码器模型配置",
21
- "Decoder Model Path": "解码器模型路径",
22
- "Disabled": "禁用",
23
- "Enable Reference Audio": "启用参考音频",
24
- "English": "英文",
25
- "Error Message": "错误信息",
26
- "File Preprocessing": "文件预处理",
27
- "Generate": "生成",
28
- "Generated Audio": "音频",
29
- "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "如果音频没有对应的文本,可以应用 ASR 辅助,支持 .txt 或 .lab 格式",
30
- "Infer interface is closed": "推理界面已关闭",
31
- "Inference Configuration": "推理配置",
32
- "Inference Server Configuration": "推理服务器配置",
33
- "Inference Server Error": "推理服务器错误",
34
- "Inferring interface is launched at {}": "推理界面已在 {} 上启动",
35
- "Initial Learning Rate": "初始学习率",
36
- "Input Audio & Source Path for Transcription": "输入音频和转录源路径",
37
- "Input Text": "输入文本",
38
- "Invalid path: {}": "无效路径: {}",
39
- "It is recommended to use CUDA, if you have low configuration, use CPU": "建议使用 CUDA,如果配置较低,使用 CPU",
40
- "Iterative Prompt Length, 0 means off": "迭代提示长度,0 表示关闭",
41
- "Japanese": "日文",
42
- "LLAMA Configuration": "LLAMA 配置",
43
- "LLAMA Model Config": "LLAMA 模型配置",
44
- "LLAMA Model Path": "LLAMA 模型路径",
45
- "Labeling Device": "标注加速设备",
46
- "LoRA Model to be merged": "要合并的 LoRA 模型",
47
- "Maximum Audio Duration": "最大音频时长",
48
- "Maximum Length per Sample": "每个样本的最大长度",
49
- "Maximum Training Steps": "最大训练步数",
50
- "Maximum tokens per batch, 0 means no limit": "每批最大令牌数,0 表示无限制",
51
- "Merge": "合并",
52
- "Merge LoRA": "合并 LoRA",
53
- "Merge successfully": "合并成功",
54
- "Minimum Audio Duration": "最小音频时长",
55
- "Model Output Path": "模型输出路径",
56
- "Model Size": "模型规模",
57
- "Move": "移动",
58
- "Move files successfully": "移动文件成功",
59
- "No audio generated, please check the input text.": "没有生成音频,请检查输入文本.",
60
- "No selected options": "没有选择的选项",
61
- "Number of Workers": "数据加载进程数",
62
- "Open Inference Server": "打开推理服务器",
63
- "Open Labeler WebUI": "打开标注工具",
64
- "Open Tensorboard": "打开 Tensorboard",
65
- "Opened labeler in browser": "在浏览器中打开标注工具",
66
- "Optional Label Language": "[可选] 标注语言",
67
- "Optional online ver": "[可选] 使用在线版",
68
- "Output Path": "输出路径",
69
- "Path error, please check the model file exists in the corresponding path": "路径错误,请检查模型文件是否存在于相应路径",
70
- "Precision": "精度",
71
- "Probability of applying Speaker Condition": "应用说话人条件的概率",
72
- "Put your text here.": "在此处输入文本.",
73
- "Reference Audio": "参考音频",
74
- "Reference Text": "参考文本",
75
- "Related code and weights are released under CC BY-NC-SA 4.0 License.": "相关代码和权重使用 CC BY-NC-SA 4.0 许可证发布.",
76
- "Remove Selected Data": "移除选中数据",
77
- "Removed path successfully!": "移除路径成功!",
78
- "Repetition Penalty": "重复惩罚",
79
- "Save model every n steps": "每 n 步保存模型",
80
- "Select LLAMA ckpt": "选择 LLAMA 检查点",
81
- "Select VITS ckpt": "选择 VITS 检查点",
82
- "Select VQGAN ckpt": "选择 VQGAN 检查点",
83
- "Select source file processing method": "选择源文件处理方法",
84
- "Select the model to be trained (Depending on the Tab page you are on)": "根据您所在的选项卡页面选择要训练的模型",
85
- "Selected: {}": "已选择: {}",
86
- "Speaker": "说话人",
87
- "Speaker is identified by the folder name": "自动根据父目录名称识别说话人",
88
- "Start Training": "开始训练",
89
- "Streaming Audio": "流式音频",
90
- "Streaming Generate": "流式合成",
91
- "Tensorboard Host": "Tensorboard 监听地址",
92
- "Tensorboard Log Path": "Tensorboard 日志路径",
93
- "Tensorboard Port": "Tensorboard 端口",
94
- "Tensorboard interface is closed": "Tensorboard 界面已关闭",
95
- "Tensorboard interface is launched at {}": "Tensorboard 界面已在 {} 上启动",
96
- "Text is too long, please keep it under {} characters.": "文本太长,请保持在 {} 个字符以内.",
97
- "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "左侧输入文件夹的路径或文件列表。无论是否选中,都将在此列表中用于后续训练.",
98
- "Training Configuration": "训练配置",
99
- "Training Error": "训练错误",
100
- "Training stopped": "训练已停止",
101
- "Type name of the speaker": "输入说话人的名称",
102
- "Type the path or select from the dropdown": "输入路径或从下拉菜单中选择",
103
- "Use LoRA": "使用 LoRA",
104
- "Use LoRA can save GPU memory, but may reduce the quality of the model": "使用 LoRA 可以节省 GPU 内存,但可能会降低模型质量",
105
- "Use filelist": "使用文件列表",
106
- "Use large for 10G+ GPU, medium for 5G, small for 2G": "10G+ GPU 使用 large, 5G 使用 medium, 2G 使用 small",
107
- "VITS Configuration": "VITS 配置",
108
- "VQGAN Configuration": "VQGAN 配置",
109
- "Validation Batch Size": "验证批次大小",
110
- "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "查看预处理文件夹的状态 (使用滑块控制树的深度)",
111
- "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "我们不对模型的任何滥用负责,请在使用之前考虑您当地的法律法规.",
112
- "WebUI Host": "WebUI 监听地址",
113
- "WebUI Port": "WebUI 端口",
114
- "Whisper Model": "Whisper 模型",
115
- "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1) 找到模型.",
116
- "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30+ 系列 GPU 建议使用 bf16-true, 10+ 系列 GPU 建议使用 16-mixed",
117
- "latest": "最近的检查点",
118
- "new": "创建新的检查点",
119
- "Realtime Transform Text": "实时规范化文本",
120
- "Normalization Result Preview (Currently Only Chinese)": "规范化结果预览",
121
- "Text Normalization": "文本规范化"
122
- }
 
 
1
+ {
2
+ "16-mixed is recommended for 10+ series GPU": "10+ 系列 GPU 建议使用 16-mixed",
3
+ "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 到 10 秒的参考音频,适用于指定音色。",
4
+ "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成.",
5
+ "Accumulate Gradient Batches": "梯度累积批次",
6
+ "Add to Processing Area": "加入处理区",
7
+ "Added path successfully!": "添加路径成功!",
8
+ "Advanced Config": "高级参数",
9
+ "Base LLAMA Model": "基础 LLAMA 模型",
10
+ "Batch Inference": "批量推理",
11
+ "Batch Size": "批次大小",
12
+ "Changing with the Model Path": "随模型路径变化",
13
+ "Chinese": "中文",
14
+ "Compile Model": "编译模型",
15
+ "Compile the model can significantly reduce the inference time, but will increase cold start time": "编译模型可以显著减少推理时间,但会增加冷启动时间",
16
+ "Copy": "复制",
17
+ "Data Preprocessing": "数据预处理",
18
+ "Data Preprocessing Path": "数据预处理路径",
19
+ "Data Source": "数据源",
20
+ "Decoder Model Config": "解码器模型配置",
21
+ "Decoder Model Path": "解码器模型路径",
22
+ "Disabled": "禁用",
23
+ "Enable Reference Audio": "启用参考音频",
24
+ "English": "英文",
25
+ "Error Message": "错误信息",
26
+ "File Preprocessing": "文件预处理",
27
+ "Generate": "生成",
28
+ "Generated Audio": "音频",
29
+ "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "如果音频没有对应的文本,可以应用 ASR 辅助,支持 .txt 或 .lab 格式",
30
+ "Infer interface is closed": "推理界面已关闭",
31
+ "Inference Configuration": "推理配置",
32
+ "Inference Server Configuration": "推理服务器配置",
33
+ "Inference Server Error": "推理服务器错误",
34
+ "Inferring interface is launched at {}": "推理界面已在 {} 上启动",
35
+ "Initial Learning Rate": "初始学习率",
36
+ "Input Audio & Source Path for Transcription": "输入音频和转录源路径",
37
+ "Input Text": "输入文本",
38
+ "Invalid path: {}": "无效路径: {}",
39
+ "It is recommended to use CUDA, if you have low configuration, use CPU": "建议使用 CUDA,如果配置较低,使��� CPU",
40
+ "Iterative Prompt Length, 0 means off": "迭代提示长度,0 表示关闭",
41
+ "Japanese": "日文",
42
+ "LLAMA Configuration": "LLAMA 配置",
43
+ "LLAMA Model Config": "LLAMA 模型配置",
44
+ "LLAMA Model Path": "LLAMA 模型路径",
45
+ "Labeling Device": "标注加速设备",
46
+ "LoRA Model to be merged": "要合并的 LoRA 模型",
47
+ "Maximum Audio Duration": "最大音频时长",
48
+ "Maximum Length per Sample": "每个样本的最大长度",
49
+ "Maximum Training Steps": "最大训练步数",
50
+ "Maximum tokens per batch, 0 means no limit": "每批最大令牌数,0 表示无限制",
51
+ "Merge": "合并",
52
+ "Merge LoRA": "合并 LoRA",
53
+ "Merge successfully": "合并成功",
54
+ "Minimum Audio Duration": "最小音频时长",
55
+ "Model Output Path": "模型输出路径",
56
+ "Model Size": "模型规模",
57
+ "Move": "移动",
58
+ "Move files successfully": "移动文件成功",
59
+ "No audio generated, please check the input text.": "没有生成音频,请检查输入文本.",
60
+ "No selected options": "没有选择的选项",
61
+ "Number of Workers": "数据加载进程数",
62
+ "Open Inference Server": "打开推理服务器",
63
+ "Open Labeler WebUI": "打开标注工具",
64
+ "Open Tensorboard": "打开 Tensorboard",
65
+ "Opened labeler in browser": "在浏览器中打开标注工具",
66
+ "Optional Label Language": "[可选] 标注语言",
67
+ "Optional online ver": "[可选] 使用在线版",
68
+ "Output Path": "输出路径",
69
+ "Path error, please check the model file exists in the corresponding path": "路径错误,请检查模型文件是否存在于相应路径",
70
+ "Precision": "精度",
71
+ "Probability of applying Speaker Condition": "应用说话人条件的概率",
72
+ "Put your text here.": "在此处输入文本.",
73
+ "Reference Audio": "参考音频",
74
+ "Reference Text": "参考文本",
75
+ "Related code and weights are released under CC BY-NC-SA 4.0 License.": "相关代码和权重使用 CC BY-NC-SA 4.0 许可证发布.",
76
+ "Remove Selected Data": "移除选中数据",
77
+ "Removed path successfully!": "移除路径成功!",
78
+ "Repetition Penalty": "重复惩罚",
79
+ "Save model every n steps": "每 n 步保存模型",
80
+ "Select LLAMA ckpt": "选择 LLAMA 检查点",
81
+ "Select VITS ckpt": "选择 VITS 检查点",
82
+ "Select VQGAN ckpt": "选择 VQGAN 检查点",
83
+ "Select source file processing method": "选择源文件处理方法",
84
+ "Select the model to be trained (Depending on the Tab page you are on)": "根据您所在的选项卡页面选择要训练的模型",
85
+ "Selected: {}": "已选择: {}",
86
+ "Speaker": "说话人",
87
+ "Speaker is identified by the folder name": "自动根据父目录名称识别说话人",
88
+ "Start Training": "开始训练",
89
+ "Streaming Audio": "流式音频",
90
+ "Streaming Generate": "流式合成",
91
+ "Tensorboard Host": "Tensorboard 监听地址",
92
+ "Tensorboard Log Path": "Tensorboard 日志路径",
93
+ "Tensorboard Port": "Tensorboard 端口",
94
+ "Tensorboard interface is closed": "Tensorboard 界面已关闭",
95
+ "Tensorboard interface is launched at {}": "Tensorboard 界面已在 {} 上启动",
96
+ "Text is too long, please keep it under {} characters.": "文本太长,请保持在 {} 个字符以内.",
97
+ "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "左侧输入文件夹的路径或文件列表。无论是否选中,都将在此列表中用于后续训练.",
98
+ "Training Configuration": "训练配置",
99
+ "Training Error": "训练错误",
100
+ "Training stopped": "训练已停止",
101
+ "Type name of the speaker": "输入说话人的名称",
102
+ "Type the path or select from the dropdown": "输入路径或从下拉菜单中选择",
103
+ "Use LoRA": "使用 LoRA",
104
+ "Use LoRA can save GPU memory, but may reduce the quality of the model": "使用 LoRA 可以节省 GPU 内存,但可能会降低模型质量",
105
+ "Use filelist": "使用文件列表",
106
+ "Use large for 10G+ GPU, medium for 5G, small for 2G": "10G+ GPU 使用 large, 5G 使用 medium, 2G 使用 small",
107
+ "VITS Configuration": "VITS 配置",
108
+ "VQGAN Configuration": "VQGAN 配置",
109
+ "Validation Batch Size": "验证批次大小",
110
+ "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "查看预处理文件夹的状态 (使用滑块控制树的深度)",
111
+ "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "我们不对模型的任何滥用负责,请在使用之前考虑您当地的法律法规.",
112
+ "WebUI Host": "WebUI 监听地址",
113
+ "WebUI Port": "WebUI 端口",
114
+ "Whisper Model": "Whisper 模型",
115
+ "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1) 找到模型.",
116
+ "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30+ 系列 GPU 建议使用 bf16-true, 10+ 系列 GPU 建议使用 16-mixed",
117
+ "latest": "最近的检查点",
118
+ "new": "创建新的检查点",
119
+ "Realtime Transform Text": "实时规范化文本",
120
+ "Normalization Result Preview (Currently Only Chinese)": "规范化结果预览",
121
+ "Text Normalization": "文本规范化",
122
+ "Select Example Audio": "选择参考音频"
123
+ }
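Like the other locales touched in this commit, zh_CN gains the new "Select Example Audio" key. A quick way to confirm that a locale still covers every key of the default file is a plain set difference over the keys; the directory comes from the paths above, while en_US as the default language is an assumption of this sketch:

import json
from pathlib import Path

LOCALE_DIR = Path("fish_speech/i18n/locale")

def missing_keys(language: str, default: str = "en_US") -> set:
    with open(LOCALE_DIR / f"{default}.json", encoding="utf-8") as f:
        standard = json.load(f)
    with open(LOCALE_DIR / f"{language}.json", encoding="utf-8") as f:
        lang = json.load(f)
    # Keys present in the default file but absent from the locale under test.
    return set(standard) - set(lang)

# missing_keys("zh_CN")  # -> set() once the locale is fully in sync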
fish_speech/i18n/scan.py CHANGED
@@ -1,122 +1,122 @@
1
- import ast
2
- import glob
3
- import json
4
- from collections import OrderedDict
5
- from pathlib import Path
6
-
7
- from loguru import logger
8
-
9
- from .core import DEFAULT_LANGUAGE, I18N_FILE_PATH
10
-
11
-
12
- def extract_i18n_strings(node):
13
- i18n_strings = []
14
-
15
- if (
16
- isinstance(node, ast.Call)
17
- and isinstance(node.func, ast.Name)
18
- and node.func.id == "i18n"
19
- ):
20
- for arg in node.args:
21
- if isinstance(arg, ast.Str):
22
- i18n_strings.append(arg.s)
23
-
24
- for child_node in ast.iter_child_nodes(node):
25
- i18n_strings.extend(extract_i18n_strings(child_node))
26
-
27
- return i18n_strings
28
-
29
-
30
- # scan the directory for all .py files (recursively)
31
- # for each file, parse the code into an AST
32
- # for each AST, extract the i18n strings
33
-
34
- strings = []
35
- folders = ["fish_speech", "tools"]
36
- # for filename in glob.iglob("**/*.py", recursive=True):
37
- for folder in folders:
38
- for f in Path(folder).rglob("*.py"):
39
- code = f.read_text(encoding="utf-8")
40
- if "i18n(" in code:
41
- tree = ast.parse(code)
42
- i18n_strings = extract_i18n_strings(tree)
43
- logger.info(f"Found {len(i18n_strings)} i18n strings in {f}")
44
- strings.extend(i18n_strings)
45
-
46
- code_keys = set(strings)
47
- logger.info(f"Total unique: {len(code_keys)}")
48
-
49
-
50
- standard_file = I18N_FILE_PATH / f"{DEFAULT_LANGUAGE}.json"
51
- with open(standard_file, "r", encoding="utf-8") as f:
52
- standard_data = json.load(f, object_pairs_hook=OrderedDict)
53
- standard_keys = set(standard_data.keys())
54
-
55
- # Define the standard file name
56
- unused_keys = standard_keys - code_keys
57
- logger.info(f"Found {len(unused_keys)} unused keys in {standard_file}")
58
- for unused_key in unused_keys:
59
- logger.info(f"\t{unused_key}")
60
-
61
- missing_keys = code_keys - standard_keys
62
- logger.info(f"Found {len(missing_keys)} missing keys in {standard_file}")
63
- for missing_key in missing_keys:
64
- logger.info(f"\t{missing_key}")
65
-
66
- code_keys_dict = OrderedDict()
67
- for s in strings:
68
- code_keys_dict[s] = s
69
-
70
- # write back
71
- with open(standard_file, "w", encoding="utf-8") as f:
72
- json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True)
73
- f.write("\n")
74
-
75
- logger.info(f"Updated {standard_file}")
76
-
77
-
78
- # Define the standard file name
79
- standard_file = I18N_FILE_PATH / f"{DEFAULT_LANGUAGE}.json"
80
-
81
- # Find all JSON files in the directory
82
- dir_path = I18N_FILE_PATH
83
- languages = [f for f in dir_path.glob("*.json") if f.stem != DEFAULT_LANGUAGE]
84
-
85
- # Load the standard file
86
- with open(standard_file, "r", encoding="utf-8") as f:
87
- standard_data = json.load(f, object_pairs_hook=OrderedDict)
88
-
89
- # Loop through each language file
90
- for lang_file in languages:
91
- # Load the language file
92
- with open(lang_file, "r", encoding="utf-8") as f:
93
- lang_data = json.load(f, object_pairs_hook=OrderedDict)
94
-
95
- # Find the difference between the language file and the standard file
96
- diff = set(standard_data.keys()) - set(lang_data.keys())
97
-
98
- miss = set(lang_data.keys()) - set(standard_data.keys())
99
-
100
- # Add any missing keys to the language file
101
- for key in diff:
102
- lang_data[key] = "#!" + key
103
- logger.info(f"Added missing key: {key} to {lang_file}")
104
-
105
- # Del any extra keys to the language file
106
- for key in miss:
107
- del lang_data[key]
108
- logger.info(f"Del extra key: {key} from {lang_file}")
109
-
110
- # Sort the keys of the language file to match the order of the standard file
111
- lang_data = OrderedDict(
112
- sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0]))
113
- )
114
-
115
- # Save the updated language file
116
- with open(lang_file, "w", encoding="utf-8") as f:
117
- json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True)
118
- f.write("\n")
119
-
120
- logger.info(f"Updated {lang_file}")
121
-
122
- logger.info("Done")
 
1
+ import ast
2
+ import glob
3
+ import json
4
+ from collections import OrderedDict
5
+ from pathlib import Path
6
+
7
+ from loguru import logger
8
+
9
+ from .core import DEFAULT_LANGUAGE, I18N_FILE_PATH
10
+
11
+
12
+ def extract_i18n_strings(node):
13
+ i18n_strings = []
14
+
15
+ if (
16
+ isinstance(node, ast.Call)
17
+ and isinstance(node.func, ast.Name)
18
+ and node.func.id == "i18n"
19
+ ):
20
+ for arg in node.args:
21
+ if isinstance(arg, ast.Str):
22
+ i18n_strings.append(arg.s)
23
+
24
+ for child_node in ast.iter_child_nodes(node):
25
+ i18n_strings.extend(extract_i18n_strings(child_node))
26
+
27
+ return i18n_strings
28
+
29
+
30
+ # scan the directory for all .py files (recursively)
31
+ # for each file, parse the code into an AST
32
+ # for each AST, extract the i18n strings
33
+
34
+ strings = []
35
+ folders = ["fish_speech", "tools"]
36
+ # for filename in glob.iglob("**/*.py", recursive=True):
37
+ for folder in folders:
38
+ for f in Path(folder).rglob("*.py"):
39
+ code = f.read_text(encoding="utf-8")
40
+ if "i18n(" in code:
41
+ tree = ast.parse(code)
42
+ i18n_strings = extract_i18n_strings(tree)
43
+ logger.info(f"Found {len(i18n_strings)} i18n strings in {f}")
44
+ strings.extend(i18n_strings)
45
+
46
+ code_keys = set(strings)
47
+ logger.info(f"Total unique: {len(code_keys)}")
48
+
49
+
50
+ standard_file = I18N_FILE_PATH / f"{DEFAULT_LANGUAGE}.json"
51
+ with open(standard_file, "r", encoding="utf-8") as f:
52
+ standard_data = json.load(f, object_pairs_hook=OrderedDict)
53
+ standard_keys = set(standard_data.keys())
54
+
55
+ # Define the standard file name
56
+ unused_keys = standard_keys - code_keys
57
+ logger.info(f"Found {len(unused_keys)} unused keys in {standard_file}")
58
+ for unused_key in unused_keys:
59
+ logger.info(f"\t{unused_key}")
60
+
61
+ missing_keys = code_keys - standard_keys
62
+ logger.info(f"Found {len(missing_keys)} missing keys in {standard_file}")
63
+ for missing_key in missing_keys:
64
+ logger.info(f"\t{missing_key}")
65
+
66
+ code_keys_dict = OrderedDict()
67
+ for s in strings:
68
+ code_keys_dict[s] = s
69
+
70
+ # write back
71
+ with open(standard_file, "w", encoding="utf-8") as f:
72
+ json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True)
73
+ f.write("\n")
74
+
75
+ logger.info(f"Updated {standard_file}")
76
+
77
+
78
+ # Define the standard file name
79
+ standard_file = I18N_FILE_PATH / f"{DEFAULT_LANGUAGE}.json"
80
+
81
+ # Find all JSON files in the directory
82
+ dir_path = I18N_FILE_PATH
83
+ languages = [f for f in dir_path.glob("*.json") if f.stem != DEFAULT_LANGUAGE]
84
+
85
+ # Load the standard file
86
+ with open(standard_file, "r", encoding="utf-8") as f:
87
+ standard_data = json.load(f, object_pairs_hook=OrderedDict)
88
+
89
+ # Loop through each language file
90
+ for lang_file in languages:
91
+ # Load the language file
92
+ with open(lang_file, "r", encoding="utf-8") as f:
93
+ lang_data = json.load(f, object_pairs_hook=OrderedDict)
94
+
95
+ # Find the difference between the language file and the standard file
96
+ diff = set(standard_data.keys()) - set(lang_data.keys())
97
+
98
+ miss = set(lang_data.keys()) - set(standard_data.keys())
99
+
100
+ # Add any missing keys to the language file
101
+ for key in diff:
102
+ lang_data[key] = "#!" + key
103
+ logger.info(f"Added missing key: {key} to {lang_file}")
104
+
105
+ # Del any extra keys to the language file
106
+ for key in miss:
107
+ del lang_data[key]
108
+ logger.info(f"Del extra key: {key} from {lang_file}")
109
+
110
+ # Sort the keys of the language file to match the order of the standard file
111
+ lang_data = OrderedDict(
112
+ sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0]))
113
+ )
114
+
115
+ # Save the updated language file
116
+ with open(lang_file, "w", encoding="utf-8") as f:
117
+ json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True)
118
+ f.write("\n")
119
+
120
+ logger.info(f"Updated {lang_file}")
121
+
122
+ logger.info("Done")
fish_speech/models/text2semantic/lit_module.py CHANGED
@@ -1,202 +1,202 @@
1
- from typing import Any, Optional
2
-
3
- import lightning as L
4
- import torch
5
- import torch.nn.functional as F
6
- from lightning.pytorch.utilities.types import OptimizerLRScheduler
7
-
8
- import fish_speech.utils as utils
9
- from fish_speech.conversation import CODEBOOK_PAD_TOKEN_ID
10
- from fish_speech.models.text2semantic.llama import NaiveTransformer
11
-
12
- log = utils.RankedLogger(__name__, rank_zero_only=True)
13
-
14
-
15
- class TextToSemantic(L.LightningModule):
16
- def __init__(
17
- self,
18
- model: NaiveTransformer,
19
- optimizer: Any,
20
- lr_scheduler: Any,
21
- ):
22
- super().__init__()
23
-
24
- self.model = model
25
- self.optimizer_builder = optimizer
26
- self.lr_scheduler_builder = lr_scheduler
27
-
28
- def forward(self, x):
29
- return self.model(x)
30
-
31
- def on_save_checkpoint(self, checkpoint):
32
- # Save only LoRA parameters
33
- state_dict = checkpoint["state_dict"]
34
- use_lora = any("lora" in name for name in state_dict.keys())
35
- if not use_lora:
36
- return
37
-
38
- for name in list(state_dict.keys()):
39
- if "lora" not in name:
40
- state_dict.pop(name)
41
-
42
- def configure_optimizers(self) -> OptimizerLRScheduler:
43
- # Get weight decay parameters
44
- weight_decay_parameters, other_parameters = [], []
45
- for name, param in self.named_parameters():
46
- if ".bias" in name or "norm.weight" in name or ".embeddings." in name:
47
- other_parameters.append(param)
48
- else:
49
- weight_decay_parameters.append(param)
50
-
51
- optimizer = self.optimizer_builder(
52
- [
53
- {"params": weight_decay_parameters},
54
- {"params": other_parameters, "weight_decay": 0.0},
55
- ]
56
- )
57
-
58
- # Print the parameters and their weight decay
59
- for i in optimizer.param_groups:
60
- log.info(
61
- f"Set weight decay: {i['weight_decay']} for {len(i['params'])} parameters"
62
- )
63
-
64
- lr_scheduler = self.lr_scheduler_builder(optimizer)
65
-
66
- return {
67
- "optimizer": optimizer,
68
- "lr_scheduler": {
69
- "scheduler": lr_scheduler,
70
- "interval": "step",
71
- },
72
- }
73
-
74
- # Copied from https://github.com/eric-mitchell/direct-preference-optimization/blob/main/trainers.py#L90
75
- def get_batch_logps(
76
- self,
77
- logits: torch.FloatTensor,
78
- labels: torch.LongTensor,
79
- average_log_prob: bool = False,
80
- ) -> torch.FloatTensor:
81
- """Compute the log probabilities of the given labels under the given logits.
82
-
83
- Args:
84
- logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, codebook_size, vocab_size)
85
- labels: Labels for which to compute the log probabilities. Label tokens with a value of -100 are ignored. Shape: (batch_size, sequence_length, codebook_size)
86
- average_log_prob: If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the log probabilities of the (non-masked) tokens.
87
-
88
- Returns:
89
- A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the given logits.
90
- """
91
- assert logits.shape[:-1] == labels.shape
92
-
93
- labels = labels.clone()
94
- loss_mask = labels != -100
95
-
96
- # dummy token; we'll ignore the losses on these tokens later
97
- labels[labels == -100] = 0
98
-
99
- per_token_logps = torch.gather(
100
- logits.log_softmax(-1), dim=-1, index=labels.unsqueeze(-1)
101
- ).squeeze(-1)
102
-
103
- if average_log_prob:
104
- return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
105
- else:
106
- return (per_token_logps * loss_mask).sum(-1)
107
-
108
- def _step(self, batch, batch_idx, stage: str):
109
- is_train = stage == "train"
110
-
111
- if is_train:
112
- # Key part to make lora work
113
- # Otherwise the parameters are merged, which lead to incorrect gradients
114
- self.model.train()
115
-
116
- # Do positive and negative samples in the same batch to speed up training
117
- labels = batch["labels"]
118
- outputs = self.model(
119
- inp=batch["inputs"],
120
- key_padding_mask=batch["attention_masks"],
121
- )
122
- token_logits = outputs.token_logits
123
- codebook_logits = outputs.codebook_logits
124
-
125
- # Generate labels
126
- base_loss = F.cross_entropy(
127
- token_logits.view(-1, token_logits.size(-1)),
128
- labels[:, 0].reshape(-1),
129
- ignore_index=-100,
130
- )
131
-
132
- codebook_labels = labels[:, 1 : 1 + self.model.config.num_codebooks].mT
133
- semantic_loss = F.cross_entropy(
134
- codebook_logits.view(-1, codebook_logits.size(-1)),
135
- codebook_labels.reshape(-1),
136
- ignore_index=-100,
137
- )
138
-
139
- loss = base_loss + semantic_loss
140
-
141
- self.log(
142
- f"{stage}/loss",
143
- loss,
144
- on_step=is_train,
145
- on_epoch=not is_train,
146
- prog_bar=True,
147
- logger=True,
148
- sync_dist=not is_train,
149
- )
150
-
151
- self.log(
152
- f"{stage}/base_loss",
153
- base_loss,
154
- on_step=is_train,
155
- on_epoch=not is_train,
156
- prog_bar=False,
157
- logger=True,
158
- sync_dist=not is_train,
159
- )
160
-
161
- self.log(
162
- f"{stage}/semantic_loss",
163
- semantic_loss,
164
- on_step=is_train,
165
- on_epoch=not is_train,
166
- prog_bar=False,
167
- logger=True,
168
- sync_dist=not is_train,
169
- )
170
-
171
- # Top-5 accuracy
172
- accuracy = self.get_accuracy(codebook_logits, codebook_labels)
173
- self.log(
174
- f"{stage}/top_5_accuracy",
175
- accuracy,
176
- on_step=is_train,
177
- on_epoch=not is_train,
178
- prog_bar=True,
179
- logger=True,
180
- sync_dist=not is_train,
181
- )
182
-
183
- return loss
184
-
185
- def get_accuracy(self, logits, labels):
186
- mask = (labels != -100) & (labels != CODEBOOK_PAD_TOKEN_ID)
187
- if mask.sum() == 0:
188
- return torch.tensor(0.0, device=logits.device)
189
-
190
- _, indices = logits.topk(5, dim=-1)
191
- correct = indices.eq(labels.unsqueeze(-1))
192
- correct[~mask] = 0
193
- correct = correct.sum()
194
- accuracy = correct / mask.sum()
195
-
196
- return accuracy
197
-
198
- def training_step(self, batch, batch_idx):
199
- return self._step(batch, batch_idx, "train")
200
-
201
- def validation_step(self, batch, batch_idx):
202
- return self._step(batch, batch_idx, "val")
 
1
+ from typing import Any, Optional
2
+
3
+ import lightning as L
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from lightning.pytorch.utilities.types import OptimizerLRScheduler
7
+
8
+ import fish_speech.utils as utils
9
+ from fish_speech.conversation import CODEBOOK_PAD_TOKEN_ID
10
+ from fish_speech.models.text2semantic.llama import NaiveTransformer
11
+
12
+ log = utils.RankedLogger(__name__, rank_zero_only=True)
13
+
14
+
15
+ class TextToSemantic(L.LightningModule):
16
+ def __init__(
17
+ self,
18
+ model: NaiveTransformer,
19
+ optimizer: Any,
20
+ lr_scheduler: Any,
21
+ ):
22
+ super().__init__()
23
+
24
+ self.model = model
25
+ self.optimizer_builder = optimizer
26
+ self.lr_scheduler_builder = lr_scheduler
27
+
28
+ def forward(self, x):
29
+ return self.model(x)
30
+
31
+ def on_save_checkpoint(self, checkpoint):
32
+ # Save only LoRA parameters
33
+ state_dict = checkpoint["state_dict"]
34
+ use_lora = any("lora" in name for name in state_dict.keys())
35
+ if not use_lora:
36
+ return
37
+
38
+ for name in list(state_dict.keys()):
39
+ if "lora" not in name:
40
+ state_dict.pop(name)
41
+
42
+ def configure_optimizers(self) -> OptimizerLRScheduler:
43
+ # Get weight decay parameters
44
+ weight_decay_parameters, other_parameters = [], []
45
+ for name, param in self.named_parameters():
46
+ if ".bias" in name or "norm.weight" in name or ".embeddings." in name:
47
+ other_parameters.append(param)
48
+ else:
49
+ weight_decay_parameters.append(param)
50
+
51
+ optimizer = self.optimizer_builder(
52
+ [
53
+ {"params": weight_decay_parameters},
54
+ {"params": other_parameters, "weight_decay": 0.0},
55
+ ]
56
+ )
57
+
58
+ # Print the parameters and their weight decay
59
+ for i in optimizer.param_groups:
60
+ log.info(
61
+ f"Set weight decay: {i['weight_decay']} for {len(i['params'])} parameters"
62
+ )
63
+
64
+ lr_scheduler = self.lr_scheduler_builder(optimizer)
65
+
66
+ return {
67
+ "optimizer": optimizer,
68
+ "lr_scheduler": {
69
+ "scheduler": lr_scheduler,
70
+ "interval": "step",
71
+ },
72
+ }
73
+
74
+ # Copied from https://github.com/eric-mitchell/direct-preference-optimization/blob/main/trainers.py#L90
75
+ def get_batch_logps(
76
+ self,
77
+ logits: torch.FloatTensor,
78
+ labels: torch.LongTensor,
79
+ average_log_prob: bool = False,
80
+ ) -> torch.FloatTensor:
81
+ """Compute the log probabilities of the given labels under the given logits.
82
+
83
+ Args:
84
+ logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, codebook_size, vocab_size)
85
+ labels: Labels for which to compute the log probabilities. Label tokens with a value of -100 are ignored. Shape: (batch_size, sequence_length, codebook_size)
86
+ average_log_prob: If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the log probabilities of the (non-masked) tokens.
87
+
88
+ Returns:
89
+ A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the given logits.
90
+ """
91
+ assert logits.shape[:-1] == labels.shape
92
+
93
+ labels = labels.clone()
94
+ loss_mask = labels != -100
95
+
96
+ # dummy token; we'll ignore the losses on these tokens later
97
+ labels[labels == -100] = 0
98
+
99
+ per_token_logps = torch.gather(
100
+ logits.log_softmax(-1), dim=-1, index=labels.unsqueeze(-1)
101
+ ).squeeze(-1)
102
+
103
+ if average_log_prob:
104
+ return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
105
+ else:
106
+ return (per_token_logps * loss_mask).sum(-1)
107
+
108
+ def _step(self, batch, batch_idx, stage: str):
109
+ is_train = stage == "train"
110
+
111
+ if is_train:
112
+ # Key part to make lora work
113
+ # Otherwise the parameters are merged, which lead to incorrect gradients
114
+ self.model.train()
115
+
116
+ # Do positive and negative samples in the same batch to speed up training
117
+ labels = batch["labels"]
118
+ outputs = self.model(
119
+ inp=batch["inputs"],
120
+ key_padding_mask=batch["attention_masks"],
121
+ )
122
+ token_logits = outputs.token_logits
123
+ codebook_logits = outputs.codebook_logits
124
+
125
+ # Generate labels
126
+ base_loss = F.cross_entropy(
127
+ token_logits.view(-1, token_logits.size(-1)),
128
+ labels[:, 0].reshape(-1),
129
+ ignore_index=-100,
130
+ )
131
+
132
+ codebook_labels = labels[:, 1 : 1 + self.model.config.num_codebooks].mT
133
+ semantic_loss = F.cross_entropy(
134
+ codebook_logits.view(-1, codebook_logits.size(-1)),
135
+ codebook_labels.reshape(-1),
136
+ ignore_index=-100,
137
+ )
138
+
139
+ loss = base_loss + semantic_loss
140
+
141
+ self.log(
142
+ f"{stage}/loss",
143
+ loss,
144
+ on_step=is_train,
145
+ on_epoch=not is_train,
146
+ prog_bar=True,
147
+ logger=True,
148
+ sync_dist=not is_train,
149
+ )
150
+
151
+ self.log(
152
+ f"{stage}/base_loss",
153
+ base_loss,
154
+ on_step=is_train,
155
+ on_epoch=not is_train,
156
+ prog_bar=False,
157
+ logger=True,
158
+ sync_dist=not is_train,
159
+ )
160
+
161
+ self.log(
162
+ f"{stage}/semantic_loss",
163
+ semantic_loss,
164
+ on_step=is_train,
165
+ on_epoch=not is_train,
166
+ prog_bar=False,
167
+ logger=True,
168
+ sync_dist=not is_train,
169
+ )
170
+
171
+ # Top-5 accuracy
172
+ accuracy = self.get_accuracy(codebook_logits, codebook_labels)
173
+ self.log(
174
+ f"{stage}/top_5_accuracy",
175
+ accuracy,
176
+ on_step=is_train,
177
+ on_epoch=not is_train,
178
+ prog_bar=True,
179
+ logger=True,
180
+ sync_dist=not is_train,
181
+ )
182
+
183
+ return loss
184
+
185
+ def get_accuracy(self, logits, labels):
186
+ mask = (labels != -100) & (labels != CODEBOOK_PAD_TOKEN_ID)
187
+ if mask.sum() == 0:
188
+ return torch.tensor(0.0, device=logits.device)
189
+
190
+ _, indices = logits.topk(5, dim=-1)
191
+ correct = indices.eq(labels.unsqueeze(-1))
192
+ correct[~mask] = 0
193
+ correct = correct.sum()
194
+ accuracy = correct / mask.sum()
195
+
196
+ return accuracy
197
+
198
+ def training_step(self, batch, batch_idx):
199
+ return self._step(batch, batch_idx, "train")
200
+
201
+ def validation_step(self, batch, batch_idx):
202
+ return self._step(batch, batch_idx, "val")
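get_accuracy masks out both ignored positions (label -100) and codebook padding before computing top-5 accuracy over the codebook logits. A toy run of the same logic on random tensors — the shapes and the pad id of 0 are illustrative assumptions, not values read from fish_speech.conversation:

import torch

CODEBOOK_PAD_TOKEN_ID = 0  # assumed pad id, for this sketch only

def top5_accuracy(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    # logits: (batch, seq, codebooks, codebook_size), labels: (batch, seq, codebooks)
    mask = (labels != -100) & (labels != CODEBOOK_PAD_TOKEN_ID)
    if mask.sum() == 0:
        return torch.tensor(0.0)
    _, indices = logits.topk(5, dim=-1)
    correct = indices.eq(labels.unsqueeze(-1))
    correct[~mask] = 0
    return correct.sum() / mask.sum()

logits = torch.randn(2, 8, 4, 160)          # 4 codebooks with 160 entries each
labels = torch.randint(1, 160, (2, 8, 4))   # no pad or ignore tokens in this toy batch
print(top5_accuracy(logits, labels))        # roughly 5/160 for random logits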
fish_speech/models/text2semantic/llama.py CHANGED
@@ -1,779 +1,887 @@
1
- import json
2
- import math
3
- from collections import OrderedDict
4
- from dataclasses import dataclass
5
- from pathlib import Path
6
- from typing import Optional
7
-
8
- import torch
9
- import torch.nn as nn
10
- from einops import rearrange
11
- from loguru import logger
12
- from torch import Tensor
13
- from torch.nn import functional as F
14
- from torch.nn.attention import SDPBackend, sdpa_kernel
15
- from torch.utils.checkpoint import checkpoint
16
- from transformers import AutoTokenizer
17
-
18
- from fish_speech.conversation import SEMANTIC_TOKEN
19
- from fish_speech.utils import RankedLogger
20
-
21
- from .lora import LoraConfig, setup_lora
22
-
23
- log = RankedLogger(__name__, rank_zero_only=True)
24
-
25
-
26
- def find_multiple(n: int, k: int) -> int:
27
- if n % k == 0:
28
- return n
29
- return n + k - (n % k)
30
-
31
-
32
- @dataclass
33
- class BaseModelArgs:
34
- model_type: str = "base"
35
-
36
- vocab_size: int = 32000
37
- n_layer: int = 32
38
- n_head: int = 32
39
- dim: int = 4096
40
- intermediate_size: int = None
41
- n_local_heads: int = -1
42
- head_dim: int = 64
43
- rope_base: float = 10000
44
- norm_eps: float = 1e-5
45
- max_seq_len: int = 2048
46
- dropout: float = 0.0
47
- tie_word_embeddings: bool = True
48
- attention_qkv_bias: bool = False
49
-
50
- # Codebook configs
51
- codebook_size: int = 160
52
- num_codebooks: int = 4
53
-
54
- # Gradient checkpointing
55
- use_gradient_checkpointing: bool = True
56
-
57
- # Initialize the model
58
- initializer_range: float = 0.02
59
-
60
- def __post_init__(self):
61
- if self.n_local_heads == -1:
62
- self.n_local_heads = self.n_head
63
- if self.intermediate_size is None:
64
- hidden_dim = 4 * self.dim
65
- n_hidden = int(2 * hidden_dim / 3)
66
- self.intermediate_size = find_multiple(n_hidden, 256)
67
- self.head_dim = self.dim // self.n_head
68
-
69
- @staticmethod
70
- def from_pretrained(path: str):
71
- path = Path(path)
72
-
73
- if path.is_dir():
74
- path = path / "config.json"
75
-
76
- with open(path, "r", encoding="utf-8") as f:
77
- data = json.load(f)
78
-
79
- match data["model_type"]:
80
- case "naive":
81
- cls = NaiveModelArgs
82
- case "dual_ar":
83
- cls = DualARModelArgs
84
- case _:
85
- raise ValueError(f"Unknown model type: {data['model_type']}")
86
-
87
- return cls(**data)
88
-
89
- def save(self, path: str):
90
- with open(path, "w") as f:
91
- json.dump(self.__dict__, f, indent=4, sort_keys=True, ensure_ascii=False)
92
-
93
-
94
- @dataclass
95
- class NaiveModelArgs(BaseModelArgs):
96
- model_type: str = "naive"
97
-
98
-
99
- @dataclass
100
- class DualARModelArgs(BaseModelArgs):
101
- model_type: str = "dual_ar"
102
- n_fast_layer: int = 4
103
-
104
-
105
- class KVCache(nn.Module):
106
- def __init__(
107
- self, max_batch_size, max_seq_len, n_heads, head_dim, dtype=torch.bfloat16
108
- ):
109
- super().__init__()
110
- cache_shape = (max_batch_size, n_heads, max_seq_len, head_dim)
111
- self.register_buffer("k_cache", torch.zeros(cache_shape, dtype=dtype))
112
- self.register_buffer("v_cache", torch.zeros(cache_shape, dtype=dtype))
113
-
114
- def update(self, input_pos, k_val, v_val):
115
- # input_pos: [S], k_val: [B, H, S, D]
116
- assert input_pos.shape[0] == k_val.shape[2]
117
-
118
- k_out = self.k_cache
119
- v_out = self.v_cache
120
- k_out[:, :, input_pos] = k_val
121
- v_out[:, :, input_pos] = v_val
122
-
123
- return k_out, v_out
124
-
125
-
126
- @dataclass
127
- class TransformerForwardResult:
128
- token_logits: Tensor
129
- codebook_logits: Tensor
130
-
131
-
132
- @dataclass
133
- class BaseTransformerForwardResult:
134
- logits: Tensor
135
- hidden_states: Tensor
136
-
137
-
138
- class BaseTransformer(nn.Module):
139
- def __init__(
140
- self, config: BaseModelArgs, tokenizer: AutoTokenizer, init_weights: bool = True
141
- ) -> None:
142
- super().__init__()
143
- self.config = config
144
- self.tokenizer = tokenizer
145
-
146
- self.semantic_token_id = tokenizer.convert_tokens_to_ids(SEMANTIC_TOKEN)
147
-
148
- # Slow transformer
149
- self.embeddings = nn.Embedding(
150
- config.vocab_size,
151
- config.dim,
152
- )
153
- self.codebook_embeddings = nn.Embedding(
154
- config.codebook_size * config.num_codebooks,
155
- config.dim,
156
- )
157
- self.layers = nn.ModuleList(
158
- TransformerBlock(config, use_sdpa=True) for _ in range(config.n_layer)
159
- )
160
- self.norm = RMSNorm(config.dim, eps=config.norm_eps)
161
-
162
- if self.config.tie_word_embeddings is False:
163
- self.output = nn.Linear(
164
- config.dim,
165
- config.vocab_size,
166
- bias=False,
167
- )
168
-
169
- self.register_buffer(
170
- "freqs_cis",
171
- precompute_freqs_cis(
172
- config.max_seq_len,
173
- config.dim // config.n_head,
174
- config.rope_base,
175
- ),
176
- persistent=False,
177
- )
178
- self.register_buffer(
179
- "causal_mask",
180
- torch.tril(
181
- torch.ones(
182
- config.max_seq_len,
183
- config.max_seq_len,
184
- dtype=torch.bool,
185
- )
186
- ),
187
- persistent=False,
188
- )
189
-
190
- # For kv cache
191
- self.max_batch_size = -1
192
- self.max_seq_len = -1
193
-
194
- if init_weights:
195
- self.apply(self._init_weights)
196
-
197
- def setup_caches(
198
- self, max_batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16
199
- ):
200
- if self.max_seq_len >= max_seq_len and self.max_batch_size >= max_batch_size:
201
- return
202
-
203
- head_dim = self.config.dim // self.config.n_head
204
- max_seq_len = find_multiple(max_seq_len, 8)
205
- self.max_seq_len = max_seq_len
206
- self.max_batch_size = max_batch_size
207
-
208
- for b in self.layers:
209
- b.attention.kv_cache = KVCache(
210
- max_batch_size,
211
- max_seq_len,
212
- self.config.n_local_heads,
213
- head_dim,
214
- dtype=dtype,
215
- )
216
-
217
- def embed(self, x: Tensor) -> Tensor:
218
- vocab_embeds = [self.embeddings(x[:, 0])]
219
- for i in range(self.config.num_codebooks):
220
- emb = self.codebook_embeddings(x[:, i + 1] + i * self.config.codebook_size)
221
- emb[x[:, 0] != self.semantic_token_id] = 0
222
- vocab_embeds.append(emb)
223
-
224
- x = torch.stack(vocab_embeds, dim=3)
225
- x = x.sum(dim=3)
226
-
227
- return x
228
-
229
- def forward(
230
- self,
231
- inp: Tensor,
232
- key_padding_mask: Optional[Tensor] = None,
233
- ) -> BaseTransformerForwardResult:
234
- seq_len = inp.size(2)
235
-
236
- # Here we want to merge the embeddings of the codebooks
237
- x = self.embed(inp)
238
-
239
- freqs_cis = self.freqs_cis[:seq_len]
240
-
241
- # Not that the causal mask here follows the definition of scaled_dot_product_attention
242
- # That is, FALSE means masked out
243
- # To maintain consistency, key_padding_mask use TRUE to mask out
244
- mask = None
245
- if key_padding_mask is not None:
246
- mask = self.causal_mask[None, None, :seq_len, :seq_len] # (B, N, Q, K)
247
- mask = mask & key_padding_mask[:, None, None, :].logical_not()
248
-
249
- for layer in self.layers:
250
- if self.config.use_gradient_checkpointing and self.training:
251
- x = checkpoint(layer, x, freqs_cis, mask, use_reentrant=True)
252
- else:
253
- x = layer(x, freqs_cis, mask)
254
-
255
- # We got slow_out here
256
- slow_out = self.norm(x)
257
-
258
- if self.config.tie_word_embeddings:
259
- token_logits = F.linear(slow_out, self.embeddings.weight)
260
- else:
261
- token_logits = self.output(slow_out)
262
-
263
- return BaseTransformerForwardResult(
264
- logits=token_logits,
265
- hidden_states=x,
266
- )
267
-
268
- def forward_generate(
269
- self,
270
- x: Tensor,
271
- input_pos: Optional[Tensor] = None,
272
- return_all: bool = False,
273
- ) -> BaseTransformerForwardResult:
274
- # This is used for generation, optimized for torch compile
275
- assert (
276
- self.max_seq_len != -1 and self.max_batch_size != -1
277
- ), "Please call setup_caches before forward_generate"
278
-
279
- x = self.embed(x)
280
-
281
- mask = self.causal_mask[
282
- None, None, input_pos, : self.max_seq_len
283
- ] # (B, N, Q, K)
284
- freqs_cis = self.freqs_cis[input_pos]
285
-
286
- for layer in self.layers:
287
- x = layer(x, freqs_cis, mask, input_pos=input_pos)
288
-
289
- # If prefill, we only calculate the logits of last token
290
- if x.size(1) > 1 and not return_all:
291
- x = x[:, -1:]
292
-
293
- # We got slow_out here
294
- slow_out = self.norm(x)
295
-
296
- if self.config.tie_word_embeddings:
297
- token_logits = F.linear(slow_out, self.embeddings.weight)
298
- else:
299
- token_logits = self.output(slow_out)
300
-
301
- return BaseTransformerForwardResult(
302
- logits=token_logits,
303
- hidden_states=x,
304
- )
305
-
306
- def _init_weights(self, module):
307
- std = self.config.initializer_range
308
- if isinstance(module, nn.Linear):
309
- module.weight.data.normal_(mean=0.0, std=std)
310
- if module.bias is not None:
311
- module.bias.data.zero_()
312
- elif isinstance(module, nn.Embedding):
313
- module.weight.data.normal_(mean=0.0, std=std)
314
- if module.padding_idx is not None:
315
- module.weight.data[module.padding_idx].zero_()
316
-
317
- @staticmethod
318
- def from_pretrained(
319
- path: str,
320
- load_weights: bool = False,
321
- max_length: int | None = None,
322
- lora_config: LoraConfig | None = None,
323
- rope_base: int | None = None,
324
- ) -> "BaseTransformer":
325
- config = BaseModelArgs.from_pretrained(str(path))
326
- if max_length is not None:
327
- config.max_seq_len = max_length
328
- log.info(f"Override max_seq_len to {max_length}")
329
-
330
- if rope_base is not None:
331
- config.rope_base = rope_base
332
- log.info(f"Override rope_base to {rope_base}")
333
-
334
- match config.model_type:
335
- case "naive":
336
- model_cls = NaiveTransformer
337
- case "dual_ar":
338
- model_cls = DualARTransformer
339
- case _:
340
- raise ValueError(f"Unknown model type: {config.model_type}")
341
-
342
- tokenizer = AutoTokenizer.from_pretrained(str(path))
343
- log.info(f"Loading model from {path}, config: {config}")
344
- model = model_cls(config, tokenizer=tokenizer)
345
-
346
- if lora_config is not None:
347
- setup_lora(model, lora_config)
348
- log.info(f"LoRA setup: {lora_config}")
349
-
350
- if load_weights is False:
351
- log.info("Randomly initialized model")
352
- else:
353
-
354
- if "int8" in str(Path(path)):
355
- logger.info("Using int8 weight-only quantization!")
356
- from tools.llama.quantize import WeightOnlyInt8QuantHandler
357
-
358
- simple_quantizer = WeightOnlyInt8QuantHandler(model)
359
- model = simple_quantizer.convert_for_runtime()
360
-
361
- if "int4" in str(Path(path)):
362
- logger.info("Using int4 quantization!")
363
- path_comps = path.name.split("-")
364
- assert path_comps[-2].startswith("g")
365
- groupsize = int(path_comps[-2][1:])
366
- from tools.llama.quantize import WeightOnlyInt4QuantHandler
367
-
368
- simple_quantizer = WeightOnlyInt4QuantHandler(model, groupsize)
369
- model = simple_quantizer.convert_for_runtime()
370
-
371
- weights = torch.load(
372
- Path(path) / "model.pth", map_location="cpu", mmap=True
373
- )
374
-
375
- if "state_dict" in weights:
376
- logger.warning(
377
- "Using a TextToSemantic LightningModule checkpoint, "
378
- "please make sure it is a full model, not a LoRA model."
379
- )
380
- weights = weights["state_dict"]
381
-
382
- if next(iter(weights.keys())).startswith("model."):
383
- logger.info(
384
- f"Remove prefix 'model.' created by TextToSemantic LightningModule from keys"
385
- )
386
- new_weights = OrderedDict()
387
- for k, v in weights.items():
388
- new_weights[k.replace("model.", "")] = v
389
- weights = new_weights
390
-
391
- # Verify the name and shape of parameters since strict=False in load_state_dict.
392
- for k, v in model.named_parameters():
393
- if k not in weights:
394
- logger.warning(f"No weight for {k}")
395
- elif v.shape != weights[k].shape:
396
- logger.warning(
397
- f"Shape mismatch for {k}: {v.shape} vs {weights[k].shape}"
398
- )
399
-
400
- err = model.load_state_dict(weights, strict=False, assign=True)
401
- log.info(f"Loaded weights with error: {err}")
402
-
403
- return model
404
-
405
- def save_pretrained(self, path: str, drop_lora: bool = False):
406
- path = Path(path)
407
- path.mkdir(parents=True, exist_ok=True)
408
-
409
- self.config.save(path / "config.json")
410
- state_dict = self.state_dict()
411
-
412
- if drop_lora:
413
- for key in list(state_dict.keys()):
414
- if "lora" not in key:
415
- continue
416
-
417
- state_dict.pop(key)
418
- log.info(f"Drop LoRA parameter: {key}")
419
-
420
- torch.save(state_dict, path / "model.pth")
421
- self.tokenizer.save_pretrained(path)
422
-
423
-
424
- class NaiveTransformer(BaseTransformer):
425
- def __init__(self, config: NaiveModelArgs, tokenizer: AutoTokenizer) -> None:
426
- super().__init__(config, init_weights=False, tokenizer=tokenizer)
427
-
428
- self.codebook_norm = RMSNorm(config.dim, eps=config.norm_eps)
429
- self.codebook_output = nn.Linear(
430
- config.dim,
431
- config.codebook_size * config.num_codebooks,
432
- bias=False,
433
- )
434
-
435
- self.apply(self._init_weights)
436
-
437
- def decode(self, result: BaseTransformerForwardResult) -> TransformerForwardResult:
438
- token_logits = result.logits
439
- x = result.hidden_states
440
-
441
- # Codebook
442
- codebook_logits = self.codebook_output(self.codebook_norm(x))
443
- codebook_logits = rearrange(
444
- codebook_logits, "b n (c d) -> b n c d", c=self.config.num_codebooks
445
- )
446
-
447
- return TransformerForwardResult(
448
- token_logits=token_logits,
449
- codebook_logits=codebook_logits,
450
- )
451
-
452
- def forward(
453
- self,
454
- inp: Tensor,
455
- key_padding_mask: Optional[Tensor] = None,
456
- ) -> TransformerForwardResult:
457
- result = super().forward(
458
- inp=inp,
459
- key_padding_mask=key_padding_mask,
460
- )
461
- return self.decode(result)
462
-
463
- def forward_generate(
464
- self, x: Tensor, input_pos: Optional[Tensor] = None
465
- ) -> TransformerForwardResult:
466
- result = super().forward_generate(x, input_pos)
467
- return self.decode(result)
468
-
469
-
470
- class DualARTransformer(BaseTransformer):
471
- def __init__(self, config: NaiveModelArgs, tokenizer: AutoTokenizer) -> None:
472
- super().__init__(config, init_weights=False, tokenizer=tokenizer)
473
-
474
- # Fast transformer
475
- self.fast_embeddings = nn.Embedding(config.codebook_size, config.dim)
476
-
477
- # The equivalent bs is so large that sdpa doesn't work
478
- self.fast_layers = nn.ModuleList(
479
- TransformerBlock(config, use_sdpa=False) for _ in range(config.n_fast_layer)
480
- )
481
- self.fast_norm = RMSNorm(config.dim, eps=config.norm_eps)
482
- self.fast_output = nn.Linear(
483
- config.dim,
484
- config.codebook_size,
485
- bias=False,
486
- )
487
-
488
- self.apply(self._init_weights)
489
-
490
- def setup_caches(
491
- self, max_batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16
492
- ):
493
- super().setup_caches(max_batch_size, max_seq_len, dtype)
494
-
495
- head_dim = self.config.dim // self.config.n_head
496
-
497
- # Fast transformer
498
- # The max seq len here is the number of codebooks
499
- for b in self.fast_layers:
500
- b.attention.kv_cache = KVCache(
501
- max_batch_size,
502
- self.config.num_codebooks,
503
- self.config.n_local_heads,
504
- head_dim,
505
- dtype=dtype,
506
- )
507
-
508
- def forward(
509
- self,
510
- inp: Tensor,
511
- key_padding_mask: Optional[Tensor] = None,
512
- ) -> TransformerForwardResult:
513
- parent_result = super().forward(inp, key_padding_mask)
514
- token_logits = parent_result.logits
515
- x = parent_result.hidden_states
516
-
517
- # Fast transformer
518
- fast_seq_len = self.config.num_codebooks
519
- fast_mask = self.causal_mask[
520
- None, None, :fast_seq_len, :fast_seq_len
521
- ] # (B, N, Q, K)
522
- fast_freqs_cis = self.freqs_cis[:fast_seq_len]
523
-
524
- # Drop the last token and rotate left
525
- codebooks = inp[:, 1:-1, 1:]
526
- codebooks = F.pad(codebooks, (0, 1), value=0)
527
- codebook_embeddings = self.fast_embeddings(codebooks)
528
- x = torch.cat([x[:, None], codebook_embeddings], dim=1)
529
- b, s = x.size(0), x.size(2)
530
- x = rearrange(x, "b n s d -> (b s) n d") # flatten the batch and seq_len
531
-
532
- # Remove padded part
533
- codebooks = rearrange(codebooks, "b n s -> (b s) n")
534
- codebook_mask = (codebooks == 0).all(dim=-1)
535
-
536
- if torch.all(codebook_mask):
537
- # If all codebooks are padded, we keep first 8 to make sure the model runs
538
- codebook_mask[:8] = False
539
-
540
- x_bs, x_len = x.size(0), x.size(1)
541
- x = x[~codebook_mask]
542
-
543
- for layer in self.fast_layers:
544
- if self.config.use_gradient_checkpointing and self.training:
545
- x = checkpoint(layer, x, fast_freqs_cis, fast_mask, use_reentrant=True)
546
- else:
547
- x = layer(x, fast_freqs_cis, fast_mask)
548
-
549
- # unflatten the batch and num_codebooks
550
- fast_out = self.fast_norm(x)
551
- codebook_logits = self.fast_output(fast_out)
552
-
553
- # Re-pad the codebook_logits
554
- buffer = torch.zeros(
555
- x_bs,
556
- x_len,
557
- codebook_logits.size(-1),
558
- device=codebook_logits.device,
559
- dtype=codebook_logits.dtype,
560
- )
561
- buffer[~codebook_mask] = codebook_logits
562
- codebook_logits = buffer
563
-
564
- assert codebook_logits.shape[1] == self.config.num_codebooks
565
- codebook_logits = rearrange(
566
- codebook_logits,
567
- "(b s) n d -> b s n d",
568
- b=b,
569
- s=s,
570
- n=self.config.num_codebooks,
571
- )
572
-
573
- return TransformerForwardResult(
574
- token_logits=token_logits,
575
- codebook_logits=codebook_logits,
576
- )
577
-
578
- def forward_generate_fast(
579
- self, x: Tensor, input_pos: Optional[Tensor] = None
580
- ) -> Tensor:
581
- # Fast transformer
582
- x = x.view(1, 1, -1)
583
-
584
- fast_mask = self.causal_mask[
585
- None, None, input_pos, : self.config.num_codebooks
586
- ] # (B, N, Q, K)
587
- fast_freqs_cis = self.freqs_cis[input_pos]
588
-
589
- for layer in self.fast_layers:
590
- x = layer(x, fast_freqs_cis, fast_mask, input_pos=input_pos)
591
-
592
- # unflatten the batch and num_codebooks
593
- fast_out = self.fast_norm(x) # only take the last token
594
- codebook_logits = self.fast_output(fast_out)
595
-
596
- return codebook_logits
597
-
598
-
599
- class TransformerBlock(nn.Module):
600
- def __init__(self, config: BaseModelArgs, use_sdpa: bool = True) -> None:
601
- super().__init__()
602
- self.attention = Attention(config, use_sdpa=use_sdpa)
603
- self.feed_forward = FeedForward(config)
604
- self.ffn_norm = RMSNorm(config.dim, config.norm_eps)
605
- self.attention_norm = RMSNorm(config.dim, config.norm_eps)
606
-
607
- def forward(
608
- self, x: Tensor, freqs_cis: Tensor, mask: Tensor, input_pos: Tensor = None
609
- ) -> Tensor:
610
- h = x + self.attention(self.attention_norm(x), freqs_cis, mask, input_pos)
611
- out = h + self.feed_forward(self.ffn_norm(h))
612
- return out
613
-
614
-
615
- class Attention(nn.Module):
616
- def __init__(self, config: BaseModelArgs, use_sdpa: bool = True):
617
- super().__init__()
618
- assert config.dim % config.n_head == 0
619
-
620
- total_head_dim = (config.n_head + 2 * config.n_local_heads) * config.head_dim
621
- # key, query, value projections for all heads, but in a batch
622
- self.wqkv = nn.Linear(
623
- config.dim, total_head_dim, bias=config.attention_qkv_bias
624
- )
625
- self.wo = nn.Linear(config.dim, config.dim, bias=False)
626
- self.kv_cache = None
627
-
628
- self.dropout = config.dropout
629
- self.n_head = config.n_head
630
- self.head_dim = config.head_dim
631
- self.n_local_heads = config.n_local_heads
632
- self.dim = config.dim
633
- self.use_sdpa = use_sdpa
634
- self._register_load_state_dict_pre_hook(self.load_hook)
635
-
636
- def load_hook(self, state_dict, prefix, *args):
637
- if prefix + "wq.weight" in state_dict:
638
- wq = state_dict.pop(prefix + "wq.weight")
639
- wk = state_dict.pop(prefix + "wk.weight")
640
- wv = state_dict.pop(prefix + "wv.weight")
641
- state_dict[prefix + "wqkv.weight"] = torch.cat([wq, wk, wv])
642
-
643
- def forward(
644
- self,
645
- x: Tensor,
646
- freqs_cis: Tensor,
647
- mask: Tensor,
648
- input_pos: Optional[Tensor] = None,
649
- ) -> Tensor:
650
- bsz, seqlen, _ = x.shape
651
-
652
- kv_size = self.n_local_heads * self.head_dim
653
- q, k, v = self.wqkv(x).split([self.dim, kv_size, kv_size], dim=-1)
654
-
655
- q = q.view(bsz, seqlen, self.n_head, self.head_dim)
656
- k = k.view(bsz, seqlen, self.n_local_heads, self.head_dim)
657
- v = v.view(bsz, seqlen, self.n_local_heads, self.head_dim)
658
-
659
- q = apply_rotary_emb(q, freqs_cis)
660
- k = apply_rotary_emb(k, freqs_cis)
661
-
662
- q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
663
-
664
- if self.kv_cache is not None:
665
- k, v = self.kv_cache.update(input_pos, k, v)
666
-
667
- k = k.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
668
- v = v.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
669
-
670
- if self.use_sdpa:
671
- if mask is None:
672
- with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
673
- y = F.scaled_dot_product_attention(
674
- q,
675
- k,
676
- v,
677
- dropout_p=self.dropout if self.training else 0.0,
678
- is_causal=True,
679
- # No third party attn_mask here to use flash_attention
680
- )
681
- else:
682
- y = F.scaled_dot_product_attention(
683
- q,
684
- k,
685
- v,
686
- attn_mask=mask,
687
- dropout_p=self.dropout if self.training else 0.0,
688
- )
689
- else:
690
- y = self.eq_scaled_dot_product_attention(
691
- q,
692
- k,
693
- v,
694
- attn_mask=mask,
695
- dropout_p=self.dropout if self.training else 0.0,
696
- )
697
-
698
- y = y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)
699
-
700
- return self.wo(y)
701
-
702
- def eq_scaled_dot_product_attention(
703
- self,
704
- query,
705
- key,
706
- value,
707
- attn_mask=None,
708
- dropout_p=0.0,
709
- ) -> torch.Tensor:
710
- # This is a standard scaled dot product attention
711
- # It's low efficient, but it doesn't raise cuda error
712
-
713
- L, S = query.size(-2), key.size(-2)
714
- scale_factor = 1 / math.sqrt(query.size(-1))
715
- attn_bias = torch.zeros(1, 1, L, S, dtype=query.dtype, device=query.device)
716
-
717
- if attn_mask is not None:
718
- if attn_mask.dtype == torch.bool:
719
- attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
720
- else:
721
- attn_bias += attn_mask
722
-
723
- attn_weight = query @ key.transpose(-2, -1) * scale_factor
724
- attn_weight += attn_bias
725
- attn_weight = torch.softmax(attn_weight, dim=-1)
726
- attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
727
-
728
- return attn_weight @ value
729
-
730
-
731
- class FeedForward(nn.Module):
732
- def __init__(self, config: BaseModelArgs) -> None:
733
- super().__init__()
734
- self.w1 = nn.Linear(config.dim, config.intermediate_size, bias=False)
735
- self.w3 = nn.Linear(config.dim, config.intermediate_size, bias=False)
736
- self.w2 = nn.Linear(config.intermediate_size, config.dim, bias=False)
737
-
738
- def forward(self, x: Tensor) -> Tensor:
739
- return self.w2(F.silu(self.w1(x)) * self.w3(x))
740
-
741
-
742
- class RMSNorm(nn.Module):
743
- def __init__(self, dim: int, eps: float = 1e-5):
744
- super().__init__()
745
- self.eps = eps
746
- self.weight = nn.Parameter(torch.ones(dim))
747
-
748
- def _norm(self, x):
749
- return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)
750
-
751
- def forward(self, x: Tensor) -> Tensor:
752
- output = self._norm(x.float()).type_as(x)
753
- return output * self.weight
754
-
755
-
756
- def precompute_freqs_cis(seq_len: int, n_elem: int, base: int = 10000) -> Tensor:
757
- freqs = 1.0 / (
758
- base ** (torch.arange(0, n_elem, 2)[: (n_elem // 2)].float() / n_elem)
759
- )
760
- t = torch.arange(seq_len, device=freqs.device)
761
- freqs = torch.outer(t, freqs)
762
- freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
763
- cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1)
764
- return cache.to(dtype=torch.bfloat16)
765
-
766
-
767
- def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor:
768
- xshaped = x.float().reshape(*x.shape[:-1], -1, 2)
769
- freqs_cis = freqs_cis.view(1, xshaped.size(1), 1, xshaped.size(3), 2)
770
- x_out2 = torch.stack(
771
- [
772
- xshaped[..., 0] * freqs_cis[..., 0] - xshaped[..., 1] * freqs_cis[..., 1],
773
- xshaped[..., 1] * freqs_cis[..., 0] + xshaped[..., 0] * freqs_cis[..., 1],
774
- ],
775
- -1,
776
- )
777
-
778
- x_out2 = x_out2.flatten(3)
779
- return x_out2.type_as(x)
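The attention stack above relies on rotary position embeddings via precompute_freqs_cis and apply_rotary_emb. A standalone re-implementation, for illustration only, that checks the expected shapes and the fact that the rotation leaves per-head vector norms unchanged (the bfloat16 cast of the original cache is dropped here to keep the check in float32):

import torch

def precompute_freqs_cis(seq_len: int, n_elem: int, base: int = 10000) -> torch.Tensor:
    freqs = 1.0 / (base ** (torch.arange(0, n_elem, 2)[: n_elem // 2].float() / n_elem))
    freqs = torch.outer(torch.arange(seq_len), freqs)
    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
    return torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1)  # (seq_len, n_elem // 2, 2)

def apply_rotary_emb(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
    xshaped = x.float().reshape(*x.shape[:-1], -1, 2)
    fc = freqs_cis.view(1, xshaped.size(1), 1, xshaped.size(3), 2)
    x_out = torch.stack(
        [
            xshaped[..., 0] * fc[..., 0] - xshaped[..., 1] * fc[..., 1],
            xshaped[..., 1] * fc[..., 0] + xshaped[..., 0] * fc[..., 1],
        ],
        -1,
    )
    return x_out.flatten(3).type_as(x)

x = torch.randn(1, 16, 8, 64)  # (batch, seq_len, n_head, head_dim)
y = apply_rotary_emb(x, precompute_freqs_cis(16, 64))
print(y.shape)                                                    # torch.Size([1, 16, 8, 64])
print(torch.allclose(x.norm(dim=-1), y.norm(dim=-1), atol=1e-4))  # True: a pure rotation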
 
1
+ import dataclasses
2
+ import json
3
+ import math
4
+ from collections import OrderedDict
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from einops import rearrange
12
+ from loguru import logger
13
+ from torch import Tensor
14
+ from torch.nn import functional as F
15
+ from torch.nn.attention import SDPBackend, sdpa_kernel
16
+ from torch.utils.checkpoint import checkpoint
17
+ from transformers import AutoTokenizer
18
+
19
+ from fish_speech.tokenizer import SEMANTIC_TOKENS, FishTokenizer
20
+ from fish_speech.utils import RankedLogger
21
+
22
+ from .lora import LoraConfig, setup_lora
23
+
24
+ log = RankedLogger(__name__, rank_zero_only=True)
25
+
26
+
27
+ def find_multiple(n: int, k: int) -> int:
28
+ if n % k == 0:
29
+ return n
30
+ return n + k - (n % k)
31
+
32
+
33
+ @dataclass
34
+ class BaseModelArgs:
35
+ model_type: str = "base"
36
+
37
+ vocab_size: int = 32000
38
+ n_layer: int = 32
39
+ n_head: int = 32
40
+ dim: int = 4096
41
+ intermediate_size: int = None
42
+ n_local_heads: int = -1
43
+ head_dim: int = 64
44
+ rope_base: float = 10000
45
+ norm_eps: float = 1e-5
46
+ max_seq_len: int = 2048
47
+ dropout: float = 0.0
48
+ tie_word_embeddings: bool = True
49
+ attention_qkv_bias: bool = False
50
+
51
+ # Codebook configs
52
+ codebook_size: int = 160
53
+ num_codebooks: int = 4
54
+
55
+ # Gradient checkpointing
56
+ use_gradient_checkpointing: bool = True
57
+
58
+ # Initialize the model
59
+ initializer_range: float = 0.02
60
+
61
+ # Dummy vars
62
+ is_reward_model: bool = False
63
+ share_codebook_embeddings: bool = True
64
+ scale_codebook_embeddings: bool = False
65
+
66
+ def __post_init__(self):
67
+ if self.n_local_heads == -1:
68
+ self.n_local_heads = self.n_head
69
+ if self.intermediate_size is None:
70
+ hidden_dim = 4 * self.dim
71
+ n_hidden = int(2 * hidden_dim / 3)
72
+ self.intermediate_size = find_multiple(n_hidden, 256)
73
+ self.head_dim = self.dim // self.n_head
74
+
75
+ @staticmethod
76
+ def from_pretrained(path: str):
77
+ path = Path(path)
78
+
79
+ if path.is_dir():
80
+ path = path / "config.json"
81
+
82
+ with open(path, "r", encoding="utf-8") as f:
83
+ data = json.load(f)
84
+
85
+ match data["model_type"]:
86
+ case "naive":
87
+ cls = NaiveModelArgs
88
+ case "dual_ar":
89
+ cls = DualARModelArgs
90
+ case _:
91
+ raise ValueError(f"Unknown model type: {data['model_type']}")
92
+
93
+ return cls(**data)
94
+
95
+ def save(self, path: str):
96
+ with open(path, "w") as f:
97
+ json.dump(self.__dict__, f, indent=4, sort_keys=True, ensure_ascii=False)
98
+
99
+
100
+ @dataclass
101
+ class NaiveModelArgs(BaseModelArgs):
102
+ model_type: str = "naive"
103
+
104
+
105
+ @dataclass
106
+ class DualARModelArgs(BaseModelArgs):
107
+ model_type: str = "dual_ar"
108
+ n_fast_layer: int = 4
109
+ fast_dim: int | None = None
110
+ fast_n_head: int | None = None
111
+ fast_n_local_heads: int | None = None
112
+ fast_head_dim: int | None = None
113
+ fast_intermediate_size: int | None = None
114
+ fast_attention_qkv_bias: bool | None = None
115
+
116
+ def __post_init__(self):
117
+ super().__post_init__()
118
+
119
+ self.fast_dim = self.fast_dim or self.dim
120
+ self.fast_n_head = self.fast_n_head or self.n_head
121
+ self.fast_n_local_heads = self.fast_n_local_heads or self.n_local_heads
122
+ self.fast_head_dim = self.fast_head_dim or self.head_dim
123
+ self.fast_intermediate_size = (
124
+ self.fast_intermediate_size or self.intermediate_size
125
+ )
126
+ self.fast_attention_qkv_bias = (
127
+ self.fast_attention_qkv_bias
128
+ if self.fast_attention_qkv_bias is not None
129
+ else self.attention_qkv_bias
130
+ )
131
+
132
+
133
+ class KVCache(nn.Module):
134
+ def __init__(
135
+ self, max_batch_size, max_seq_len, n_heads, head_dim, dtype=torch.bfloat16
136
+ ):
137
+ super().__init__()
138
+ cache_shape = (max_batch_size, n_heads, max_seq_len, head_dim)
139
+ self.register_buffer("k_cache", torch.zeros(cache_shape, dtype=dtype))
140
+ self.register_buffer("v_cache", torch.zeros(cache_shape, dtype=dtype))
141
+
142
+ def update(self, input_pos, k_val, v_val):
143
+ # input_pos: [S], k_val: [B, H, S, D]
144
+ assert input_pos.shape[0] == k_val.shape[2]
145
+
146
+ k_out = self.k_cache
147
+ v_out = self.v_cache
148
+ k_out[:, :, input_pos] = k_val
149
+ v_out[:, :, input_pos] = v_val
150
+
151
+ return k_out, v_out
152
+
153
+
154
+ @dataclass
155
+ class TransformerForwardResult:
156
+ token_logits: Tensor
157
+ codebook_logits: Tensor
158
+
159
+
160
+ @dataclass
161
+ class BaseTransformerForwardResult:
162
+ logits: Tensor
163
+ hidden_states: Tensor
164
+
165
+
166
+ class BaseTransformer(nn.Module):
167
+ def __init__(
168
+ self,
169
+ config: BaseModelArgs,
170
+ tokenizer: FishTokenizer | AutoTokenizer,
171
+ init_weights: bool = True,
172
+ ) -> None:
173
+ super().__init__()
174
+ self.config = config
175
+ self.tokenizer = tokenizer
176
+ self.semantic_token_ids = [
177
+ tokenizer.get_token_id(SEMANTIC_TOKEN) for SEMANTIC_TOKEN in SEMANTIC_TOKENS
178
+ ]
179
+
180
+ # Slow transformer
181
+ self.embeddings = nn.Embedding(
182
+ config.vocab_size,
183
+ config.dim,
184
+ )
185
+ self.codebook_embeddings = nn.Embedding(
186
+ config.codebook_size * config.num_codebooks,
187
+ config.dim,
188
+ )
189
+ self.layers = nn.ModuleList(
190
+ TransformerBlock(config, use_sdpa=True) for _ in range(config.n_layer)
191
+ )
192
+ self.norm = RMSNorm(config.dim, eps=config.norm_eps)
193
+
194
+ if self.config.tie_word_embeddings is False:
195
+ self.output = nn.Linear(
196
+ config.dim,
197
+ config.vocab_size,
198
+ bias=False,
199
+ )
200
+
201
+ self.register_buffer(
202
+ "freqs_cis",
203
+ precompute_freqs_cis(
204
+ config.max_seq_len,
205
+ config.dim // config.n_head,
206
+ config.rope_base,
207
+ ),
208
+ persistent=False,
209
+ )
210
+ self.register_buffer(
211
+ "causal_mask",
212
+ torch.tril(
213
+ torch.ones(
214
+ config.max_seq_len,
215
+ config.max_seq_len,
216
+ dtype=torch.bool,
217
+ )
218
+ ),
219
+ persistent=False,
220
+ )
221
+
222
+ # For kv cache
223
+ self.max_batch_size = -1
224
+ self.max_seq_len = -1
225
+
226
+ if init_weights:
227
+ self.apply(self._init_weights)
228
+
229
+ def setup_caches(
230
+ self, max_batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16
231
+ ):
232
+ if self.max_seq_len >= max_seq_len and self.max_batch_size >= max_batch_size:
233
+ return
234
+
235
+ head_dim = self.config.dim // self.config.n_head
236
+ max_seq_len = find_multiple(max_seq_len, 8)
237
+ self.max_seq_len = max_seq_len
238
+ self.max_batch_size = max_batch_size
239
+
240
+ for b in self.layers:
241
+ b.attention.kv_cache = KVCache(
242
+ max_batch_size,
243
+ max_seq_len,
244
+ self.config.n_local_heads,
245
+ head_dim,
246
+ dtype=dtype,
247
+ )
248
+
249
+ def embed(self, x: Tensor) -> Tensor:
250
+ vocab_embeds = [self.embeddings(x[:, 0])]
251
+ for i in range(self.config.num_codebooks):
252
+ emb = self.codebook_embeddings(x[:, i + 1] + i * self.config.codebook_size)
253
+ semantic_token_ids_tensor = torch.tensor(
254
+ self.semantic_token_ids, device=x.device
255
+ )
256
+ emb[~torch.isin(x[:, 0], semantic_token_ids_tensor)] = 0
+ vocab_embeds.append(emb)
257
+
258
+ x = torch.stack(vocab_embeds, dim=3)
259
+ x = x.sum(dim=3)
260
+
261
+ return x
262
+
263
+ def forward(
264
+ self,
265
+ inp: Tensor,
266
+ key_padding_mask: Optional[Tensor] = None,
267
+ ) -> BaseTransformerForwardResult:
268
+ seq_len = inp.size(2)
269
+
270
+ # Here we want to merge the embeddings of the codebooks
271
+ x = self.embed(inp)
272
+
273
+ freqs_cis = self.freqs_cis[:seq_len]
274
+
275
+ # Note that the causal mask here follows the definition of scaled_dot_product_attention
276
+ # That is, FALSE means masked out
277
+ # To maintain consistency, key_padding_mask uses TRUE to mask out
278
+ mask = None
279
+ if key_padding_mask is not None:
280
+ mask = self.causal_mask[None, None, :seq_len, :seq_len] # (B, N, Q, K)
281
+ mask = mask & key_padding_mask[:, None, None, :].logical_not()
282
+
283
+ for layer in self.layers:
284
+ if self.config.use_gradient_checkpointing and self.training:
285
+ x = checkpoint(layer, x, freqs_cis, mask, use_reentrant=True)
286
+ else:
287
+ x = layer(x, freqs_cis, mask)
288
+
289
+ # We got slow_out here
290
+ slow_out = self.norm(x)
291
+
292
+ if self.config.tie_word_embeddings:
293
+ token_logits = F.linear(slow_out, self.embeddings.weight)
294
+ else:
295
+ token_logits = self.output(slow_out)
296
+
297
+ return BaseTransformerForwardResult(
298
+ logits=token_logits,
299
+ hidden_states=x,
300
+ )
301
+
302
+ def forward_generate(
303
+ self,
304
+ inp: Tensor,
305
+ input_pos: Optional[Tensor] = None,
306
+ vq_masks: Optional[Tensor] = None, # this is not used in fact
307
+ return_all: bool = False,
308
+ ) -> BaseTransformerForwardResult:
309
+ # This is used for generation, optimized for torch compile
310
+ # assert (
311
+ # self.max_seq_len != -1 and self.max_batch_size != -1
312
+ # ), "Please call setup_caches before forward_generate"
313
+
314
+ embeds = []
315
+ for i in range(self.config.num_codebooks):
316
+ if self.config.share_codebook_embeddings:
317
+ _tokens = inp[:, i + 1] + i * self.config.codebook_size
318
+ else:
319
+ _tokens = inp[:, i + 1]
320
+
321
+ emb = self.codebook_embeddings(_tokens)
322
+ embeds.append(emb)
323
+
324
+ vq_embeds_sum = torch.stack(embeds, dim=1).sum(dim=1)
325
+ # if self.config.use_codebook_mlp:
326
+ # vq_embeds_sum = vq_embeds_sum / self.config.num_codebooks
327
+ # vq_embeds_sum = self.codebook_mlp(vq_embeds_sum)
328
+
329
+ vq_masks = (inp[:, 0] >= self.tokenizer.semantic_begin_id) & (
330
+ inp[:, 0] <= self.tokenizer.semantic_end_id
331
+ )
332
+
333
+ vq_embeds_sum[~vq_masks] = 0
334
+ x = self.embeddings(inp[:, 0]) + vq_embeds_sum
335
+
336
+ if input_pos is None:
337
+ input_pos = torch.arange(inp.shape[-1], device=x.device)
338
+ max_seq_len = inp.shape[-1]
339
+ else:
340
+ max_seq_len = self.max_seq_len
341
+
342
+ mask = self.causal_mask[None, None, input_pos, :max_seq_len] # (B, N, Q, K)
343
+ freqs_cis = self.freqs_cis[input_pos]
344
+
345
+ for layer in self.layers:
346
+ x = layer(x, freqs_cis, mask, input_pos=input_pos)
347
+
348
+ # During prefill, we only calculate the logits of the last token
349
+ if x.size(1) > 1 and not return_all:
350
+ x = x[:, -1:]
351
+
352
+ # We got slow_out here
353
+ slow_out = self.norm(x)
354
+
355
+ if self.config.is_reward_model:
356
+ token_logits = self.score_output(slow_out)
357
+ elif self.config.tie_word_embeddings:
358
+ token_logits = F.linear(slow_out, self.embeddings.weight)
359
+ else:
360
+ token_logits = self.output(slow_out)
361
+
362
+ return BaseTransformerForwardResult(
363
+ logits=token_logits,
364
+ hidden_states=x,
365
+ )
366
+
367
+ def _init_weights(self, module):
368
+ std = self.config.initializer_range
369
+ if isinstance(module, nn.Linear):
370
+ module.weight.data.normal_(mean=0.0, std=std)
371
+ if module.bias is not None:
372
+ module.bias.data.zero_()
373
+ elif isinstance(module, nn.Embedding):
374
+ module.weight.data.normal_(mean=0.0, std=std)
375
+ if module.padding_idx is not None:
376
+ module.weight.data[module.padding_idx].zero_()
377
+
378
+ @staticmethod
379
+ def from_pretrained(
380
+ path: str,
381
+ load_weights: bool = False,
382
+ max_length: int | None = None,
383
+ lora_config: LoraConfig | None = None,
384
+ rope_base: int | None = None,
385
+ is_agent: bool = False,
386
+ ) -> "BaseTransformer":
387
+ config = BaseModelArgs.from_pretrained(str(path))
388
+ if max_length is not None:
389
+ config.max_seq_len = max_length
390
+ log.info(f"Override max_seq_len to {max_length}")
391
+
392
+ if rope_base is not None:
393
+ config.rope_base = rope_base
394
+ log.info(f"Override rope_base to {rope_base}")
395
+
396
+ match config.model_type:
397
+ case "naive":
398
+ model_cls = NaiveTransformer
399
+ case "dual_ar":
400
+ model_cls = DualARTransformer
401
+ case _:
402
+ raise ValueError(f"Unknown model type: {config.model_type}")
403
+
404
+ if is_agent:
405
+ tokenizer = AutoTokenizer.from_pretrained(str(path))
406
+ else:
407
+ tokenizer_path = str(path) + "/tokenizer.tiktoken"
408
+ tokenizer = FishTokenizer(tokenizer_path)
409
+
410
+ log.info(f"Loading model from {path}, config: {config}")
411
+ model = model_cls(config, tokenizer=tokenizer)
412
+
413
+ if lora_config is not None:
414
+ setup_lora(model, lora_config)
415
+ log.info(f"LoRA setup: {lora_config}")
416
+
417
+ if load_weights is False:
418
+ log.info("Randomly initialized model")
419
+ else:
420
+
421
+ if "int8" in str(Path(path)):
422
+ logger.info("Using int8 weight-only quantization!")
423
+ from tools.llama.quantize import WeightOnlyInt8QuantHandler
424
+
425
+ simple_quantizer = WeightOnlyInt8QuantHandler(model)
426
+ model = simple_quantizer.convert_for_runtime()
427
+
428
+ if "int4" in str(Path(path)):
429
+ logger.info("Using int4 quantization!")
430
+ path_comps = path.name.split("-")
431
+ assert path_comps[-2].startswith("g")
432
+ groupsize = int(path_comps[-2][1:])
433
+ from tools.llama.quantize import WeightOnlyInt4QuantHandler
434
+
435
+ simple_quantizer = WeightOnlyInt4QuantHandler(model, groupsize)
436
+ model = simple_quantizer.convert_for_runtime()
437
+
438
+ weights = torch.load(
439
+ Path(path) / "model.pth",
440
+ map_location="cpu",
441
+ mmap=True,
442
+ weights_only=True,
443
+ )
444
+
445
+ if "state_dict" in weights:
446
+ logger.warning(
447
+ "Using a TextToSemantic LightningModule checkpoint, "
448
+ "please make sure it is a full model, not a LoRA model."
449
+ )
450
+ weights = weights["state_dict"]
451
+
452
+ if next(iter(weights.keys())).startswith("model."):
453
+ logger.info(
454
+ f"Remove prefix 'model.' created by TextToSemantic LightningModule from keys"
455
+ )
456
+ new_weights = OrderedDict()
457
+ for k, v in weights.items():
458
+ new_weights[k.replace("model.", "")] = v
459
+ weights = new_weights
460
+
461
+ # Verify the name and shape of parameters since strict=False in load_state_dict.
462
+ for k, v in model.named_parameters():
463
+ if k not in weights:
464
+ logger.warning(f"No weight for {k}")
465
+ elif v.shape != weights[k].shape:
466
+ logger.warning(
467
+ f"Shape mismatch for {k}: {v.shape} vs {weights[k].shape}"
468
+ )
469
+
470
+ err = model.load_state_dict(weights, strict=False, assign=True)
471
+ log.info(f"Loaded weights with error: {err}")
472
+
473
+ return model
474
+
475
+ def save_pretrained(self, path: str, drop_lora: bool = False):
476
+ path = Path(path)
477
+ path.mkdir(parents=True, exist_ok=True)
478
+
479
+ self.config.save(path / "config.json")
480
+ state_dict = self.state_dict()
481
+
482
+ if drop_lora:
483
+ for key in list(state_dict.keys()):
484
+ if "lora" not in key:
485
+ continue
486
+
487
+ state_dict.pop(key)
488
+ log.info(f"Drop LoRA parameter: {key}")
489
+
490
+ torch.save(state_dict, path / "model.pth")
491
+ self.tokenizer.save_pretrained(path)
492
+
493
+
494
+ class NaiveTransformer(BaseTransformer):
495
+ def __init__(self, config: NaiveModelArgs, tokenizer: FishTokenizer) -> None:
496
+ super().__init__(config, init_weights=False, tokenizer=tokenizer)
497
+
498
+ self.codebook_norm = RMSNorm(config.dim, eps=config.norm_eps)
499
+ self.codebook_output = nn.Linear(
500
+ config.dim,
501
+ config.codebook_size * config.num_codebooks,
502
+ bias=False,
503
+ )
504
+
505
+ self.apply(self._init_weights)
506
+
507
+ def decode(self, result: BaseTransformerForwardResult) -> TransformerForwardResult:
508
+ token_logits = result.logits
509
+ x = result.hidden_states
510
+
511
+ # Codebook
512
+ codebook_logits = self.codebook_output(self.codebook_norm(x))
513
+ codebook_logits = rearrange(
514
+ codebook_logits, "b n (c d) -> b n c d", c=self.config.num_codebooks
515
+ )
516
+
517
+ return TransformerForwardResult(
518
+ token_logits=token_logits,
519
+ codebook_logits=codebook_logits,
520
+ )
521
+
522
+ def forward(
523
+ self,
524
+ inp: Tensor,
525
+ key_padding_mask: Optional[Tensor] = None,
526
+ ) -> TransformerForwardResult:
527
+ result = super().forward(
528
+ inp=inp,
529
+ key_padding_mask=key_padding_mask,
530
+ )
531
+ return self.decode(result)
532
+
533
+ def forward_generate(
534
+ self, x: Tensor, input_pos: Optional[Tensor] = None
535
+ ) -> TransformerForwardResult:
536
+ result = super().forward_generate(x, input_pos)
537
+ return self.decode(result)
538
+
539
+
540
+ class DualARTransformer(BaseTransformer):
541
+ def __init__(self, config: DualARModelArgs, tokenizer: FishTokenizer) -> None:
542
+ super().__init__(config, init_weights=False, tokenizer=tokenizer)
543
+
544
+ # Project to fast dim if needed
545
+ if config.fast_dim is not None and config.fast_dim != config.dim:
546
+ self.fast_project_in = nn.Linear(config.dim, config.fast_dim)
547
+ else:
548
+ self.fast_project_in = nn.Identity()
549
+
550
+ # Fast transformer
551
+ self.fast_embeddings = nn.Embedding(config.codebook_size, config.fast_dim)
552
+
553
+ # The equivalent batch size is so large that SDPA doesn't work
554
+ override_config = dataclasses.replace(
555
+ config,
556
+ dim=config.fast_dim,
557
+ n_head=config.fast_n_head,
558
+ n_local_heads=config.fast_n_local_heads,
559
+ head_dim=config.fast_head_dim,
560
+ intermediate_size=config.fast_intermediate_size,
561
+ attention_qkv_bias=config.fast_attention_qkv_bias,
562
+ )
563
+
564
+ self.fast_layers = nn.ModuleList(
565
+ TransformerBlock(override_config, use_sdpa=False)
566
+ for _ in range(config.n_fast_layer)
567
+ )
568
+ self.fast_norm = RMSNorm(config.fast_dim, eps=config.norm_eps)
569
+ self.fast_output = nn.Linear(
570
+ config.fast_dim,
571
+ config.codebook_size,
572
+ bias=False,
573
+ )
574
+
575
+ self.register_buffer(
576
+ "fast_freqs_cis",
577
+ precompute_freqs_cis(
578
+ config.num_codebooks,
579
+ config.fast_dim // config.fast_n_head,
580
+ config.rope_base,
581
+ ),
582
+ persistent=False,
583
+ )
584
+ self.apply(self._init_weights)
585
+
586
+ def setup_caches(
587
+ self, max_batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16
588
+ ):
589
+ super().setup_caches(max_batch_size, max_seq_len, dtype)
590
+
591
+ head_dim = self.config.fast_dim // self.config.fast_n_head
592
+
593
+ # Fast transformer
594
+ # The max seq len here is the number of codebooks
595
+ for b in self.fast_layers:
596
+ b.attention.kv_cache = KVCache(
597
+ max_batch_size,
598
+ self.config.num_codebooks,
599
+ self.config.fast_n_local_heads,
600
+ head_dim,
601
+ dtype=dtype,
602
+ )
603
+
604
+ def forward(
605
+ self,
606
+ inp: Tensor,
607
+ key_padding_mask: Optional[Tensor] = None,
608
+ ) -> TransformerForwardResult:
609
+ parent_result = super().forward(inp, key_padding_mask)
610
+ token_logits = parent_result.logits
611
+ x = parent_result.hidden_states
612
+ x = self.fast_project_in(x)
613
+
614
+ # Fast transformer
615
+ fast_seq_len = self.config.num_codebooks
616
+ fast_mask = self.causal_mask[
617
+ None, None, :fast_seq_len, :fast_seq_len
618
+ ] # (B, N, Q, K)
619
+
620
+ # Drop the last token and rotate left
621
+ codebooks = inp[:, 1:-1, 1:]
622
+ codebooks = F.pad(codebooks, (0, 1), value=0)
623
+ codebook_embeddings = self.fast_embeddings(codebooks)
624
+ x = torch.cat([x[:, None], codebook_embeddings], dim=1)
625
+ b, s = x.size(0), x.size(2)
626
+ x = rearrange(x, "b n s d -> (b s) n d") # flatten the batch and seq_len
627
+
628
+ # Remove padded part
629
+ codebooks = rearrange(codebooks, "b n s -> (b s) n")
630
+ codebook_mask = (codebooks == 0).all(dim=-1)
631
+
632
+ if torch.all(codebook_mask):
633
+ # If all codebooks are padded, we keep the first 8 to make sure the model runs
634
+ codebook_mask[:8] = False
635
+
636
+ x_bs, x_len = x.size(0), x.size(1)
637
+ x = x[~codebook_mask]
638
+
639
+ for layer in self.fast_layers:
640
+ if self.config.use_gradient_checkpointing and self.training:
641
+ x = checkpoint(
642
+ layer, x, self.fast_freqs_cis, fast_mask, use_reentrant=True
643
+ )
644
+ else:
645
+ x = layer(x, self.fast_freqs_cis, fast_mask)
646
+
647
+ # unflatten the batch and num_codebooks
648
+ fast_out = self.fast_norm(x)
649
+ codebook_logits = self.fast_output(fast_out)
650
+
651
+ # Re-pad the codebook_logits
652
+ buffer = torch.zeros(
653
+ x_bs,
654
+ x_len,
655
+ codebook_logits.size(-1),
656
+ device=codebook_logits.device,
657
+ dtype=codebook_logits.dtype,
658
+ )
659
+ buffer[~codebook_mask] = codebook_logits
660
+ codebook_logits = buffer
661
+
662
+ assert codebook_logits.shape[1] == self.config.num_codebooks
663
+ codebook_logits = rearrange(
664
+ codebook_logits,
665
+ "(b s) n d -> b s n d",
666
+ b=b,
667
+ s=s,
668
+ n=self.config.num_codebooks,
669
+ )
670
+
671
+ return TransformerForwardResult(
672
+ token_logits=token_logits,
673
+ codebook_logits=codebook_logits,
674
+ )
675
+
676
+ def forward_generate_fast(
677
+ self, x: Tensor, input_pos: Optional[Tensor] = None
678
+ ) -> Tensor:
679
+ # Fast transformer
680
+ x = x.view(1, 1, -1)
681
+
682
+ fast_mask = self.causal_mask[
683
+ None, None, input_pos, : self.config.num_codebooks
684
+ ] # (B, N, Q, K)
685
+ fast_freqs_cis = self.fast_freqs_cis[input_pos]
686
+
687
+ for layer in self.fast_layers:
688
+ x = layer(x, fast_freqs_cis, fast_mask, input_pos=input_pos)
689
+
690
+ # unflatten the batch and num_codebooks
691
+ fast_out = self.fast_norm(x) # only take the last token
692
+ codebook_logits = self.fast_output(fast_out)
693
+
694
+ return codebook_logits
695
+
696
+ def forward_generate(
697
+ self,
698
+ x: Tensor,
699
+ input_pos: Optional[Tensor] = None,
700
+ vq_masks: Optional[Tensor] = None,
701
+ ) -> TransformerForwardResult:
702
+ x = super().forward_generate(x, input_pos, vq_masks)
703
+ x.hidden_states = self.fast_project_in(x.hidden_states)
704
+ return x
705
+
706
+
707
+ class TransformerBlock(nn.Module):
708
+ def __init__(self, config: BaseModelArgs, use_sdpa: bool = True) -> None:
709
+ super().__init__()
710
+ self.attention = Attention(config, use_sdpa=use_sdpa)
711
+ self.feed_forward = FeedForward(config)
712
+ self.ffn_norm = RMSNorm(config.dim, config.norm_eps)
713
+ self.attention_norm = RMSNorm(config.dim, config.norm_eps)
714
+
715
+ def forward(
716
+ self, x: Tensor, freqs_cis: Tensor, mask: Tensor, input_pos: Tensor = None
717
+ ) -> Tensor:
718
+ h = x + self.attention(self.attention_norm(x), freqs_cis, mask, input_pos)
719
+ out = h + self.feed_forward(self.ffn_norm(h))
720
+ return out
721
+
722
+
723
+ class Attention(nn.Module):
724
+ def __init__(self, config: BaseModelArgs, use_sdpa: bool = True):
725
+ super().__init__()
726
+ assert config.dim % config.n_head == 0
727
+
728
+ total_head_dim = (config.n_head + 2 * config.n_local_heads) * config.head_dim
729
+ # key, query, value projections for all heads, but in a batch
730
+ self.wqkv = nn.Linear(
731
+ config.dim, total_head_dim, bias=config.attention_qkv_bias
732
+ )
733
+ self.wo = nn.Linear(config.dim, config.dim, bias=False)
734
+ self.kv_cache = None
735
+
736
+ self.dropout = config.dropout
737
+ self.n_head = config.n_head
738
+ self.head_dim = config.head_dim
739
+ self.n_local_heads = config.n_local_heads
740
+ self.dim = config.dim
741
+ self.use_sdpa = use_sdpa
742
+ self._register_load_state_dict_pre_hook(self.load_hook)
743
+
744
+ def load_hook(self, state_dict, prefix, *args):
745
+ if prefix + "wq.weight" in state_dict:
746
+ wq = state_dict.pop(prefix + "wq.weight")
747
+ wk = state_dict.pop(prefix + "wk.weight")
748
+ wv = state_dict.pop(prefix + "wv.weight")
749
+ state_dict[prefix + "wqkv.weight"] = torch.cat([wq, wk, wv])
750
+
751
+ def forward(
752
+ self,
753
+ x: Tensor,
754
+ freqs_cis: Tensor,
755
+ mask: Tensor,
756
+ input_pos: Optional[Tensor] = None,
757
+ ) -> Tensor:
758
+ bsz, seqlen, _ = x.shape
759
+
760
+ kv_size = self.n_local_heads * self.head_dim
761
+ q, k, v = self.wqkv(x).split([self.dim, kv_size, kv_size], dim=-1)
762
+
763
+ q = q.view(bsz, seqlen, self.n_head, self.head_dim)
764
+ k = k.view(bsz, seqlen, self.n_local_heads, self.head_dim)
765
+ v = v.view(bsz, seqlen, self.n_local_heads, self.head_dim)
766
+
767
+ q = apply_rotary_emb(q, freqs_cis)
768
+ k = apply_rotary_emb(k, freqs_cis)
769
+
770
+ q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
771
+
772
+ if self.kv_cache is not None:
773
+ k, v = self.kv_cache.update(input_pos, k, v)
774
+
775
+ k = k.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
776
+ v = v.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
777
+
778
+ if self.use_sdpa:
779
+ if mask is None:
780
+ with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
781
+ y = F.scaled_dot_product_attention(
782
+ q,
783
+ k,
784
+ v,
785
+ dropout_p=self.dropout if self.training else 0.0,
786
+ is_causal=True,
787
+ # No third party attn_mask here to use flash_attention
788
+ )
789
+ else:
790
+ y = F.scaled_dot_product_attention(
791
+ q,
792
+ k,
793
+ v,
794
+ attn_mask=mask,
795
+ dropout_p=self.dropout if self.training else 0.0,
796
+ )
797
+ else:
798
+ y = self.eq_scaled_dot_product_attention(
799
+ q,
800
+ k,
801
+ v,
802
+ attn_mask=mask,
803
+ dropout_p=self.dropout if self.training else 0.0,
804
+ )
805
+
806
+ y = y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)
807
+
808
+ return self.wo(y)
809
+
810
+ def eq_scaled_dot_product_attention(
811
+ self,
812
+ query,
813
+ key,
814
+ value,
815
+ attn_mask=None,
816
+ dropout_p=0.0,
817
+ ) -> torch.Tensor:
818
+ # This is a standard scaled dot product attention
819
+ # It's less efficient, but it doesn't raise CUDA errors
820
+
821
+ L, S = query.size(-2), key.size(-2)
822
+ scale_factor = 1 / math.sqrt(query.size(-1))
823
+ attn_bias = torch.zeros(1, 1, L, S, dtype=query.dtype, device=query.device)
824
+
825
+ if attn_mask is not None:
826
+ if attn_mask.dtype == torch.bool:
827
+ attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
828
+ else:
829
+ attn_bias += attn_mask
830
+
831
+ attn_weight = query @ key.transpose(-2, -1) * scale_factor
832
+ attn_weight += attn_bias
833
+ attn_weight = torch.softmax(attn_weight, dim=-1)
834
+ attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
835
+
836
+ return attn_weight @ value
837
+
838
+
839
+ class FeedForward(nn.Module):
840
+ def __init__(self, config: BaseModelArgs) -> None:
841
+ super().__init__()
842
+ self.w1 = nn.Linear(config.dim, config.intermediate_size, bias=False)
843
+ self.w3 = nn.Linear(config.dim, config.intermediate_size, bias=False)
844
+ self.w2 = nn.Linear(config.intermediate_size, config.dim, bias=False)
845
+
846
+ def forward(self, x: Tensor) -> Tensor:
847
+ return self.w2(F.silu(self.w1(x)) * self.w3(x))
848
+
849
+
850
+ class RMSNorm(nn.Module):
851
+ def __init__(self, dim: int, eps: float = 1e-5):
852
+ super().__init__()
853
+ self.eps = eps
854
+ self.weight = nn.Parameter(torch.ones(dim))
855
+
856
+ def _norm(self, x):
857
+ return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)
858
+
859
+ def forward(self, x: Tensor) -> Tensor:
860
+ output = self._norm(x.float()).type_as(x)
861
+ return output * self.weight
862
+
863
+
864
+ def precompute_freqs_cis(seq_len: int, n_elem: int, base: int = 10000) -> Tensor:
865
+ freqs = 1.0 / (
866
+ base ** (torch.arange(0, n_elem, 2)[: (n_elem // 2)].float() / n_elem)
867
+ )
868
+ t = torch.arange(seq_len, device=freqs.device)
869
+ freqs = torch.outer(t, freqs)
870
+ freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
871
+ cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1)
872
+ return cache.to(dtype=torch.bfloat16)
873
+
874
+
875
+ def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor:
876
+ xshaped = x.float().reshape(*x.shape[:-1], -1, 2)
877
+ freqs_cis = freqs_cis.view(1, xshaped.size(1), 1, xshaped.size(3), 2)
878
+ x_out2 = torch.stack(
879
+ [
880
+ xshaped[..., 0] * freqs_cis[..., 0] - xshaped[..., 1] * freqs_cis[..., 1],
881
+ xshaped[..., 1] * freqs_cis[..., 0] + xshaped[..., 0] * freqs_cis[..., 1],
882
+ ],
883
+ -1,
884
+ )
885
+
886
+ x_out2 = x_out2.flatten(3)
887
+ return x_out2.type_as(x)
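A minimal sketch (not part of this commit) of how the rotary-embedding helpers above fit together; it assumes the fish_speech package is importable and that head_dim is even:

import torch

from fish_speech.models.text2semantic.llama import (
    apply_rotary_emb,
    precompute_freqs_cis,
)

seq_len, n_head, head_dim = 16, 2, 64

# Cache of (cos, sin) pairs with shape (seq_len, head_dim // 2, 2), stored in bfloat16
freqs_cis = precompute_freqs_cis(seq_len, head_dim)

# Queries are laid out as (batch, seq_len, n_head, head_dim) before the transpose in Attention.forward
q = torch.randn(1, seq_len, n_head, head_dim)
q_rot = apply_rotary_emb(q, freqs_cis)

assert q_rot.shape == q.shape  # RoPE rotates channel pairs in place, so the shape is preserved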
fish_speech/models/text2semantic/lora.py CHANGED
@@ -1,92 +1,92 @@
1
- from dataclasses import dataclass
2
-
3
- import loralib as lora
4
-
5
-
6
- @dataclass
7
- class LoraConfig:
8
- r: int
9
- lora_alpha: float
10
- lora_dropout: float = 0.0
11
-
12
-
13
- def setup_lora(model, lora_config):
14
- # Replace the embedding layer with a LoRA layer
15
- model.embeddings = lora.Embedding(
16
- num_embeddings=model.embeddings.num_embeddings,
17
- embedding_dim=model.embeddings.embedding_dim,
18
- padding_idx=model.embeddings.padding_idx,
19
- r=lora_config.r,
20
- lora_alpha=lora_config.lora_alpha,
21
- )
22
-
23
- model.codebook_embeddings = lora.Embedding(
24
- num_embeddings=model.codebook_embeddings.num_embeddings,
25
- embedding_dim=model.codebook_embeddings.embedding_dim,
26
- padding_idx=model.codebook_embeddings.padding_idx,
27
- r=lora_config.r,
28
- lora_alpha=lora_config.lora_alpha,
29
- )
30
-
31
- # Replace output layer with a LoRA layer
32
- linears = [(model, "output")]
33
-
34
- # Replace all linear layers with LoRA layers
35
- for layer in model.layers:
36
- linears.extend([(layer.attention, "wqkv"), (layer.attention, "wo")])
37
- linears.extend(
38
- [
39
- (layer.feed_forward, "w1"),
40
- (layer.feed_forward, "w2"),
41
- (layer.feed_forward, "w3"),
42
- ]
43
- )
44
-
45
- if hasattr(model, "fast_layers"):
46
- model.fast_embeddings = lora.Embedding(
47
- num_embeddings=model.fast_embeddings.num_embeddings,
48
- embedding_dim=model.fast_embeddings.embedding_dim,
49
- padding_idx=model.fast_embeddings.padding_idx,
50
- r=lora_config.r,
51
- lora_alpha=lora_config.lora_alpha,
52
- )
53
-
54
- # Dual-AR model
55
- linears.append((model, "fast_output"))
56
-
57
- for layer in model.fast_layers:
58
- linears.extend([(layer.attention, "wqkv"), (layer.attention, "wo")])
59
- linears.extend(
60
- [
61
- (layer.feed_forward, "w1"),
62
- (layer.feed_forward, "w2"),
63
- (layer.feed_forward, "w3"),
64
- ]
65
- )
66
-
67
- for module, layer in linears:
68
- updated_linear = lora.Linear(
69
- in_features=getattr(module, layer).in_features,
70
- out_features=getattr(module, layer).out_features,
71
- bias=getattr(module, layer).bias,
72
- r=lora_config.r,
73
- lora_alpha=lora_config.lora_alpha,
74
- lora_dropout=lora_config.lora_dropout,
75
- )
76
- setattr(module, layer, updated_linear)
77
-
78
- # Mark only the LoRA layers as trainable
79
- lora.mark_only_lora_as_trainable(model, bias="none")
80
-
81
-
82
- def get_merged_state_dict(model):
83
- # This line will merge the state dict of the model and the LoRA parameters
84
- model.eval()
85
-
86
- # Then we need to remove the LoRA parameters from the state dict
87
- state_dict = model.state_dict()
88
- for name in list(state_dict.keys()):
89
- if "lora" in name:
90
- state_dict.pop(name)
91
-
92
- return state_dict
 
1
+ from dataclasses import dataclass
2
+
3
+ import loralib as lora
4
+
5
+
6
+ @dataclass
7
+ class LoraConfig:
8
+ r: int
9
+ lora_alpha: float
10
+ lora_dropout: float = 0.0
11
+
12
+
13
+ def setup_lora(model, lora_config):
14
+ # Replace the embedding layer with a LoRA layer
15
+ model.embeddings = lora.Embedding(
16
+ num_embeddings=model.embeddings.num_embeddings,
17
+ embedding_dim=model.embeddings.embedding_dim,
18
+ padding_idx=model.embeddings.padding_idx,
19
+ r=lora_config.r,
20
+ lora_alpha=lora_config.lora_alpha,
21
+ )
22
+
23
+ model.codebook_embeddings = lora.Embedding(
24
+ num_embeddings=model.codebook_embeddings.num_embeddings,
25
+ embedding_dim=model.codebook_embeddings.embedding_dim,
26
+ padding_idx=model.codebook_embeddings.padding_idx,
27
+ r=lora_config.r,
28
+ lora_alpha=lora_config.lora_alpha,
29
+ )
30
+
31
+ # Replace output layer with a LoRA layer
32
+ linears = [(model, "output")]
33
+
34
+ # Replace all linear layers with LoRA layers
35
+ for layer in model.layers:
36
+ linears.extend([(layer.attention, "wqkv"), (layer.attention, "wo")])
37
+ linears.extend(
38
+ [
39
+ (layer.feed_forward, "w1"),
40
+ (layer.feed_forward, "w2"),
41
+ (layer.feed_forward, "w3"),
42
+ ]
43
+ )
44
+
45
+ if hasattr(model, "fast_layers"):
46
+ model.fast_embeddings = lora.Embedding(
47
+ num_embeddings=model.fast_embeddings.num_embeddings,
48
+ embedding_dim=model.fast_embeddings.embedding_dim,
49
+ padding_idx=model.fast_embeddings.padding_idx,
50
+ r=lora_config.r,
51
+ lora_alpha=lora_config.lora_alpha,
52
+ )
53
+
54
+ # Dual-AR model
55
+ linears.append((model, "fast_output"))
56
+
57
+ for layer in model.fast_layers:
58
+ linears.extend([(layer.attention, "wqkv"), (layer.attention, "wo")])
59
+ linears.extend(
60
+ [
61
+ (layer.feed_forward, "w1"),
62
+ (layer.feed_forward, "w2"),
63
+ (layer.feed_forward, "w3"),
64
+ ]
65
+ )
66
+
67
+ for module, layer in linears:
68
+ updated_linear = lora.Linear(
69
+ in_features=getattr(module, layer).in_features,
70
+ out_features=getattr(module, layer).out_features,
71
+ bias=getattr(module, layer).bias,
72
+ r=lora_config.r,
73
+ lora_alpha=lora_config.lora_alpha,
74
+ lora_dropout=lora_config.lora_dropout,
75
+ )
76
+ setattr(module, layer, updated_linear)
77
+
78
+ # Mark only the LoRA layers as trainable
79
+ lora.mark_only_lora_as_trainable(model, bias="none")
80
+
81
+
82
+ def get_merged_state_dict(model):
83
+ # Switching to eval() makes loralib merge the LoRA parameters into the base weights
84
+ model.eval()
85
+
86
+ # Then we need to remove the LoRA parameters from the state dict
87
+ state_dict = model.state_dict()
88
+ for name in list(state_dict.keys()):
89
+ if "lora" in name:
90
+ state_dict.pop(name)
91
+
92
+ return state_dict
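A minimal sketch (not part of this commit) of how these LoRA helpers are intended to be wired together; the checkpoint path and the r/alpha values are illustrative placeholders:

from fish_speech.models.text2semantic.llama import BaseTransformer
from fish_speech.models.text2semantic.lora import LoraConfig, get_merged_state_dict

lora_config = LoraConfig(r=8, lora_alpha=16)

# from_pretrained calls setup_lora internally when a lora_config is passed,
# swapping the embeddings and the attention/MLP linears for loralib layers.
model = BaseTransformer.from_pretrained(
    "checkpoints/my-base-model",  # placeholder path
    load_weights=True,
    lora_config=lora_config,
)

# ... fine-tune the LoRA parameters ...

# get_merged_state_dict switches the model to eval() so loralib folds the LoRA
# weights into the base weights, then drops the now-redundant lora_* tensors.
merged_state_dict = get_merged_state_dict(model)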
fish_speech/models/vqgan/lit_module.py DELETED
@@ -1,442 +0,0 @@
1
- import itertools
2
- import math
3
- from typing import Any, Callable
4
-
5
- import lightning as L
6
- import torch
7
- import torch.nn.functional as F
8
- import wandb
9
- from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger
10
- from matplotlib import pyplot as plt
11
- from torch import nn
12
-
13
- from fish_speech.models.vqgan.modules.discriminator import Discriminator
14
- from fish_speech.models.vqgan.modules.wavenet import WaveNet
15
- from fish_speech.models.vqgan.utils import avg_with_mask, plot_mel, sequence_mask
16
-
17
-
18
- class VQGAN(L.LightningModule):
19
- def __init__(
20
- self,
21
- optimizer: Callable,
22
- lr_scheduler: Callable,
23
- encoder: WaveNet,
24
- quantizer: nn.Module,
25
- decoder: WaveNet,
26
- discriminator: Discriminator,
27
- vocoder: nn.Module,
28
- encode_mel_transform: nn.Module,
29
- gt_mel_transform: nn.Module,
30
- weight_adv: float = 1.0,
31
- weight_vq: float = 1.0,
32
- weight_mel: float = 1.0,
33
- sampling_rate: int = 44100,
34
- freeze_encoder: bool = False,
35
- ):
36
- super().__init__()
37
-
38
- # Model parameters
39
- self.optimizer_builder = optimizer
40
- self.lr_scheduler_builder = lr_scheduler
41
-
42
- # Modules
43
- self.encoder = encoder
44
- self.quantizer = quantizer
45
- self.decoder = decoder
46
- self.vocoder = vocoder
47
- self.discriminator = discriminator
48
- self.encode_mel_transform = encode_mel_transform
49
- self.gt_mel_transform = gt_mel_transform
50
-
51
- # A simple linear layer to project quality to condition channels
52
- self.quality_projection = nn.Linear(1, 768)
53
-
54
- # Freeze vocoder
55
- for param in self.vocoder.parameters():
56
- param.requires_grad = False
57
-
58
- # Loss weights
59
- self.weight_adv = weight_adv
60
- self.weight_vq = weight_vq
61
- self.weight_mel = weight_mel
62
-
63
- # Other parameters
64
- self.sampling_rate = sampling_rate
65
-
66
- # Disable strict loading
67
- self.strict_loading = False
68
-
69
- # If encoder is frozen
70
- if freeze_encoder:
71
- for param in self.encoder.parameters():
72
- param.requires_grad = False
73
-
74
- for param in self.quantizer.parameters():
75
- param.requires_grad = False
76
-
77
- self.automatic_optimization = False
78
-
79
- def on_save_checkpoint(self, checkpoint):
80
- # Do not save vocoder
81
- state_dict = checkpoint["state_dict"]
82
- for name in list(state_dict.keys()):
83
- if "vocoder" in name:
84
- state_dict.pop(name)
85
-
86
- def configure_optimizers(self):
87
- optimizer_generator = self.optimizer_builder(
88
- itertools.chain(
89
- self.encoder.parameters(),
90
- self.quantizer.parameters(),
91
- self.decoder.parameters(),
92
- self.quality_projection.parameters(),
93
- )
94
- )
95
- optimizer_discriminator = self.optimizer_builder(
96
- self.discriminator.parameters()
97
- )
98
-
99
- lr_scheduler_generator = self.lr_scheduler_builder(optimizer_generator)
100
- lr_scheduler_discriminator = self.lr_scheduler_builder(optimizer_discriminator)
101
-
102
- return (
103
- {
104
- "optimizer": optimizer_generator,
105
- "lr_scheduler": {
106
- "scheduler": lr_scheduler_generator,
107
- "interval": "step",
108
- "name": "optimizer/generator",
109
- },
110
- },
111
- {
112
- "optimizer": optimizer_discriminator,
113
- "lr_scheduler": {
114
- "scheduler": lr_scheduler_discriminator,
115
- "interval": "step",
116
- "name": "optimizer/discriminator",
117
- },
118
- },
119
- )
120
-
121
- def training_step(self, batch, batch_idx):
122
- optim_g, optim_d = self.optimizers()
123
-
124
- audios, audio_lengths = batch["audios"], batch["audio_lengths"]
125
-
126
- audios = audios.float()
127
- audios = audios[:, None, :]
128
-
129
- with torch.no_grad():
130
- encoded_mels = self.encode_mel_transform(audios)
131
- gt_mels = self.gt_mel_transform(audios)
132
- quality = ((gt_mels.mean(-1) > -8).sum(-1) - 90) / 10
133
- quality = quality.unsqueeze(-1)
134
-
135
- mel_lengths = audio_lengths // self.gt_mel_transform.hop_length
136
- mel_masks = sequence_mask(mel_lengths, gt_mels.shape[2])
137
- mel_masks_float_conv = mel_masks[:, None, :].float()
138
- gt_mels = gt_mels * mel_masks_float_conv
139
- encoded_mels = encoded_mels * mel_masks_float_conv
140
-
141
- # Encode
142
- encoded_features = self.encoder(encoded_mels) * mel_masks_float_conv
143
-
144
- # Quantize
145
- vq_result = self.quantizer(encoded_features)
146
- loss_vq = getattr("vq_result", "loss", 0.0)
147
- vq_recon_features = vq_result.z * mel_masks_float_conv
148
- vq_recon_features = (
149
- vq_recon_features + self.quality_projection(quality)[:, :, None]
150
- )
151
-
152
- # VQ Decode
153
- gen_mel = (
154
- self.decoder(
155
- torch.randn_like(vq_recon_features) * mel_masks_float_conv,
156
- condition=vq_recon_features,
157
- )
158
- * mel_masks_float_conv
159
- )
160
-
161
- # Discriminator
162
- real_logits = self.discriminator(gt_mels)
163
- fake_logits = self.discriminator(gen_mel.detach())
164
- d_mask = F.interpolate(
165
- mel_masks_float_conv, size=(real_logits.shape[2],), mode="nearest"
166
- )
167
-
168
- loss_real = avg_with_mask((real_logits - 1) ** 2, d_mask)
169
- loss_fake = avg_with_mask(fake_logits**2, d_mask)
170
-
171
- loss_d = loss_real + loss_fake
172
-
173
- self.log(
174
- "train/discriminator/loss",
175
- loss_d,
176
- on_step=True,
177
- on_epoch=False,
178
- prog_bar=True,
179
- logger=True,
180
- )
181
-
182
- # Discriminator backward
183
- optim_d.zero_grad()
184
- self.manual_backward(loss_d)
185
- self.clip_gradients(
186
- optim_d, gradient_clip_val=1000.0, gradient_clip_algorithm="norm"
187
- )
188
- optim_d.step()
189
-
190
- # Mel Loss, applying l1, using a weighted sum
191
- mel_distance = (
192
- gen_mel - gt_mels
193
- ).abs() # * 0.5 + self.ssim(gen_mel, gt_mels) * 0.5
194
- loss_mel_low_freq = avg_with_mask(mel_distance[:, :40, :], mel_masks_float_conv)
195
- loss_mel_mid_freq = avg_with_mask(
196
- mel_distance[:, 40:70, :], mel_masks_float_conv
197
- )
198
- loss_mel_high_freq = avg_with_mask(
199
- mel_distance[:, 70:, :], mel_masks_float_conv
200
- )
201
- loss_mel = (
202
- loss_mel_low_freq * 0.6 + loss_mel_mid_freq * 0.3 + loss_mel_high_freq * 0.1
203
- )
204
-
205
- # Adversarial Loss
206
- fake_logits = self.discriminator(gen_mel)
207
- loss_adv = avg_with_mask((fake_logits - 1) ** 2, d_mask)
208
-
209
- # Total loss
210
- loss = (
211
- self.weight_vq * loss_vq
212
- + self.weight_mel * loss_mel
213
- + self.weight_adv * loss_adv
214
- )
215
-
216
- # Log losses
217
- self.log(
218
- "train/generator/loss",
219
- loss,
220
- on_step=True,
221
- on_epoch=False,
222
- prog_bar=True,
223
- logger=True,
224
- )
225
- self.log(
226
- "train/generator/loss_vq",
227
- loss_vq,
228
- on_step=True,
229
- on_epoch=False,
230
- prog_bar=False,
231
- logger=True,
232
- )
233
- self.log(
234
- "train/generator/loss_mel",
235
- loss_mel,
236
- on_step=True,
237
- on_epoch=False,
238
- prog_bar=False,
239
- logger=True,
240
- )
241
- self.log(
242
- "train/generator/loss_adv",
243
- loss_adv,
244
- on_step=True,
245
- on_epoch=False,
246
- prog_bar=False,
247
- logger=True,
248
- )
249
-
250
- # Generator backward
251
- optim_g.zero_grad()
252
- self.manual_backward(loss)
253
- self.clip_gradients(
254
- optim_g, gradient_clip_val=1000.0, gradient_clip_algorithm="norm"
255
- )
256
- optim_g.step()
257
-
258
- scheduler_g, scheduler_d = self.lr_schedulers()
259
- scheduler_g.step()
260
- scheduler_d.step()
261
-
262
- def validation_step(self, batch: Any, batch_idx: int):
263
- audios, audio_lengths = batch["audios"], batch["audio_lengths"]
264
-
265
- audios = audios.float()
266
- audios = audios[:, None, :]
267
-
268
- encoded_mels = self.encode_mel_transform(audios)
269
- gt_mels = self.gt_mel_transform(audios)
270
-
271
- mel_lengths = audio_lengths // self.gt_mel_transform.hop_length
272
- mel_masks = sequence_mask(mel_lengths, gt_mels.shape[2])
273
- mel_masks_float_conv = mel_masks[:, None, :].float()
274
- gt_mels = gt_mels * mel_masks_float_conv
275
- encoded_mels = encoded_mels * mel_masks_float_conv
276
-
277
- # Encode
278
- encoded_features = self.encoder(encoded_mels) * mel_masks_float_conv
279
-
280
- # Quantize
281
- vq_recon_features = self.quantizer(encoded_features).z * mel_masks_float_conv
282
- vq_recon_features = (
283
- vq_recon_features
284
- + self.quality_projection(
285
- torch.ones(
286
- vq_recon_features.shape[0], 1, device=vq_recon_features.device
287
- )
288
- * 2
289
- )[:, :, None]
290
- )
291
-
292
- # VQ Decode
293
- gen_aux_mels = (
294
- self.decoder(
295
- torch.randn_like(vq_recon_features) * mel_masks_float_conv,
296
- condition=vq_recon_features,
297
- )
298
- * mel_masks_float_conv
299
- )
300
- loss_mel = avg_with_mask((gen_aux_mels - gt_mels).abs(), mel_masks_float_conv)
301
-
302
- self.log(
303
- "val/loss_mel",
304
- loss_mel,
305
- on_step=False,
306
- on_epoch=True,
307
- prog_bar=False,
308
- logger=True,
309
- sync_dist=True,
310
- )
311
-
312
- recon_audios = self.vocoder(gt_mels)
313
- gen_aux_audios = self.vocoder(gen_aux_mels)
314
-
315
- # only log the first batch
316
- if batch_idx != 0:
317
- return
318
-
319
- for idx, (
320
- gt_mel,
321
- gen_aux_mel,
322
- audio,
323
- gen_aux_audio,
324
- recon_audio,
325
- audio_len,
326
- ) in enumerate(
327
- zip(
328
- gt_mels,
329
- gen_aux_mels,
330
- audios.cpu().float(),
331
- gen_aux_audios.cpu().float(),
332
- recon_audios.cpu().float(),
333
- audio_lengths,
334
- )
335
- ):
336
- if idx > 4:
337
- break
338
-
339
- mel_len = audio_len // self.gt_mel_transform.hop_length
340
-
341
- image_mels = plot_mel(
342
- [
343
- gt_mel[:, :mel_len],
344
- gen_aux_mel[:, :mel_len],
345
- ],
346
- [
347
- "Ground-Truth",
348
- "Auxiliary",
349
- ],
350
- )
351
-
352
- if isinstance(self.logger, WandbLogger):
353
- self.logger.experiment.log(
354
- {
355
- "reconstruction_mel": wandb.Image(image_mels, caption="mels"),
356
- "wavs": [
357
- wandb.Audio(
358
- audio[0, :audio_len],
359
- sample_rate=self.sampling_rate,
360
- caption="gt",
361
- ),
362
- wandb.Audio(
363
- gen_aux_audio[0, :audio_len],
364
- sample_rate=self.sampling_rate,
365
- caption="aux",
366
- ),
367
- wandb.Audio(
368
- recon_audio[0, :audio_len],
369
- sample_rate=self.sampling_rate,
370
- caption="recon",
371
- ),
372
- ],
373
- },
374
- )
375
-
376
- if isinstance(self.logger, TensorBoardLogger):
377
- self.logger.experiment.add_figure(
378
- f"sample-{idx}/mels",
379
- image_mels,
380
- global_step=self.global_step,
381
- )
382
- self.logger.experiment.add_audio(
383
- f"sample-{idx}/wavs/gt",
384
- audio[0, :audio_len],
385
- self.global_step,
386
- sample_rate=self.sampling_rate,
387
- )
388
- self.logger.experiment.add_audio(
389
- f"sample-{idx}/wavs/gen",
390
- gen_aux_audio[0, :audio_len],
391
- self.global_step,
392
- sample_rate=self.sampling_rate,
393
- )
394
- self.logger.experiment.add_audio(
395
- f"sample-{idx}/wavs/recon",
396
- recon_audio[0, :audio_len],
397
- self.global_step,
398
- sample_rate=self.sampling_rate,
399
- )
400
-
401
- plt.close(image_mels)
402
-
403
- def encode(self, audios, audio_lengths):
404
- audios = audios.float()
405
-
406
- mels = self.encode_mel_transform(audios)
407
- mel_lengths = audio_lengths // self.encode_mel_transform.hop_length
408
- mel_masks = sequence_mask(mel_lengths, mels.shape[2])
409
- mel_masks_float_conv = mel_masks[:, None, :].float()
410
- mels = mels * mel_masks_float_conv
411
-
412
- # Encode
413
- encoded_features = self.encoder(mels) * mel_masks_float_conv
414
- feature_lengths = mel_lengths // math.prod(self.quantizer.downsample_factor)
415
-
416
- return self.quantizer.encode(encoded_features), feature_lengths
417
-
418
- def decode(self, indices, feature_lengths, return_audios=False):
419
- factor = math.prod(self.quantizer.downsample_factor)
420
- mel_masks = sequence_mask(feature_lengths * factor, indices.shape[2] * factor)
421
- mel_masks_float_conv = mel_masks[:, None, :].float()
422
-
423
- z = self.quantizer.decode(indices) * mel_masks_float_conv
424
- z = (
425
- z
426
- + self.quality_projection(torch.ones(z.shape[0], 1, device=z.device) * 2)[
427
- :, :, None
428
- ]
429
- )
430
-
431
- gen_mel = (
432
- self.decoder(
433
- torch.randn_like(z) * mel_masks_float_conv,
434
- condition=z,
435
- )
436
- * mel_masks_float_conv
437
- )
438
-
439
- if return_audios:
440
- return self.vocoder(gen_mel)
441
-
442
- return gen_mel
fish_speech/models/vqgan/modules/discriminator.py DELETED
@@ -1,44 +0,0 @@
1
- import torch
2
- from torch import nn
3
- from torch.nn.utils.parametrizations import weight_norm
4
-
5
-
6
- class Discriminator(nn.Module):
7
- def __init__(self):
8
- super().__init__()
9
-
10
- blocks = []
11
- convs = [
12
- (1, 64, (3, 9), 1, (1, 4)),
13
- (64, 128, (3, 9), (1, 2), (1, 4)),
14
- (128, 256, (3, 9), (1, 2), (1, 4)),
15
- (256, 512, (3, 9), (1, 2), (1, 4)),
16
- (512, 1024, (3, 3), 1, (1, 1)),
17
- (1024, 1, (3, 3), 1, (1, 1)),
18
- ]
19
-
20
- for idx, (in_channels, out_channels, kernel_size, stride, padding) in enumerate(
21
- convs
22
- ):
23
- blocks.append(
24
- weight_norm(
25
- nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)
26
- )
27
- )
28
-
29
- if idx != len(convs) - 1:
30
- blocks.append(nn.SiLU(inplace=True))
31
-
32
- self.blocks = nn.Sequential(*blocks)
33
-
34
- def forward(self, x):
35
- return self.blocks(x[:, None])[:, 0]
36
-
37
-
38
- if __name__ == "__main__":
39
- model = Discriminator()
40
- print(sum(p.numel() for p in model.parameters()) / 1_000_000)
41
- x = torch.randn(1, 128, 1024)
42
- y = model(x)
43
- print(y.shape)
44
- print(y)
fish_speech/models/vqgan/modules/firefly.py CHANGED
@@ -1,596 +1,596 @@
1
- import math
2
- from functools import partial
3
- from math import prod
4
- from typing import Callable
5
-
6
- import torch
7
- import torch.nn.functional as F
8
- from torch import nn
9
- from torch.nn.utils.parametrizations import weight_norm
10
- from torch.nn.utils.parametrize import remove_parametrizations
11
- from torch.utils.checkpoint import checkpoint
12
-
13
-
14
- def sequence_mask(length, max_length=None):
15
- if max_length is None:
16
- max_length = length.max()
17
- x = torch.arange(max_length, dtype=length.dtype, device=length.device)
18
- return x.unsqueeze(0) < length.unsqueeze(1)
19
-
20
-
21
- def init_weights(m, mean=0.0, std=0.01):
22
- classname = m.__class__.__name__
23
- if classname.find("Conv1D") != -1:
24
- m.weight.data.normal_(mean, std)
25
-
26
-
27
- def get_padding(kernel_size, dilation=1):
28
- return (kernel_size * dilation - dilation) // 2
29
-
30
-
31
- def unpad1d(x: torch.Tensor, paddings: tuple[int, int]):
32
- """Remove padding from x, handling properly zero padding. Only for 1d!"""
33
- padding_left, padding_right = paddings
34
- assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
35
- assert (padding_left + padding_right) <= x.shape[-1]
36
- end = x.shape[-1] - padding_right
37
- return x[..., padding_left:end]
38
-
39
-
40
- def get_extra_padding_for_conv1d(
41
- x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
42
- ) -> int:
43
- """See `pad_for_conv1d`."""
44
- length = x.shape[-1]
45
- n_frames = (length - kernel_size + padding_total) / stride + 1
46
- ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
47
- return ideal_length - length
48
-
49
-
50
- def pad1d(
51
- x: torch.Tensor,
52
- paddings: tuple[int, int],
53
- mode: str = "zeros",
54
- value: float = 0.0,
55
- ):
56
- """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
57
- If this is the case, we insert extra 0 padding to the right
58
- before the reflection happen.
59
- """
60
- length = x.shape[-1]
61
- padding_left, padding_right = paddings
62
- assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
63
- if mode == "reflect":
64
- max_pad = max(padding_left, padding_right)
65
- extra_pad = 0
66
- if length <= max_pad:
67
- extra_pad = max_pad - length + 1
68
- x = F.pad(x, (0, extra_pad))
69
- padded = F.pad(x, paddings, mode, value)
70
- end = padded.shape[-1] - extra_pad
71
- return padded[..., :end]
72
- else:
73
- return F.pad(x, paddings, mode, value)
74
-
75
-
76
- class FishConvNet(nn.Module):
77
- def __init__(
78
- self, in_channels, out_channels, kernel_size, dilation=1, stride=1, groups=1
79
- ):
80
- super(FishConvNet, self).__init__()
81
- self.conv = nn.Conv1d(
82
- in_channels,
83
- out_channels,
84
- kernel_size,
85
- stride=stride,
86
- dilation=dilation,
87
- groups=groups,
88
- )
89
- self.stride = stride
90
- self.kernel_size = (kernel_size - 1) * dilation + 1
91
- self.dilation = dilation
92
-
93
- def forward(self, x):
94
- pad = self.kernel_size - self.stride
95
- extra_padding = get_extra_padding_for_conv1d(
96
- x, self.kernel_size, self.stride, pad
97
- )
98
- x = pad1d(x, (pad, extra_padding), mode="constant", value=0)
99
- return self.conv(x).contiguous()
100
-
101
- def weight_norm(self, name="weight", dim=0):
102
- self.conv = weight_norm(self.conv, name=name, dim=dim)
103
- return self
104
-
105
- def remove_weight_norm(self):
106
- self.conv = remove_parametrizations(self.conv)
107
- return self
108
-
109
-
110
- class FishTransConvNet(nn.Module):
111
- def __init__(self, in_channels, out_channels, kernel_size, dilation=1, stride=1):
112
- super(FishTransConvNet, self).__init__()
113
- self.conv = nn.ConvTranspose1d(
114
- in_channels, out_channels, kernel_size, stride=stride, dilation=dilation
115
- )
116
- self.stride = stride
117
- self.kernel_size = kernel_size
118
-
119
- def forward(self, x):
120
- x = self.conv(x)
121
- pad = self.kernel_size - self.stride
122
- padding_right = math.ceil(pad)
123
- padding_left = pad - padding_right
124
- x = unpad1d(x, (padding_left, padding_right))
125
- return x.contiguous()
126
-
127
- def weight_norm(self, name="weight", dim=0):
128
- self.conv = weight_norm(self.conv, name=name, dim=dim)
129
- return self
130
-
131
- def remove_weight_norm(self):
132
- self.conv = remove_parametrizations(self.conv)
133
- return self
134
-
135
-
136
- class ResBlock1(torch.nn.Module):
137
- def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
138
- super().__init__()
139
-
140
- self.convs1 = nn.ModuleList(
141
- [
142
- FishConvNet(
143
- channels, channels, kernel_size, stride=1, dilation=dilation[0]
144
- ).weight_norm(),
145
- FishConvNet(
146
- channels, channels, kernel_size, stride=1, dilation=dilation[1]
147
- ).weight_norm(),
148
- FishConvNet(
149
- channels, channels, kernel_size, stride=1, dilation=dilation[2]
150
- ).weight_norm(),
151
- ]
152
- )
153
- self.convs1.apply(init_weights)
154
-
155
- self.convs2 = nn.ModuleList(
156
- [
157
- FishConvNet(
158
- channels, channels, kernel_size, stride=1, dilation=dilation[0]
159
- ).weight_norm(),
160
- FishConvNet(
161
- channels, channels, kernel_size, stride=1, dilation=dilation[1]
162
- ).weight_norm(),
163
- FishConvNet(
164
- channels, channels, kernel_size, stride=1, dilation=dilation[2]
165
- ).weight_norm(),
166
- ]
167
- )
168
- self.convs2.apply(init_weights)
169
-
170
- def forward(self, x):
171
- for c1, c2 in zip(self.convs1, self.convs2):
172
- xt = F.silu(x)
173
- xt = c1(xt)
174
- xt = F.silu(xt)
175
- xt = c2(xt)
176
- x = xt + x
177
- return x
178
-
179
- def remove_parametrizations(self):
180
- for conv in self.convs1:
181
- remove_parametrizations(conv, tensor_name="weight")
182
- for conv in self.convs2:
183
- remove_parametrizations(conv, tensor_name="weight")
184
-
185
-
186
- class ParallelBlock(nn.Module):
187
- def __init__(
188
- self,
189
- channels: int,
190
- kernel_sizes: tuple[int] = (3, 7, 11),
191
- dilation_sizes: tuple[tuple[int]] = ((1, 3, 5), (1, 3, 5), (1, 3, 5)),
192
- ):
193
- super().__init__()
194
-
195
- assert len(kernel_sizes) == len(dilation_sizes)
196
-
197
- self.blocks = nn.ModuleList()
198
- for k, d in zip(kernel_sizes, dilation_sizes):
199
- self.blocks.append(ResBlock1(channels, k, d))
200
-
201
- def forward(self, x):
202
- return torch.stack([block(x) for block in self.blocks], dim=0).mean(dim=0)
203
-
204
- def remove_parametrizations(self):
205
- for block in self.blocks:
206
- block.remove_parametrizations()
207
-
208
-
209
- class HiFiGANGenerator(nn.Module):
210
- def __init__(
211
- self,
212
- *,
213
- hop_length: int = 512,
214
- upsample_rates: tuple[int] = (8, 8, 2, 2, 2),
215
- upsample_kernel_sizes: tuple[int] = (16, 16, 8, 2, 2),
216
- resblock_kernel_sizes: tuple[int] = (3, 7, 11),
217
- resblock_dilation_sizes: tuple[tuple[int]] = ((1, 3, 5), (1, 3, 5), (1, 3, 5)),
218
- num_mels: int = 128,
219
- upsample_initial_channel: int = 512,
220
- pre_conv_kernel_size: int = 7,
221
- post_conv_kernel_size: int = 7,
222
- post_activation: Callable = partial(nn.SiLU, inplace=True),
223
- ):
224
- super().__init__()
225
-
226
- assert (
227
- prod(upsample_rates) == hop_length
228
- ), f"hop_length must be {prod(upsample_rates)}"
229
-
230
- self.conv_pre = FishConvNet(
231
- num_mels,
232
- upsample_initial_channel,
233
- pre_conv_kernel_size,
234
- stride=1,
235
- ).weight_norm()
236
-
237
- self.num_upsamples = len(upsample_rates)
238
- self.num_kernels = len(resblock_kernel_sizes)
239
-
240
- self.noise_convs = nn.ModuleList()
241
- self.ups = nn.ModuleList()
242
-
243
- for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
244
- self.ups.append(
245
- FishTransConvNet(
246
- upsample_initial_channel // (2**i),
247
- upsample_initial_channel // (2 ** (i + 1)),
248
- k,
249
- stride=u,
250
- ).weight_norm()
251
- )
252
-
253
- self.resblocks = nn.ModuleList()
254
- for i in range(len(self.ups)):
255
- ch = upsample_initial_channel // (2 ** (i + 1))
256
- self.resblocks.append(
257
- ParallelBlock(ch, resblock_kernel_sizes, resblock_dilation_sizes)
258
- )
259
-
260
- self.activation_post = post_activation()
261
- self.conv_post = FishConvNet(
262
- ch, 1, post_conv_kernel_size, stride=1
263
- ).weight_norm()
264
- self.ups.apply(init_weights)
265
- self.conv_post.apply(init_weights)
266
-
267
- def forward(self, x):
268
- x = self.conv_pre(x)
269
-
270
- for i in range(self.num_upsamples):
271
- x = F.silu(x, inplace=True)
272
- x = self.ups[i](x)
273
-
274
- if self.training and self.checkpointing:
275
- x = checkpoint(
276
- self.resblocks[i],
277
- x,
278
- use_reentrant=False,
279
- )
280
- else:
281
- x = self.resblocks[i](x)
282
-
283
- x = self.activation_post(x)
284
- x = self.conv_post(x)
285
- x = torch.tanh(x)
286
-
287
- return x
288
-
289
- def remove_parametrizations(self):
290
- for up in self.ups:
291
- remove_parametrizations(up, tensor_name="weight")
292
- for block in self.resblocks:
293
- block.remove_parametrizations()
294
- remove_parametrizations(self.conv_pre, tensor_name="weight")
295
- remove_parametrizations(self.conv_post, tensor_name="weight")
296
-
297
-
298
- # DropPath copied from timm library
299
- def drop_path(
300
- x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
301
- ):
302
- """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
303
-
304
- This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
305
- the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
306
- See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
307
- changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
308
- 'survival rate' as the argument.
309
-
310
- """ # noqa: E501
311
-
312
- if drop_prob == 0.0 or not training:
313
- return x
314
- keep_prob = 1 - drop_prob
315
- shape = (x.shape[0],) + (1,) * (
316
- x.ndim - 1
317
- ) # work with diff dim tensors, not just 2D ConvNets
318
- random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
319
- if keep_prob > 0.0 and scale_by_keep:
320
- random_tensor.div_(keep_prob)
321
- return x * random_tensor
322
-
323
-
324
- class DropPath(nn.Module):
325
- """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" # noqa: E501
326
-
327
- def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
328
- super(DropPath, self).__init__()
329
- self.drop_prob = drop_prob
330
- self.scale_by_keep = scale_by_keep
331
-
332
- def forward(self, x):
333
- return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
334
-
335
- def extra_repr(self):
336
- return f"drop_prob={round(self.drop_prob,3):0.3f}"
337
-
338
-
339
- class LayerNorm(nn.Module):
340
- r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
341
- The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
342
- shape (batch_size, height, width, channels) while channels_first corresponds to inputs
343
- with shape (batch_size, channels, height, width).
344
- """ # noqa: E501
345
-
346
- def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
347
- super().__init__()
348
- self.weight = nn.Parameter(torch.ones(normalized_shape))
349
- self.bias = nn.Parameter(torch.zeros(normalized_shape))
350
- self.eps = eps
351
- self.data_format = data_format
352
- if self.data_format not in ["channels_last", "channels_first"]:
353
- raise NotImplementedError
354
- self.normalized_shape = (normalized_shape,)
355
-
356
- def forward(self, x):
357
- if self.data_format == "channels_last":
358
- return F.layer_norm(
359
- x, self.normalized_shape, self.weight, self.bias, self.eps
360
- )
361
- elif self.data_format == "channels_first":
362
- u = x.mean(1, keepdim=True)
363
- s = (x - u).pow(2).mean(1, keepdim=True)
364
- x = (x - u) / torch.sqrt(s + self.eps)
365
- x = self.weight[:, None] * x + self.bias[:, None]
366
- return x
367
-
368
-
369
- # ConvNeXt Block copied from https://github.com/fishaudio/fish-diffusion/blob/main/fish_diffusion/modules/convnext.py
370
- class ConvNeXtBlock(nn.Module):
371
- r"""ConvNeXt Block. There are two equivalent implementations:
372
- (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
373
- (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
374
- We use (2) as we find it slightly faster in PyTorch
375
-
376
- Args:
377
- dim (int): Number of input channels.
378
- drop_path (float): Stochastic depth rate. Default: 0.0
379
- layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
380
- mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
381
- kernel_size (int): Kernel size for depthwise conv. Default: 7.
382
- dilation (int): Dilation for depthwise conv. Default: 1.
383
- """ # noqa: E501
384
-
385
- def __init__(
386
- self,
387
- dim: int,
388
- drop_path: float = 0.0,
389
- layer_scale_init_value: float = 1e-6,
390
- mlp_ratio: float = 4.0,
391
- kernel_size: int = 7,
392
- dilation: int = 1,
393
- ):
394
- super().__init__()
395
-
396
- self.dwconv = FishConvNet(
397
- dim,
398
- dim,
399
- kernel_size=kernel_size,
400
- # padding=int(dilation * (kernel_size - 1) / 2),
401
- groups=dim,
402
- ) # depthwise conv
403
- self.norm = LayerNorm(dim, eps=1e-6)
404
- self.pwconv1 = nn.Linear(
405
- dim, int(mlp_ratio * dim)
406
- ) # pointwise/1x1 convs, implemented with linear layers
407
- self.act = nn.GELU()
408
- self.pwconv2 = nn.Linear(int(mlp_ratio * dim), dim)
409
- self.gamma = (
410
- nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
411
- if layer_scale_init_value > 0
412
- else None
413
- )
414
- self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
415
-
416
- def forward(self, x, apply_residual: bool = True):
417
- input = x
418
-
419
- x = self.dwconv(x)
420
- x = x.permute(0, 2, 1) # (N, C, L) -> (N, L, C)
421
- x = self.norm(x)
422
- x = self.pwconv1(x)
423
- x = self.act(x)
424
- x = self.pwconv2(x)
425
-
426
- if self.gamma is not None:
427
- x = self.gamma * x
428
-
429
- x = x.permute(0, 2, 1) # (N, L, C) -> (N, C, L)
430
- x = self.drop_path(x)
431
-
432
- if apply_residual:
433
- x = input + x
434
-
435
- return x
436
-
437
-
438
- class ConvNeXtEncoder(nn.Module):
439
- def __init__(
440
- self,
441
- input_channels: int = 3,
442
- depths: list[int] = [3, 3, 9, 3],
443
- dims: list[int] = [96, 192, 384, 768],
444
- drop_path_rate: float = 0.0,
445
- layer_scale_init_value: float = 1e-6,
446
- kernel_size: int = 7,
447
- ):
448
- super().__init__()
449
- assert len(depths) == len(dims)
450
-
451
- self.downsample_layers = nn.ModuleList()
452
- stem = nn.Sequential(
453
- FishConvNet(
454
- input_channels,
455
- dims[0],
456
- kernel_size=7,
457
- # padding=3,
458
- # padding_mode="replicate",
459
- # padding_mode="zeros",
460
- ),
461
- LayerNorm(dims[0], eps=1e-6, data_format="channels_first"),
462
- )
463
- self.downsample_layers.append(stem)
464
-
465
- for i in range(len(depths) - 1):
466
- mid_layer = nn.Sequential(
467
- LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
468
- nn.Conv1d(dims[i], dims[i + 1], kernel_size=1),
469
- )
470
- self.downsample_layers.append(mid_layer)
471
-
472
- self.stages = nn.ModuleList()
473
- dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
474
-
475
- cur = 0
476
- for i in range(len(depths)):
477
- stage = nn.Sequential(
478
- *[
479
- ConvNeXtBlock(
480
- dim=dims[i],
481
- drop_path=dp_rates[cur + j],
482
- layer_scale_init_value=layer_scale_init_value,
483
- kernel_size=kernel_size,
484
- )
485
- for j in range(depths[i])
486
- ]
487
- )
488
- self.stages.append(stage)
489
- cur += depths[i]
490
-
491
- self.norm = LayerNorm(dims[-1], eps=1e-6, data_format="channels_first")
492
- self.apply(self._init_weights)
493
-
494
- def _init_weights(self, m):
495
- if isinstance(m, (nn.Conv1d, nn.Linear)):
496
- nn.init.trunc_normal_(m.weight, std=0.02)
497
- nn.init.constant_(m.bias, 0)
498
-
499
- def forward(
500
- self,
501
- x: torch.Tensor,
502
- ) -> torch.Tensor:
503
- for i in range(len(self.downsample_layers)):
504
- x = self.downsample_layers[i](x)
505
- x = self.stages[i](x)
506
-
507
- return self.norm(x)
508
-
509
-
510
- class FireflyArchitecture(nn.Module):
511
- def __init__(
512
- self,
513
- backbone: nn.Module,
514
- head: nn.Module,
515
- quantizer: nn.Module,
516
- spec_transform: nn.Module,
517
- ):
518
- super().__init__()
519
-
520
- self.backbone = backbone
521
- self.head = head
522
- self.quantizer = quantizer
523
- self.spec_transform = spec_transform
524
- self.downsample_factor = math.prod(self.quantizer.downsample_factor)
525
-
526
- def forward(self, x: torch.Tensor, template=None, mask=None) -> torch.Tensor:
527
- if self.spec_transform is not None:
528
- x = self.spec_transform(x)
529
-
530
- x = self.backbone(x)
531
- if mask is not None:
532
- x = x * mask
533
-
534
- if self.quantizer is not None:
535
- vq_result = self.quantizer(x)
536
- x = vq_result.z
537
-
538
- if mask is not None:
539
- x = x * mask
540
-
541
- x = self.head(x, template=template)
542
-
543
- if x.ndim == 2:
544
- x = x[:, None, :]
545
-
546
- if self.vq is not None:
547
- return x, vq_result
548
-
549
- return x
550
-
551
- def encode(self, audios, audio_lengths):
552
- audios = audios.float()
553
-
554
- mels = self.spec_transform(audios)
555
- mel_lengths = audio_lengths // self.spec_transform.hop_length
556
- mel_masks = sequence_mask(mel_lengths, mels.shape[2])
557
- mel_masks_float_conv = mel_masks[:, None, :].float()
558
- mels = mels * mel_masks_float_conv
559
-
560
- # Encode
561
- encoded_features = self.backbone(mels) * mel_masks_float_conv
562
- feature_lengths = mel_lengths // self.downsample_factor
563
-
564
- return self.quantizer.encode(encoded_features), feature_lengths
565
-
566
- def decode(self, indices, feature_lengths) -> torch.Tensor:
567
- mel_masks = sequence_mask(
568
- feature_lengths * self.downsample_factor,
569
- indices.shape[2] * self.downsample_factor,
570
- )
571
- mel_masks_float_conv = mel_masks[:, None, :].float()
572
- audio_lengths = (
573
- feature_lengths * self.downsample_factor * self.spec_transform.hop_length
574
- )
575
-
576
- audio_masks = sequence_mask(
577
- audio_lengths,
578
- indices.shape[2] * self.downsample_factor * self.spec_transform.hop_length,
579
- )
580
- audio_masks_float_conv = audio_masks[:, None, :].float()
581
-
582
- z = self.quantizer.decode(indices) * mel_masks_float_conv
583
- x = self.head(z) * audio_masks_float_conv
584
-
585
- return x, audio_lengths
586
-
587
- def remove_parametrizations(self):
588
- if hasattr(self.backbone, "remove_parametrizations"):
589
- self.backbone.remove_parametrizations()
590
-
591
- if hasattr(self.head, "remove_parametrizations"):
592
- self.head.remove_parametrizations()
593
-
594
- @property
595
- def device(self):
596
- return next(self.parameters()).device
 
1
+ import math
2
+ from functools import partial
3
+ from math import prod
4
+ from typing import Callable
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+ from torch import nn
9
+ from torch.nn.utils.parametrizations import weight_norm
10
+ from torch.nn.utils.parametrize import remove_parametrizations
11
+ from torch.utils.checkpoint import checkpoint
12
+
13
+
14
+ def sequence_mask(length, max_length=None):
15
+ if max_length is None:
16
+ max_length = length.max()
17
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
18
+ return x.unsqueeze(0) < length.unsqueeze(1)
19
+
20
+
21
+ def init_weights(m, mean=0.0, std=0.01):
22
+ classname = m.__class__.__name__
23
+ if classname.find("Conv1D") != -1:
24
+ m.weight.data.normal_(mean, std)
25
+
26
+
27
+ def get_padding(kernel_size, dilation=1):
28
+ return (kernel_size * dilation - dilation) // 2
29
+
30
+
31
+ def unpad1d(x: torch.Tensor, paddings: tuple[int, int]):
32
+ """Remove padding from x, handling properly zero padding. Only for 1d!"""
33
+ padding_left, padding_right = paddings
34
+ assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
35
+ assert (padding_left + padding_right) <= x.shape[-1]
36
+ end = x.shape[-1] - padding_right
37
+ return x[..., padding_left:end]
38
+
39
+
40
+ def get_extra_padding_for_conv1d(
41
+ x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
42
+ ) -> int:
43
+ """See `pad_for_conv1d`."""
44
+ length = x.shape[-1]
45
+ n_frames = (length - kernel_size + padding_total) / stride + 1
46
+ ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
47
+ return ideal_length - length
48
+
49
+
50
+ def pad1d(
51
+ x: torch.Tensor,
52
+ paddings: tuple[int, int],
53
+ mode: str = "zeros",
54
+ value: float = 0.0,
55
+ ):
56
+ """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
57
+ If this is the case, we insert extra 0 padding to the right
58
+ before the reflection happen.
59
+ """
60
+ length = x.shape[-1]
61
+ padding_left, padding_right = paddings
62
+ assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
63
+ if mode == "reflect":
64
+ max_pad = max(padding_left, padding_right)
65
+ extra_pad = 0
66
+ if length <= max_pad:
67
+ extra_pad = max_pad - length + 1
68
+ x = F.pad(x, (0, extra_pad))
69
+ padded = F.pad(x, paddings, mode, value)
70
+ end = padded.shape[-1] - extra_pad
71
+ return padded[..., :end]
72
+ else:
73
+ return F.pad(x, paddings, mode, value)
74
+
75
+
76
+ class FishConvNet(nn.Module):
77
+ def __init__(
78
+ self, in_channels, out_channels, kernel_size, dilation=1, stride=1, groups=1
79
+ ):
80
+ super(FishConvNet, self).__init__()
81
+ self.conv = nn.Conv1d(
82
+ in_channels,
83
+ out_channels,
84
+ kernel_size,
85
+ stride=stride,
86
+ dilation=dilation,
87
+ groups=groups,
88
+ )
89
+ self.stride = stride
90
+ self.kernel_size = (kernel_size - 1) * dilation + 1
91
+ self.dilation = dilation
92
+
93
+ def forward(self, x):
94
+ pad = self.kernel_size - self.stride
95
+ extra_padding = get_extra_padding_for_conv1d(
96
+ x, self.kernel_size, self.stride, pad
97
+ )
98
+ x = pad1d(x, (pad, extra_padding), mode="constant", value=0)
99
+ return self.conv(x).contiguous()
100
+
101
+ def weight_norm(self, name="weight", dim=0):
102
+ self.conv = weight_norm(self.conv, name=name, dim=dim)
103
+ return self
104
+
105
+ def remove_parametrizations(self, name="weight"):
106
+ self.conv = remove_parametrizations(self.conv, name)
107
+ return self
108
+
109
+
110
+ class FishTransConvNet(nn.Module):
111
+ def __init__(self, in_channels, out_channels, kernel_size, dilation=1, stride=1):
112
+ super(FishTransConvNet, self).__init__()
113
+ self.conv = nn.ConvTranspose1d(
114
+ in_channels, out_channels, kernel_size, stride=stride, dilation=dilation
115
+ )
116
+ self.stride = stride
117
+ self.kernel_size = kernel_size
118
+
119
+ def forward(self, x):
120
+ x = self.conv(x)
121
+ pad = self.kernel_size - self.stride
122
+ padding_right = math.ceil(pad)
123
+ padding_left = pad - padding_right
124
+ x = unpad1d(x, (padding_left, padding_right))
125
+ return x.contiguous()
126
+
127
+ def weight_norm(self, name="weight", dim=0):
128
+ self.conv = weight_norm(self.conv, name=name, dim=dim)
129
+ return self
130
+
131
+ def remove_parametrizations(self, name="weight"):
132
+ self.conv = remove_parametrizations(self.conv, name)
133
+ return self
134
+
135
+
136
+ class ResBlock1(torch.nn.Module):
137
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
138
+ super().__init__()
139
+
140
+ self.convs1 = nn.ModuleList(
141
+ [
142
+ FishConvNet(
143
+ channels, channels, kernel_size, stride=1, dilation=dilation[0]
144
+ ).weight_norm(),
145
+ FishConvNet(
146
+ channels, channels, kernel_size, stride=1, dilation=dilation[1]
147
+ ).weight_norm(),
148
+ FishConvNet(
149
+ channels, channels, kernel_size, stride=1, dilation=dilation[2]
150
+ ).weight_norm(),
151
+ ]
152
+ )
153
+ self.convs1.apply(init_weights)
154
+
155
+ self.convs2 = nn.ModuleList(
156
+ [
157
+ FishConvNet(
158
+ channels, channels, kernel_size, stride=1, dilation=dilation[0]
159
+ ).weight_norm(),
160
+ FishConvNet(
161
+ channels, channels, kernel_size, stride=1, dilation=dilation[1]
162
+ ).weight_norm(),
163
+ FishConvNet(
164
+ channels, channels, kernel_size, stride=1, dilation=dilation[2]
165
+ ).weight_norm(),
166
+ ]
167
+ )
168
+ self.convs2.apply(init_weights)
169
+
170
+ def forward(self, x):
171
+ for c1, c2 in zip(self.convs1, self.convs2):
172
+ xt = F.silu(x)
173
+ xt = c1(xt)
174
+ xt = F.silu(xt)
175
+ xt = c2(xt)
176
+ x = xt + x
177
+ return x
178
+
179
+ def remove_parametrizations(self):
180
+ for conv in self.convs1:
181
+ conv.remove_parametrizations()
182
+ for conv in self.convs2:
183
+ conv.remove_parametrizations()
184
+
185
+
186
+ class ParallelBlock(nn.Module):
187
+ def __init__(
188
+ self,
189
+ channels: int,
190
+ kernel_sizes: tuple[int] = (3, 7, 11),
191
+ dilation_sizes: tuple[tuple[int]] = ((1, 3, 5), (1, 3, 5), (1, 3, 5)),
192
+ ):
193
+ super().__init__()
194
+
195
+ assert len(kernel_sizes) == len(dilation_sizes)
196
+
197
+ self.blocks = nn.ModuleList()
198
+ for k, d in zip(kernel_sizes, dilation_sizes):
199
+ self.blocks.append(ResBlock1(channels, k, d))
200
+
201
+ def forward(self, x):
202
+ return torch.stack([block(x) for block in self.blocks], dim=0).mean(dim=0)
203
+
204
+ def remove_parametrizations(self):
205
+ for block in self.blocks:
206
+ block.remove_parametrizations()
207
+
208
+
209
+ class HiFiGANGenerator(nn.Module):
210
+ def __init__(
211
+ self,
212
+ *,
213
+ hop_length: int = 512,
214
+ upsample_rates: tuple[int] = (8, 8, 2, 2, 2),
215
+ upsample_kernel_sizes: tuple[int] = (16, 16, 8, 2, 2),
216
+ resblock_kernel_sizes: tuple[int] = (3, 7, 11),
217
+ resblock_dilation_sizes: tuple[tuple[int]] = ((1, 3, 5), (1, 3, 5), (1, 3, 5)),
218
+ num_mels: int = 128,
219
+ upsample_initial_channel: int = 512,
220
+ pre_conv_kernel_size: int = 7,
221
+ post_conv_kernel_size: int = 7,
222
+ post_activation: Callable = partial(nn.SiLU, inplace=True),
223
+ ):
224
+ super().__init__()
225
+
226
+ assert (
227
+ prod(upsample_rates) == hop_length
228
+ ), f"hop_length must be {prod(upsample_rates)}"
229
+
230
+ self.conv_pre = FishConvNet(
231
+ num_mels,
232
+ upsample_initial_channel,
233
+ pre_conv_kernel_size,
234
+ stride=1,
235
+ ).weight_norm()
236
+
237
+ self.num_upsamples = len(upsample_rates)
238
+ self.num_kernels = len(resblock_kernel_sizes)
239
+
240
+ self.noise_convs = nn.ModuleList()
241
+ self.ups = nn.ModuleList()
242
+
243
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
244
+ self.ups.append(
245
+ FishTransConvNet(
246
+ upsample_initial_channel // (2**i),
247
+ upsample_initial_channel // (2 ** (i + 1)),
248
+ k,
249
+ stride=u,
250
+ ).weight_norm()
251
+ )
252
+
253
+ self.resblocks = nn.ModuleList()
254
+ for i in range(len(self.ups)):
255
+ ch = upsample_initial_channel // (2 ** (i + 1))
256
+ self.resblocks.append(
257
+ ParallelBlock(ch, resblock_kernel_sizes, resblock_dilation_sizes)
258
+ )
259
+
260
+ self.activation_post = post_activation()
261
+ self.conv_post = FishConvNet(
262
+ ch, 1, post_conv_kernel_size, stride=1
263
+ ).weight_norm()
264
+ self.ups.apply(init_weights)
265
+ self.conv_post.apply(init_weights)
266
+
267
+ def forward(self, x):
268
+ x = self.conv_pre(x)
269
+
270
+ for i in range(self.num_upsamples):
271
+ x = F.silu(x, inplace=True)
272
+ x = self.ups[i](x)
273
+
274
+ if self.training and self.checkpointing:
275
+ x = checkpoint(
276
+ self.resblocks[i],
277
+ x,
278
+ use_reentrant=False,
279
+ )
280
+ else:
281
+ x = self.resblocks[i](x)
282
+
283
+ x = self.activation_post(x)
284
+ x = self.conv_post(x)
285
+ x = torch.tanh(x)
286
+
287
+ return x
288
+
289
+ def remove_parametrizations(self):
290
+ for up in self.ups:
291
+ up.remove_parametrizations()
292
+ for block in self.resblocks:
293
+ block.remove_parametrizations()
294
+ self.conv_pre.remove_parametrizations()
295
+ self.conv_post.remove_parametrizations()
296
+
297
+
298
+ # DropPath copied from timm library
299
+ def drop_path(
300
+ x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
301
+ ):
302
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
303
+
304
+ This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
305
+ the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
306
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
307
+ changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
308
+ 'survival rate' as the argument.
309
+
310
+ """ # noqa: E501
311
+
312
+ if drop_prob == 0.0 or not training:
313
+ return x
314
+ keep_prob = 1 - drop_prob
315
+ shape = (x.shape[0],) + (1,) * (
316
+ x.ndim - 1
317
+ ) # work with diff dim tensors, not just 2D ConvNets
318
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
319
+ if keep_prob > 0.0 and scale_by_keep:
320
+ random_tensor.div_(keep_prob)
321
+ return x * random_tensor
322
+
323
+
324
+ class DropPath(nn.Module):
325
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" # noqa: E501
326
+
327
+ def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
328
+ super(DropPath, self).__init__()
329
+ self.drop_prob = drop_prob
330
+ self.scale_by_keep = scale_by_keep
331
+
332
+ def forward(self, x):
333
+ return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
334
+
335
+ def extra_repr(self):
336
+ return f"drop_prob={round(self.drop_prob,3):0.3f}"
337
+
338
+
339
+ class LayerNorm(nn.Module):
340
+ r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
341
+ The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
342
+ shape (batch_size, height, width, channels) while channels_first corresponds to inputs
343
+ with shape (batch_size, channels, height, width).
344
+ """ # noqa: E501
345
+
346
+ def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
347
+ super().__init__()
348
+ self.weight = nn.Parameter(torch.ones(normalized_shape))
349
+ self.bias = nn.Parameter(torch.zeros(normalized_shape))
350
+ self.eps = eps
351
+ self.data_format = data_format
352
+ if self.data_format not in ["channels_last", "channels_first"]:
353
+ raise NotImplementedError
354
+ self.normalized_shape = (normalized_shape,)
355
+
356
+ def forward(self, x):
357
+ if self.data_format == "channels_last":
358
+ return F.layer_norm(
359
+ x, self.normalized_shape, self.weight, self.bias, self.eps
360
+ )
361
+ elif self.data_format == "channels_first":
362
+ u = x.mean(1, keepdim=True)
363
+ s = (x - u).pow(2).mean(1, keepdim=True)
364
+ x = (x - u) / torch.sqrt(s + self.eps)
365
+ x = self.weight[:, None] * x + self.bias[:, None]
366
+ return x
367
+
368
+
369
+ # ConvNeXt Block copied from https://github.com/fishaudio/fish-diffusion/blob/main/fish_diffusion/modules/convnext.py
370
+ class ConvNeXtBlock(nn.Module):
371
+ r"""ConvNeXt Block. There are two equivalent implementations:
372
+ (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
373
+ (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
374
+ We use (2) as we find it slightly faster in PyTorch
375
+
376
+ Args:
377
+ dim (int): Number of input channels.
378
+ drop_path (float): Stochastic depth rate. Default: 0.0
379
+ layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
380
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
381
+ kernel_size (int): Kernel size for depthwise conv. Default: 7.
382
+ dilation (int): Dilation for depthwise conv. Default: 1.
383
+ """ # noqa: E501
384
+
385
+ def __init__(
386
+ self,
387
+ dim: int,
388
+ drop_path: float = 0.0,
389
+ layer_scale_init_value: float = 1e-6,
390
+ mlp_ratio: float = 4.0,
391
+ kernel_size: int = 7,
392
+ dilation: int = 1,
393
+ ):
394
+ super().__init__()
395
+
396
+ self.dwconv = FishConvNet(
397
+ dim,
398
+ dim,
399
+ kernel_size=kernel_size,
400
+ # padding=int(dilation * (kernel_size - 1) / 2),
401
+ groups=dim,
402
+ ) # depthwise conv
403
+ self.norm = LayerNorm(dim, eps=1e-6)
404
+ self.pwconv1 = nn.Linear(
405
+ dim, int(mlp_ratio * dim)
406
+ ) # pointwise/1x1 convs, implemented with linear layers
407
+ self.act = nn.GELU()
408
+ self.pwconv2 = nn.Linear(int(mlp_ratio * dim), dim)
409
+ self.gamma = (
410
+ nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
411
+ if layer_scale_init_value > 0
412
+ else None
413
+ )
414
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
415
+
416
+ def forward(self, x, apply_residual: bool = True):
417
+ input = x
418
+
419
+ x = self.dwconv(x)
420
+ x = x.permute(0, 2, 1) # (N, C, L) -> (N, L, C)
421
+ x = self.norm(x)
422
+ x = self.pwconv1(x)
423
+ x = self.act(x)
424
+ x = self.pwconv2(x)
425
+
426
+ if self.gamma is not None:
427
+ x = self.gamma * x
428
+
429
+ x = x.permute(0, 2, 1) # (N, L, C) -> (N, C, L)
430
+ x = self.drop_path(x)
431
+
432
+ if apply_residual:
433
+ x = input + x
434
+
435
+ return x
436
+
437
+
438
+ class ConvNeXtEncoder(nn.Module):
439
+ def __init__(
440
+ self,
441
+ input_channels: int = 3,
442
+ depths: list[int] = [3, 3, 9, 3],
443
+ dims: list[int] = [96, 192, 384, 768],
444
+ drop_path_rate: float = 0.0,
445
+ layer_scale_init_value: float = 1e-6,
446
+ kernel_size: int = 7,
447
+ ):
448
+ super().__init__()
449
+ assert len(depths) == len(dims)
450
+
451
+ self.downsample_layers = nn.ModuleList()
452
+ stem = nn.Sequential(
453
+ FishConvNet(
454
+ input_channels,
455
+ dims[0],
456
+ kernel_size=7,
457
+ # padding=3,
458
+ # padding_mode="replicate",
459
+ # padding_mode="zeros",
460
+ ),
461
+ LayerNorm(dims[0], eps=1e-6, data_format="channels_first"),
462
+ )
463
+ self.downsample_layers.append(stem)
464
+
465
+ for i in range(len(depths) - 1):
466
+ mid_layer = nn.Sequential(
467
+ LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
468
+ nn.Conv1d(dims[i], dims[i + 1], kernel_size=1),
469
+ )
470
+ self.downsample_layers.append(mid_layer)
471
+
472
+ self.stages = nn.ModuleList()
473
+ dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
474
+
475
+ cur = 0
476
+ for i in range(len(depths)):
477
+ stage = nn.Sequential(
478
+ *[
479
+ ConvNeXtBlock(
480
+ dim=dims[i],
481
+ drop_path=dp_rates[cur + j],
482
+ layer_scale_init_value=layer_scale_init_value,
483
+ kernel_size=kernel_size,
484
+ )
485
+ for j in range(depths[i])
486
+ ]
487
+ )
488
+ self.stages.append(stage)
489
+ cur += depths[i]
490
+
491
+ self.norm = LayerNorm(dims[-1], eps=1e-6, data_format="channels_first")
492
+ self.apply(self._init_weights)
493
+
494
+ def _init_weights(self, m):
495
+ if isinstance(m, (nn.Conv1d, nn.Linear)):
496
+ nn.init.trunc_normal_(m.weight, std=0.02)
497
+ nn.init.constant_(m.bias, 0)
498
+
499
+ def forward(
500
+ self,
501
+ x: torch.Tensor,
502
+ ) -> torch.Tensor:
503
+ for i in range(len(self.downsample_layers)):
504
+ x = self.downsample_layers[i](x)
505
+ x = self.stages[i](x)
506
+
507
+ return self.norm(x)
508
+
509
+
510
+ class FireflyArchitecture(nn.Module):
511
+ def __init__(
512
+ self,
513
+ backbone: nn.Module,
514
+ head: nn.Module,
515
+ quantizer: nn.Module,
516
+ spec_transform: nn.Module,
517
+ ):
518
+ super().__init__()
519
+
520
+ self.backbone = backbone
521
+ self.head = head
522
+ self.quantizer = quantizer
523
+ self.spec_transform = spec_transform
524
+ self.downsample_factor = math.prod(self.quantizer.downsample_factor)
525
+
526
+ def forward(self, x: torch.Tensor, template=None, mask=None) -> torch.Tensor:
527
+ if self.spec_transform is not None:
528
+ x = self.spec_transform(x)
529
+
530
+ x = self.backbone(x)
531
+ if mask is not None:
532
+ x = x * mask
533
+
534
+ if self.quantizer is not None:
535
+ vq_result = self.quantizer(x)
536
+ x = vq_result.z
537
+
538
+ if mask is not None:
539
+ x = x * mask
540
+
541
+ x = self.head(x, template=template)
542
+
543
+ if x.ndim == 2:
544
+ x = x[:, None, :]
545
+
546
+ if self.vq is not None:
547
+ return x, vq_result
548
+
549
+ return x
550
+
551
+ def encode(self, audios, audio_lengths):
552
+ audios = audios.float()
553
+
554
+ mels = self.spec_transform(audios)
555
+ mel_lengths = audio_lengths // self.spec_transform.hop_length
556
+ mel_masks = sequence_mask(mel_lengths, mels.shape[2])
557
+ mel_masks_float_conv = mel_masks[:, None, :].float()
558
+ mels = mels * mel_masks_float_conv
559
+
560
+ # Encode
561
+ encoded_features = self.backbone(mels) * mel_masks_float_conv
562
+ feature_lengths = mel_lengths // self.downsample_factor
563
+
564
+ return self.quantizer.encode(encoded_features), feature_lengths
565
+
566
+ def decode(self, indices, feature_lengths) -> torch.Tensor:
567
+ mel_masks = sequence_mask(
568
+ feature_lengths * self.downsample_factor,
569
+ indices.shape[2] * self.downsample_factor,
570
+ )
571
+ mel_masks_float_conv = mel_masks[:, None, :].float()
572
+ audio_lengths = (
573
+ feature_lengths * self.downsample_factor * self.spec_transform.hop_length
574
+ )
575
+
576
+ audio_masks = sequence_mask(
577
+ audio_lengths,
578
+ indices.shape[2] * self.downsample_factor * self.spec_transform.hop_length,
579
+ )
580
+ audio_masks_float_conv = audio_masks[:, None, :].float()
581
+
582
+ z = self.quantizer.decode(indices) * mel_masks_float_conv
583
+ x = self.head(z) * audio_masks_float_conv
584
+
585
+ return x, audio_lengths
586
+
587
+ def remove_parametrizations(self):
588
+ if hasattr(self.backbone, "remove_parametrizations"):
589
+ self.backbone.remove_parametrizations()
590
+
591
+ if hasattr(self.head, "remove_parametrizations"):
592
+ self.head.remove_parametrizations()
593
+
594
+ @property
595
+ def device(self):
596
+ return next(self.parameters()).device
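The classes above are composed by FireflyArchitecture into the Firefly VQ-GAN. A minimal shape sketch of the encoder/generator pair, using only what is defined in this file; the widths, depths and batch size are illustrative placeholders, not the released checkpoint configuration:

import torch

encoder = ConvNeXtEncoder(input_channels=128, depths=[1, 1, 1, 1], dims=[64, 64, 96, 96])
head = HiFiGANGenerator(num_mels=96, upsample_initial_channel=256)

mel = torch.randn(1, 128, 64)      # (batch, input_channels, frames)
with torch.no_grad():
    z = encoder.eval()(mel)        # (1, 96, 64) -- the frame count is preserved
    # eval() also matters for the head: its forward only consults self.checkpointing
    # when self.training is True.
    audio = head.eval()(z)         # (1, 1, 64 * 512) -- hop_length samples per frame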
fish_speech/models/vqgan/modules/fsq.py CHANGED
@@ -1,116 +1,116 @@
1
- from dataclasses import dataclass
2
-
3
- import torch
4
- import torch.nn as nn
5
- import torch.nn.functional as F
6
- from einops import rearrange
7
- from vector_quantize_pytorch import GroupedResidualFSQ
8
-
9
- from .firefly import ConvNeXtBlock, FishConvNet, FishTransConvNet
10
-
11
-
12
- @dataclass
13
- class FSQResult:
14
- z: torch.Tensor
15
- codes: torch.Tensor
16
- latents: torch.Tensor
17
-
18
-
19
- class DownsampleFiniteScalarQuantize(nn.Module):
20
- def __init__(
21
- self,
22
- input_dim: int = 512,
23
- n_codebooks: int = 9,
24
- n_groups: int = 1,
25
- levels: tuple[int] = (8, 5, 5, 5), # Approximate 2**10
26
- downsample_factor: tuple[int] = (2, 2),
27
- downsample_dims: tuple[int] | None = None,
28
- ):
29
- super().__init__()
30
-
31
- if downsample_dims is None:
32
- downsample_dims = [input_dim for _ in range(len(downsample_factor))]
33
-
34
- all_dims = (input_dim,) + tuple(downsample_dims)
35
-
36
- self.residual_fsq = GroupedResidualFSQ(
37
- dim=all_dims[-1],
38
- levels=levels,
39
- num_quantizers=n_codebooks,
40
- groups=n_groups,
41
- )
42
-
43
- self.downsample_factor = downsample_factor
44
- self.downsample_dims = downsample_dims
45
-
46
- self.downsample = nn.Sequential(
47
- *[
48
- nn.Sequential(
49
- FishConvNet(
50
- all_dims[idx],
51
- all_dims[idx + 1],
52
- kernel_size=factor,
53
- stride=factor,
54
- ),
55
- ConvNeXtBlock(dim=all_dims[idx + 1]),
56
- )
57
- for idx, factor in enumerate(downsample_factor)
58
- ]
59
- )
60
-
61
- self.upsample = nn.Sequential(
62
- *[
63
- nn.Sequential(
64
- FishTransConvNet(
65
- all_dims[idx + 1],
66
- all_dims[idx],
67
- kernel_size=factor,
68
- stride=factor,
69
- ),
70
- ConvNeXtBlock(dim=all_dims[idx]),
71
- )
72
- for idx, factor in reversed(list(enumerate(downsample_factor)))
73
- ]
74
- )
75
-
76
- self.apply(self._init_weights)
77
-
78
- def _init_weights(self, m):
79
- if isinstance(m, (nn.Conv1d, nn.Linear)):
80
- nn.init.trunc_normal_(m.weight, std=0.02)
81
- nn.init.constant_(m.bias, 0)
82
-
83
- def forward(self, z) -> FSQResult:
84
- original_shape = z.shape
85
- z = self.downsample(z)
86
- quantized, indices = self.residual_fsq(z.mT)
87
- result = FSQResult(
88
- z=quantized.mT,
89
- codes=indices.mT,
90
- latents=z,
91
- )
92
- result.z = self.upsample(result.z)
93
-
94
- # Pad or crop z to match original shape
95
- diff = original_shape[-1] - result.z.shape[-1]
96
- left = diff // 2
97
- right = diff - left
98
-
99
- if diff > 0:
100
- result.z = F.pad(result.z, (left, right))
101
- elif diff < 0:
102
- result.z = result.z[..., left:-right]
103
-
104
- return result
105
-
106
- def encode(self, z):
107
- z = self.downsample(z)
108
- _, indices = self.residual_fsq(z.mT)
109
- indices = rearrange(indices, "g b l r -> b (g r) l")
110
- return indices
111
-
112
- def decode(self, indices: torch.Tensor):
113
- indices = rearrange(indices, "b (g r) l -> g b l r", g=self.residual_fsq.groups)
114
- z_q = self.residual_fsq.get_output_from_indices(indices)
115
- z_q = self.upsample(z_q.mT)
116
- return z_q
 
1
+ from dataclasses import dataclass
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from einops import rearrange
7
+ from vector_quantize_pytorch import GroupedResidualFSQ
8
+
9
+ from .firefly import ConvNeXtBlock, FishConvNet, FishTransConvNet
10
+
11
+
12
+ @dataclass
13
+ class FSQResult:
14
+ z: torch.Tensor
15
+ codes: torch.Tensor
16
+ latents: torch.Tensor
17
+
18
+
19
+ class DownsampleFiniteScalarQuantize(nn.Module):
20
+ def __init__(
21
+ self,
22
+ input_dim: int = 512,
23
+ n_codebooks: int = 9,
24
+ n_groups: int = 1,
25
+ levels: tuple[int] = (8, 5, 5, 5), # Approximate 2**10
26
+ downsample_factor: tuple[int] = (2, 2),
27
+ downsample_dims: tuple[int] | None = None,
28
+ ):
29
+ super().__init__()
30
+
31
+ if downsample_dims is None:
32
+ downsample_dims = [input_dim for _ in range(len(downsample_factor))]
33
+
34
+ all_dims = (input_dim,) + tuple(downsample_dims)
35
+
36
+ self.residual_fsq = GroupedResidualFSQ(
37
+ dim=all_dims[-1],
38
+ levels=levels,
39
+ num_quantizers=n_codebooks,
40
+ groups=n_groups,
41
+ )
42
+
43
+ self.downsample_factor = downsample_factor
44
+ self.downsample_dims = downsample_dims
45
+
46
+ self.downsample = nn.Sequential(
47
+ *[
48
+ nn.Sequential(
49
+ FishConvNet(
50
+ all_dims[idx],
51
+ all_dims[idx + 1],
52
+ kernel_size=factor,
53
+ stride=factor,
54
+ ),
55
+ ConvNeXtBlock(dim=all_dims[idx + 1]),
56
+ )
57
+ for idx, factor in enumerate(downsample_factor)
58
+ ]
59
+ )
60
+
61
+ self.upsample = nn.Sequential(
62
+ *[
63
+ nn.Sequential(
64
+ FishTransConvNet(
65
+ all_dims[idx + 1],
66
+ all_dims[idx],
67
+ kernel_size=factor,
68
+ stride=factor,
69
+ ),
70
+ ConvNeXtBlock(dim=all_dims[idx]),
71
+ )
72
+ for idx, factor in reversed(list(enumerate(downsample_factor)))
73
+ ]
74
+ )
75
+
76
+ self.apply(self._init_weights)
77
+
78
+ def _init_weights(self, m):
79
+ if isinstance(m, (nn.Conv1d, nn.Linear)):
80
+ nn.init.trunc_normal_(m.weight, std=0.02)
81
+ nn.init.constant_(m.bias, 0)
82
+
83
+ def forward(self, z) -> FSQResult:
84
+ original_shape = z.shape
85
+ z = self.downsample(z)
86
+ quantized, indices = self.residual_fsq(z.mT)
87
+ result = FSQResult(
88
+ z=quantized.mT,
89
+ codes=indices.mT,
90
+ latents=z,
91
+ )
92
+ result.z = self.upsample(result.z)
93
+
94
+ # Pad or crop z to match original shape
95
+ diff = original_shape[-1] - result.z.shape[-1]
96
+ left = diff // 2
97
+ right = diff - left
98
+
99
+ if diff > 0:
100
+ result.z = F.pad(result.z, (left, right))
101
+ elif diff < 0:
102
+ result.z = result.z[..., -left:right]
103
+
104
+ return result
105
+
106
+ def encode(self, z):
107
+ z = self.downsample(z)
108
+ _, indices = self.residual_fsq(z.mT)
109
+ indices = rearrange(indices, "g b l r -> b (g r) l")
110
+ return indices
111
+
112
+ def decode(self, indices: torch.Tensor):
113
+ indices = rearrange(indices, "b (g r) l -> g b l r", g=self.residual_fsq.groups)
114
+ z_q = self.residual_fsq.get_output_from_indices(indices)
115
+ z_q = self.upsample(z_q.mT)
116
+ return z_q
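For orientation, DownsampleFiniteScalarQuantize reduces the frame rate by prod(downsample_factor) (4x with the defaults above), quantizes with a grouped residual FSQ whose per-codebook index range is prod(levels) (1000 for (8, 5, 5, 5)), and mirrors the process on the way back up. A small round-trip sketch with illustrative sizes, assuming only this module and its imports:

import torch

quantizer = DownsampleFiniteScalarQuantize(
    input_dim=32, n_codebooks=2, n_groups=1, downsample_factor=(2, 2)
)

features = torch.randn(1, 32, 64)   # (batch, input_dim, frames); frames divisible by 4 here
codes = quantizer.encode(features)  # (1, n_groups * n_codebooks, 64 // 4), integer codes in [0, 1000)
recon = quantizer.decode(codes)     # (1, 32, 64) continuous features again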
fish_speech/models/vqgan/modules/reference.py DELETED
@@ -1,113 +0,0 @@
1
- from typing import Optional
2
-
3
- import torch
4
- import torch.nn.functional as F
5
- from torch import nn
6
-
7
- from .wavenet import WaveNet
8
-
9
-
10
- class ReferenceEncoder(WaveNet):
11
- def __init__(
12
- self,
13
- input_channels: Optional[int] = None,
14
- output_channels: Optional[int] = None,
15
- residual_channels: int = 512,
16
- residual_layers: int = 20,
17
- dilation_cycle: Optional[int] = 4,
18
- num_heads: int = 8,
19
- latent_len: int = 4,
20
- ):
21
- super().__init__(
22
- input_channels=input_channels,
23
- residual_channels=residual_channels,
24
- residual_layers=residual_layers,
25
- dilation_cycle=dilation_cycle,
26
- )
27
-
28
- self.head_dim = residual_channels // num_heads
29
- self.num_heads = num_heads
30
-
31
- self.latent_len = latent_len
32
- self.latent = nn.Parameter(torch.zeros(1, self.latent_len, residual_channels))
33
-
34
- self.q = nn.Linear(residual_channels, residual_channels, bias=True)
35
- self.kv = nn.Linear(residual_channels, residual_channels * 2, bias=True)
36
- self.q_norm = nn.LayerNorm(self.head_dim)
37
- self.k_norm = nn.LayerNorm(self.head_dim)
38
- self.proj = nn.Linear(residual_channels, residual_channels)
39
- self.proj_drop = nn.Dropout(0.1)
40
-
41
- self.norm = nn.LayerNorm(residual_channels)
42
- self.mlp = nn.Sequential(
43
- nn.Linear(residual_channels, residual_channels * 4),
44
- nn.SiLU(),
45
- nn.Linear(residual_channels * 4, residual_channels),
46
- )
47
- self.output_projection_attn = nn.Linear(residual_channels, output_channels)
48
-
49
- torch.nn.init.trunc_normal_(self.latent, std=0.02)
50
- self.apply(self.init_weights)
51
-
52
- def init_weights(self, m):
53
- if isinstance(m, nn.Linear):
54
- torch.nn.init.trunc_normal_(m.weight, std=0.02)
55
- if m.bias is not None:
56
- torch.nn.init.constant_(m.bias, 0)
57
-
58
- def forward(self, x, attn_mask=None):
59
- x = super().forward(x).mT
60
- B, N, C = x.shape
61
-
62
- # Calculate mask
63
- if attn_mask is not None:
64
- assert attn_mask.shape == (B, N) and attn_mask.dtype == torch.bool
65
-
66
- attn_mask = attn_mask[:, None, None, :].expand(
67
- B, self.num_heads, self.latent_len, N
68
- )
69
-
70
- q_latent = self.latent.expand(B, -1, -1)
71
- q = (
72
- self.q(q_latent)
73
- .reshape(B, self.latent_len, self.num_heads, self.head_dim)
74
- .transpose(1, 2)
75
- )
76
-
77
- kv = (
78
- self.kv(x)
79
- .reshape(B, N, 2, self.num_heads, self.head_dim)
80
- .permute(2, 0, 3, 1, 4)
81
- )
82
- k, v = kv.unbind(0)
83
-
84
- q, k = self.q_norm(q), self.k_norm(k)
85
- x = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
86
-
87
- x = x.transpose(1, 2).reshape(B, self.latent_len, C)
88
- x = self.proj(x)
89
- x = self.proj_drop(x)
90
-
91
- x = x + self.mlp(self.norm(x))
92
- x = self.output_projection_attn(x)
93
- x = x.mean(1)
94
-
95
- return x
96
-
97
-
98
- if __name__ == "__main__":
99
- with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
100
- model = ReferenceEncoder(
101
- input_channels=128,
102
- output_channels=64,
103
- residual_channels=384,
104
- residual_layers=20,
105
- dilation_cycle=4,
106
- num_heads=8,
107
- )
108
- x = torch.randn(4, 128, 64)
109
- mask = torch.ones(4, 64, dtype=torch.bool)
110
- y = model(x, mask)
111
- print(y.shape)
112
- loss = F.mse_loss(y, torch.randn(4, 64))
113
- loss.backward()
fish_speech/models/vqgan/modules/wavenet.py DELETED
@@ -1,225 +0,0 @@
1
- import math
2
- from typing import Optional
3
-
4
- import torch
5
- import torch.nn.functional as F
6
- from torch import nn
7
-
8
-
9
- class Mish(nn.Module):
10
- def forward(self, x):
11
- return x * torch.tanh(F.softplus(x))
12
-
13
-
14
- class DiffusionEmbedding(nn.Module):
15
- """Diffusion Step Embedding"""
16
-
17
- def __init__(self, d_denoiser):
18
- super(DiffusionEmbedding, self).__init__()
19
- self.dim = d_denoiser
20
-
21
- def forward(self, x):
22
- device = x.device
23
- half_dim = self.dim // 2
24
- emb = math.log(10000) / (half_dim - 1)
25
- emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
26
- emb = x[:, None] * emb[None, :]
27
- emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
28
- return emb
29
-
30
-
31
- class LinearNorm(nn.Module):
32
- """LinearNorm Projection"""
33
-
34
- def __init__(self, in_features, out_features, bias=False):
35
- super(LinearNorm, self).__init__()
36
- self.linear = nn.Linear(in_features, out_features, bias)
37
-
38
- nn.init.xavier_uniform_(self.linear.weight)
39
- if bias:
40
- nn.init.constant_(self.linear.bias, 0.0)
41
-
42
- def forward(self, x):
43
- x = self.linear(x)
44
- return x
45
-
46
-
47
- class ConvNorm(nn.Module):
48
- """1D Convolution"""
49
-
50
- def __init__(
51
- self,
52
- in_channels,
53
- out_channels,
54
- kernel_size=1,
55
- stride=1,
56
- padding=None,
57
- dilation=1,
58
- bias=True,
59
- w_init_gain="linear",
60
- ):
61
- super(ConvNorm, self).__init__()
62
-
63
- if padding is None:
64
- assert kernel_size % 2 == 1
65
- padding = int(dilation * (kernel_size - 1) / 2)
66
-
67
- self.conv = nn.Conv1d(
68
- in_channels,
69
- out_channels,
70
- kernel_size=kernel_size,
71
- stride=stride,
72
- padding=padding,
73
- dilation=dilation,
74
- bias=bias,
75
- )
76
- nn.init.kaiming_normal_(self.conv.weight)
77
-
78
- def forward(self, signal):
79
- conv_signal = self.conv(signal)
80
-
81
- return conv_signal
82
-
83
-
84
- class ResidualBlock(nn.Module):
85
- """Residual Block"""
86
-
87
- def __init__(
88
- self,
89
- residual_channels,
90
- use_linear_bias=False,
91
- dilation=1,
92
- condition_channels=None,
93
- ):
94
- super(ResidualBlock, self).__init__()
95
- self.conv_layer = ConvNorm(
96
- residual_channels,
97
- 2 * residual_channels,
98
- kernel_size=3,
99
- stride=1,
100
- padding=dilation,
101
- dilation=dilation,
102
- )
103
-
104
- if condition_channels is not None:
105
- self.diffusion_projection = LinearNorm(
106
- residual_channels, residual_channels, use_linear_bias
107
- )
108
- self.condition_projection = ConvNorm(
109
- condition_channels, 2 * residual_channels, kernel_size=1
110
- )
111
-
112
- self.output_projection = ConvNorm(
113
- residual_channels, 2 * residual_channels, kernel_size=1
114
- )
115
-
116
- def forward(self, x, condition=None, diffusion_step=None):
117
- y = x
118
-
119
- if diffusion_step is not None:
120
- diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
121
- y = y + diffusion_step
122
-
123
- y = self.conv_layer(y)
124
-
125
- if condition is not None:
126
- condition = self.condition_projection(condition)
127
- y = y + condition
128
-
129
- gate, filter = torch.chunk(y, 2, dim=1)
130
- y = torch.sigmoid(gate) * torch.tanh(filter)
131
-
132
- y = self.output_projection(y)
133
- residual, skip = torch.chunk(y, 2, dim=1)
134
-
135
- return (x + residual) / math.sqrt(2.0), skip
136
-
137
-
138
- class WaveNet(nn.Module):
139
- def __init__(
140
- self,
141
- input_channels: Optional[int] = None,
142
- output_channels: Optional[int] = None,
143
- residual_channels: int = 512,
144
- residual_layers: int = 20,
145
- dilation_cycle: Optional[int] = 4,
146
- is_diffusion: bool = False,
147
- condition_channels: Optional[int] = None,
148
- ):
149
- super().__init__()
150
-
151
- # Input projection
152
- self.input_projection = None
153
- if input_channels is not None and input_channels != residual_channels:
154
- self.input_projection = ConvNorm(
155
- input_channels, residual_channels, kernel_size=1
156
- )
157
-
158
- if input_channels is None:
159
- input_channels = residual_channels
160
-
161
- self.input_channels = input_channels
162
-
163
- # Residual layers
164
- self.residual_layers = nn.ModuleList(
165
- [
166
- ResidualBlock(
167
- residual_channels=residual_channels,
168
- use_linear_bias=False,
169
- dilation=2 ** (i % dilation_cycle) if dilation_cycle else 1,
170
- condition_channels=condition_channels,
171
- )
172
- for i in range(residual_layers)
173
- ]
174
- )
175
-
176
- # Skip projection
177
- self.skip_projection = ConvNorm(
178
- residual_channels, residual_channels, kernel_size=1
179
- )
180
-
181
- # Output projection
182
- self.output_projection = None
183
- if output_channels is not None and output_channels != residual_channels:
184
- self.output_projection = ConvNorm(
185
- residual_channels, output_channels, kernel_size=1
186
- )
187
-
188
- if is_diffusion:
189
- self.diffusion_embedding = DiffusionEmbedding(residual_channels)
190
- self.mlp = nn.Sequential(
191
- LinearNorm(residual_channels, residual_channels * 4, False),
192
- Mish(),
193
- LinearNorm(residual_channels * 4, residual_channels, False),
194
- )
195
-
196
- self.apply(self._init_weights)
197
-
198
- def _init_weights(self, m):
199
- if isinstance(m, (nn.Conv1d, nn.Linear)):
200
- nn.init.trunc_normal_(m.weight, std=0.02)
201
- if getattr(m, "bias", None) is not None:
202
- nn.init.constant_(m.bias, 0)
203
-
204
- def forward(self, x, t=None, condition=None):
205
- if self.input_projection is not None:
206
- x = self.input_projection(x)
207
- x = F.silu(x)
208
-
209
- if t is not None:
210
- t = self.diffusion_embedding(t)
211
- t = self.mlp(t)
212
-
213
- skip = []
214
- for layer in self.residual_layers:
215
- x, skip_connection = layer(x, condition, t)
216
- skip.append(skip_connection)
217
-
218
- x = torch.sum(torch.stack(skip), dim=0) / math.sqrt(len(self.residual_layers))
219
- x = self.skip_projection(x)
220
-
221
- if self.output_projection is not None:
222
- x = F.silu(x)
223
- x = self.output_projection(x)
224
-
225
- return x
fish_speech/models/vqgan/spectrogram.py DELETED
@@ -1,122 +0,0 @@
1
- import torch
2
- import torchaudio.functional as F
3
- from torch import Tensor, nn
4
- from torchaudio.transforms import MelScale
5
-
6
-
7
- class LinearSpectrogram(nn.Module):
8
- def __init__(
9
- self,
10
- n_fft=2048,
11
- win_length=2048,
12
- hop_length=512,
13
- center=False,
14
- mode="pow2_sqrt",
15
- ):
16
- super().__init__()
17
-
18
- self.n_fft = n_fft
19
- self.win_length = win_length
20
- self.hop_length = hop_length
21
- self.center = center
22
- self.mode = mode
23
-
24
- self.register_buffer("window", torch.hann_window(win_length), persistent=False)
25
-
26
- def forward(self, y: Tensor) -> Tensor:
27
- if y.ndim == 3:
28
- y = y.squeeze(1)
29
-
30
- y = torch.nn.functional.pad(
31
- y.unsqueeze(1),
32
- (
33
- (self.win_length - self.hop_length) // 2,
34
- (self.win_length - self.hop_length + 1) // 2,
35
- ),
36
- mode="reflect",
37
- ).squeeze(1)
38
-
39
- spec = torch.stft(
40
- y,
41
- self.n_fft,
42
- hop_length=self.hop_length,
43
- win_length=self.win_length,
44
- window=self.window,
45
- center=self.center,
46
- pad_mode="reflect",
47
- normalized=False,
48
- onesided=True,
49
- return_complex=True,
50
- )
51
-
52
- spec = torch.view_as_real(spec)
53
-
54
- if self.mode == "pow2_sqrt":
55
- spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
56
-
57
- return spec
58
-
59
-
60
- class LogMelSpectrogram(nn.Module):
61
- def __init__(
62
- self,
63
- sample_rate=44100,
64
- n_fft=2048,
65
- win_length=2048,
66
- hop_length=512,
67
- n_mels=128,
68
- center=False,
69
- f_min=0.0,
70
- f_max=None,
71
- ):
72
- super().__init__()
73
-
74
- self.sample_rate = sample_rate
75
- self.n_fft = n_fft
76
- self.win_length = win_length
77
- self.hop_length = hop_length
78
- self.center = center
79
- self.n_mels = n_mels
80
- self.f_min = f_min
81
- self.f_max = f_max or float(sample_rate // 2)
82
-
83
- self.spectrogram = LinearSpectrogram(n_fft, win_length, hop_length, center)
84
-
85
- fb = F.melscale_fbanks(
86
- n_freqs=self.n_fft // 2 + 1,
87
- f_min=self.f_min,
88
- f_max=self.f_max,
89
- n_mels=self.n_mels,
90
- sample_rate=self.sample_rate,
91
- norm="slaney",
92
- mel_scale="slaney",
93
- )
94
- self.register_buffer(
95
- "fb",
96
- fb,
97
- persistent=False,
98
- )
99
-
100
- def compress(self, x: Tensor) -> Tensor:
101
- return torch.log(torch.clamp(x, min=1e-5))
102
-
103
- def decompress(self, x: Tensor) -> Tensor:
104
- return torch.exp(x)
105
-
106
- def apply_mel_scale(self, x: Tensor) -> Tensor:
107
- return torch.matmul(x.transpose(-1, -2), self.fb).transpose(-1, -2)
108
-
109
- def forward(
110
- self, x: Tensor, return_linear: bool = False, sample_rate: int = None
111
- ) -> Tensor:
112
- if sample_rate is not None and sample_rate != self.sample_rate:
113
- x = F.resample(x, orig_freq=sample_rate, new_freq=self.sample_rate)
114
-
115
- linear = self.spectrogram(x)
116
- x = self.apply_mel_scale(linear)
117
- x = self.compress(x)
118
-
119
- if return_linear:
120
- return x, self.compress(linear)
121
-
122
- return x
fish_speech/models/vqgan/utils.py CHANGED
@@ -1,94 +1,94 @@
1
- import matplotlib
2
- import torch
3
- from matplotlib import pyplot as plt
4
-
5
- matplotlib.use("Agg")
6
-
7
-
8
- def convert_pad_shape(pad_shape):
9
- l = pad_shape[::-1]
10
- pad_shape = [item for sublist in l for item in sublist]
11
- return pad_shape
12
-
13
-
14
- def sequence_mask(length, max_length=None):
15
- if max_length is None:
16
- max_length = length.max()
17
- x = torch.arange(max_length, dtype=length.dtype, device=length.device)
18
- return x.unsqueeze(0) < length.unsqueeze(1)
19
-
20
-
21
- def init_weights(m, mean=0.0, std=0.01):
22
- classname = m.__class__.__name__
23
- if classname.find("Conv") != -1:
24
- m.weight.data.normal_(mean, std)
25
-
26
-
27
- def get_padding(kernel_size, dilation=1):
28
- return int((kernel_size * dilation - dilation) / 2)
29
-
30
-
31
- def plot_mel(data, titles=None):
32
- fig, axes = plt.subplots(len(data), 1, squeeze=False)
33
-
34
- if titles is None:
35
- titles = [None for i in range(len(data))]
36
-
37
- plt.tight_layout()
38
-
39
- for i in range(len(data)):
40
- mel = data[i]
41
-
42
- if isinstance(mel, torch.Tensor):
43
- mel = mel.float().detach().cpu().numpy()
44
-
45
- axes[i][0].imshow(mel, origin="lower")
46
- axes[i][0].set_aspect(2.5, adjustable="box")
47
- axes[i][0].set_ylim(0, mel.shape[0])
48
- axes[i][0].set_title(titles[i], fontsize="medium")
49
- axes[i][0].tick_params(labelsize="x-small", left=False, labelleft=False)
50
- axes[i][0].set_anchor("W")
51
-
52
- return fig
53
-
54
-
55
- def slice_segments(x, ids_str, segment_size=4):
56
- ret = torch.zeros_like(x[:, :, :segment_size])
57
- for i in range(x.size(0)):
58
- idx_str = ids_str[i]
59
- idx_end = idx_str + segment_size
60
- ret[i] = x[i, :, idx_str:idx_end]
61
-
62
- return ret
63
-
64
-
65
- def rand_slice_segments(x, x_lengths=None, segment_size=4):
66
- b, d, t = x.size()
67
- if x_lengths is None:
68
- x_lengths = t
69
- ids_str_max = torch.clamp(x_lengths - segment_size + 1, min=0)
70
- ids_str = (torch.rand([b], device=x.device) * ids_str_max).to(dtype=torch.long)
71
- ret = slice_segments(x, ids_str, segment_size)
72
- return ret, ids_str
73
-
74
-
75
- @torch.jit.script
76
- def fused_add_tanh_sigmoid_multiply(in_act, n_channels):
77
- n_channels_int = n_channels[0]
78
- t_act = torch.tanh(in_act[:, :n_channels_int, :])
79
- s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
80
- acts = t_act * s_act
81
-
82
- return acts
83
-
84
-
85
- def avg_with_mask(x, mask):
86
- assert mask.dtype == torch.float, "Mask should be float"
87
-
88
- if mask.ndim == 2:
89
- mask = mask.unsqueeze(1)
90
-
91
- if mask.shape[1] == 1:
92
- mask = mask.expand_as(x)
93
-
94
- return (x * mask).sum() / mask.sum()
 
1
+ import matplotlib
2
+ import torch
3
+ from matplotlib import pyplot as plt
4
+
5
+ matplotlib.use("Agg")
6
+
7
+
8
+ def convert_pad_shape(pad_shape):
9
+ l = pad_shape[::-1]
10
+ pad_shape = [item for sublist in l for item in sublist]
11
+ return pad_shape
12
+
13
+
14
+ def sequence_mask(length, max_length=None):
15
+ if max_length is None:
16
+ max_length = length.max()
17
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
18
+ return x.unsqueeze(0) < length.unsqueeze(1)
19
+
20
+
21
+ def init_weights(m, mean=0.0, std=0.01):
22
+ classname = m.__class__.__name__
23
+ if classname.find("Conv") != -1:
24
+ m.weight.data.normal_(mean, std)
25
+
26
+
27
+ def get_padding(kernel_size, dilation=1):
28
+ return int((kernel_size * dilation - dilation) / 2)
29
+
30
+
31
+ def plot_mel(data, titles=None):
32
+ fig, axes = plt.subplots(len(data), 1, squeeze=False)
33
+
34
+ if titles is None:
35
+ titles = [None for i in range(len(data))]
36
+
37
+ plt.tight_layout()
38
+
39
+ for i in range(len(data)):
40
+ mel = data[i]
41
+
42
+ if isinstance(mel, torch.Tensor):
43
+ mel = mel.float().detach().cpu().numpy()
44
+
45
+ axes[i][0].imshow(mel, origin="lower")
46
+ axes[i][0].set_aspect(2.5, adjustable="box")
47
+ axes[i][0].set_ylim(0, mel.shape[0])
48
+ axes[i][0].set_title(titles[i], fontsize="medium")
49
+ axes[i][0].tick_params(labelsize="x-small", left=False, labelleft=False)
50
+ axes[i][0].set_anchor("W")
51
+
52
+ return fig
53
+
54
+
55
+ def slice_segments(x, ids_str, segment_size=4):
56
+ ret = torch.zeros_like(x[:, :, :segment_size])
57
+ for i in range(x.size(0)):
58
+ idx_str = ids_str[i]
59
+ idx_end = idx_str + segment_size
60
+ ret[i] = x[i, :, idx_str:idx_end]
61
+
62
+ return ret
63
+
64
+
65
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
66
+ b, d, t = x.size()
67
+ if x_lengths is None:
68
+ x_lengths = t
69
+ ids_str_max = torch.clamp(x_lengths - segment_size + 1, min=0)
70
+ ids_str = (torch.rand([b], device=x.device) * ids_str_max).to(dtype=torch.long)
71
+ ret = slice_segments(x, ids_str, segment_size)
72
+ return ret, ids_str
73
+
74
+
75
+ @torch.jit.script
76
+ def fused_add_tanh_sigmoid_multiply(in_act, n_channels):
77
+ n_channels_int = n_channels[0]
78
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
79
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
80
+ acts = t_act * s_act
81
+
82
+ return acts
83
+
84
+
85
+ def avg_with_mask(x, mask):
86
+ assert mask.dtype == torch.float, "Mask should be float"
87
+
88
+ if mask.ndim == 2:
89
+ mask = mask.unsqueeze(1)
90
+
91
+ if mask.shape[1] == 1:
92
+ mask = mask.expand_as(x)
93
+
94
+ return (x * mask).sum() / mask.sum()
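The two helpers here that revolve around masking are sequence_mask and avg_with_mask. A quick illustration with made-up lengths:

import torch

lengths = torch.tensor([3, 5])
mask = sequence_mask(lengths, max_length=6)   # (2, 6) bool: True inside each sequence
# tensor([[ True,  True,  True, False, False, False],
#         [ True,  True,  True,  True,  True, False]])

x = torch.randn(2, 4, 6)                      # (batch, channels, time)
mean = avg_with_mask(x, mask.float())         # scalar mean over the valid positions only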
fish_speech/scheduler.py CHANGED
@@ -1,40 +1,40 @@
1
- import math
2
-
3
-
4
- def get_cosine_schedule_with_warmup_lr_lambda(
5
- current_step: int,
6
- *,
7
- num_warmup_steps: int | float,
8
- num_training_steps: int,
9
- num_cycles: float = 0.5,
10
- final_lr_ratio: float = 0.0,
11
- ):
12
- if 0 < num_warmup_steps < 1: # float mode
13
- num_warmup_steps = int(num_warmup_steps * num_training_steps)
14
-
15
- if current_step < num_warmup_steps:
16
- return float(current_step) / float(max(1, num_warmup_steps))
17
-
18
- progress = float(current_step - num_warmup_steps) / float(
19
- max(1, num_training_steps - num_warmup_steps)
20
- )
21
-
22
- return max(
23
- final_lr_ratio,
24
- 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)),
25
- )
26
-
27
-
28
- def get_constant_schedule_with_warmup_lr_lambda(
29
- current_step: int,
30
- *,
31
- num_warmup_steps: int | float,
32
- num_training_steps: int | None = None,
33
- ):
34
- if 0 < num_warmup_steps < 1: # float mode
35
- num_warmup_steps = int(num_warmup_steps * num_training_steps)
36
-
37
- if current_step < num_warmup_steps:
38
- return float(current_step) / float(max(1, num_warmup_steps))
39
-
40
- return 1.0
 
1
+ import math
2
+
3
+
4
+ def get_cosine_schedule_with_warmup_lr_lambda(
5
+ current_step: int,
6
+ *,
7
+ num_warmup_steps: int | float,
8
+ num_training_steps: int,
9
+ num_cycles: float = 0.5,
10
+ final_lr_ratio: float = 0.0,
11
+ ):
12
+ if 0 < num_warmup_steps < 1: # float mode
13
+ num_warmup_steps = int(num_warmup_steps * num_training_steps)
14
+
15
+ if current_step < num_warmup_steps:
16
+ return float(current_step) / float(max(1, num_warmup_steps))
17
+
18
+ progress = float(current_step - num_warmup_steps) / float(
19
+ max(1, num_training_steps - num_warmup_steps)
20
+ )
21
+
22
+ return max(
23
+ final_lr_ratio,
24
+ 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)),
25
+ )
26
+
27
+
28
+ def get_constant_schedule_with_warmup_lr_lambda(
29
+ current_step: int,
30
+ *,
31
+ num_warmup_steps: int | float,
32
+ num_training_steps: int | None = None,
33
+ ):
34
+ if 0 < num_warmup_steps < 1: # float mode
35
+ num_warmup_steps = int(num_warmup_steps * num_training_steps)
36
+
37
+ if current_step < num_warmup_steps:
38
+ return float(current_step) / float(max(1, num_warmup_steps))
39
+
40
+ return 1.0
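
These lambdas are designed to be passed to `torch.optim.lr_scheduler.LambdaLR`. A minimal sketch of wiring up the cosine schedule; the model, optimizer settings, and step counts are placeholder assumptions:

```python
from functools import partial

import torch
from torch.optim.lr_scheduler import LambdaLR

from fish_speech.scheduler import get_cosine_schedule_with_warmup_lr_lambda

model = torch.nn.Linear(4, 4)  # placeholder model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

lr_lambda = partial(
    get_cosine_schedule_with_warmup_lr_lambda,
    num_warmup_steps=0.02,      # float mode: warm up over 2% of the training steps
    num_training_steps=10_000,  # assumed total step count
    final_lr_ratio=0.1,         # never decay below 10% of the base lr
)
scheduler = LambdaLR(optimizer, lr_lambda)

for step in range(10_000):
    # ... forward / backward omitted ...
    optimizer.step()
    scheduler.step()  # base lr is multiplied by lr_lambda(current_step)
```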
fish_speech/text/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- from .clean import clean_text
2
- from .spliter import split_text
3
-
4
- __all__ = ["clean_text", "split_text"]
 
1
+ from .clean import clean_text
2
+ from .spliter import split_text
3
+
4
+ __all__ = ["clean_text", "split_text"]
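
A usage sketch for the two re-exported helpers. The call signatures below (a single text argument for `clean_text`, text plus a target chunk length for `split_text`) are assumptions for illustration and may differ from the actual API:

```python
from fish_speech.text import clean_text, split_text

raw = "Fish Speech reads long passages, so the text is cleaned and split into chunks first."

cleaned = clean_text(raw)          # assumed: strip/normalize unwanted characters
chunks = split_text(cleaned, 200)  # assumed: split into roughly 200-character chunks
for chunk in chunks:
    print(chunk)
```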
fish_speech/text/chn_text_norm/.gitignore CHANGED
@@ -1,114 +1,114 @@
1
- # Byte-compiled / optimized / DLL files
2
- __pycache__/
3
- *.py[cod]
4
- *$py.class
5
-
6
- # C extensions
7
- *.so
8
-
9
- # Distribution / packaging
10
- .Python
11
- build/
12
- develop-eggs/
13
- dist/
14
- downloads/
15
- eggs/
16
- .eggs/
17
- lib/
18
- lib64/
19
- parts/
20
- sdist/
21
- var/
22
- wheels/
23
- *.egg-info/
24
- .installed.cfg
25
- *.egg
26
- MANIFEST
27
-
28
- # PyInstaller
29
- # Usually these files are written by a python script from a template
30
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
31
- *.manifest
32
- *.spec
33
-
34
- # Installer logs
35
- pip-log.txt
36
- pip-delete-this-directory.txt
37
-
38
- # Unit test / coverage reports
39
- htmlcov/
40
- .tox/
41
- .coverage
42
- .coverage.*
43
- .cache
44
- nosetests.xml
45
- coverage.xml
46
- *.cover
47
- .hypothesis/
48
- .pytest_cache/
49
-
50
- # Translations
51
- *.mo
52
- *.pot
53
-
54
- # Django stuff:
55
- *.log
56
- local_settings.py
57
- db.sqlite3
58
-
59
- # Flask stuff:
60
- instance/
61
- .webassets-cache
62
-
63
- # Scrapy stuff:
64
- .scrapy
65
-
66
- # Sphinx documentation
67
- docs/_build/
68
-
69
- # PyBuilder
70
- target/
71
-
72
- # Jupyter Notebook
73
- .ipynb_checkpoints
74
-
75
- # pyenv
76
- .python-version
77
-
78
- # celery beat schedule file
79
- celerybeat-schedule
80
-
81
- # SageMath parsed files
82
- *.sage.py
83
-
84
- # Environments
85
- .env
86
- .venv
87
- env/
88
- venv/
89
- ENV/
90
- env.bak/
91
- venv.bak/
92
-
93
- # Spyder project settings
94
- .spyderproject
95
- .spyproject
96
-
97
- # Rope project settings
98
- .ropeproject
99
-
100
- # mkdocs documentation
101
- /site
102
-
103
- # mypy
104
- .mypy_cache/
105
-
106
- # JetBrains PyCharm
107
- .idea
108
-
109
- # Customize
110
- references
111
- url.txt
112
-
113
- # Git
114
- .git
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+ MANIFEST
27
+
28
+ # PyInstaller
29
+ # Usually these files are written by a python script from a template
30
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
31
+ *.manifest
32
+ *.spec
33
+
34
+ # Installer logs
35
+ pip-log.txt
36
+ pip-delete-this-directory.txt
37
+
38
+ # Unit test / coverage reports
39
+ htmlcov/
40
+ .tox/
41
+ .coverage
42
+ .coverage.*
43
+ .cache
44
+ nosetests.xml
45
+ coverage.xml
46
+ *.cover
47
+ .hypothesis/
48
+ .pytest_cache/
49
+
50
+ # Translations
51
+ *.mo
52
+ *.pot
53
+
54
+ # Django stuff:
55
+ *.log
56
+ local_settings.py
57
+ db.sqlite3
58
+
59
+ # Flask stuff:
60
+ instance/
61
+ .webassets-cache
62
+
63
+ # Scrapy stuff:
64
+ .scrapy
65
+
66
+ # Sphinx documentation
67
+ docs/_build/
68
+
69
+ # PyBuilder
70
+ target/
71
+
72
+ # Jupyter Notebook
73
+ .ipynb_checkpoints
74
+
75
+ # pyenv
76
+ .python-version
77
+
78
+ # celery beat schedule file
79
+ celerybeat-schedule
80
+
81
+ # SageMath parsed files
82
+ *.sage.py
83
+
84
+ # Environments
85
+ .env
86
+ .venv
87
+ env/
88
+ venv/
89
+ ENV/
90
+ env.bak/
91
+ venv.bak/
92
+
93
+ # Spyder project settings
94
+ .spyderproject
95
+ .spyproject
96
+
97
+ # Rope project settings
98
+ .ropeproject
99
+
100
+ # mkdocs documentation
101
+ /site
102
+
103
+ # mypy
104
+ .mypy_cache/
105
+
106
+ # JetBrains PyCharm
107
+ .idea
108
+
109
+ # Customize
110
+ references
111
+ url.txt
112
+
113
+ # Git
114
+ .git
fish_speech/text/chn_text_norm/README.md CHANGED
@@ -1,36 +1,36 @@
1
- # This account is no longer in use, see [Atomicoo](https://github.com/atomicoo) for my latest works.
2
-
3
- # Chn Text Norm
4
-
5
- this is a repository for chinese text normalization (no longer maintained).
6
-
7
- ## Quick Start ##
8
-
9
- ### Git Clone Repo ###
10
-
11
- git clone this repo to the root directory of your project which need to use it.
12
-
13
- cd /path/to/proj
14
- git clone https://github.com/Joee1995/chn-text-norm.git
15
-
16
- after that, your doc tree should be:
17
- ```
18
- proj # root of your project
19
- |--- chn_text_norm # this chn-text-norm tool
20
- |--- text.py
21
- |--- ...
22
- |--- text_normalize.py # your text normalization code
23
- |--- ...
24
- ```
25
-
26
- ### How to Use ? ###
27
-
28
- # text_normalize.py
29
- from chn_text_norm.text import *
30
-
31
- raw_text = 'your raw text'
32
- text = Text(raw_text=raw_text).normalize()
33
-
34
- ### How to add quantums ###
35
-
36
- 打开test.py,然后你就知道怎么做了。
 
1
+ # This account is no longer in use; see [Atomicoo](https://github.com/atomicoo) for my latest work.
2
+
3
+ # Chn Text Norm
4
+
5
+ This is a repository for Chinese text normalization (no longer maintained).
6
+
7
+ ## Quick Start ##
8
+
9
+ ### Git Clone Repo ###
10
+
11
+ Clone this repo into the root directory of the project that needs to use it.
12
+
13
+ cd /path/to/proj
14
+ git clone https://github.com/Joee1995/chn-text-norm.git
15
+
16
+ After that, your directory tree should look like this:
17
+ ```
18
+ proj # root of your project
19
+ |--- chn_text_norm # this chn-text-norm tool
20
+ |--- text.py
21
+ |--- ...
22
+ |--- text_normalize.py # your text normalization code
23
+ |--- ...
24
+ ```
25
+
26
+ ### How to Use? ###
27
+
28
+ # text_normalize.py
29
+ from chn_text_norm.text import *
30
+
31
+ raw_text = 'your raw text'
32
+ text = Text(raw_text=raw_text).normalize()
33
+
34
+ ### How to add quantums ###
35
+
36
+ Open test.py and you will see how it is done.
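
Within this repository the normalizer is vendored under `fish_speech.text.chn_text_norm`, so the import path differs slightly from the README above. A minimal sketch (the sample sentence is illustrative):

```python
from fish_speech.text.chn_text_norm.text import Text

raw_text = "这本书共计230页,定价3.5元。"
normalized = Text(raw_text=raw_text).normalize()
print(normalized)  # Arabic numerals are expected to be expanded into Chinese numerals
```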
fish_speech/text/chn_text_norm/basic_class.py CHANGED
@@ -1,172 +1,172 @@
1
- # -*- coding: utf-8 -*-
2
- """基本类
3
- 中文字符类
4
- 中文数字/数位类
5
- 中文数字类
6
- 中文数位类
7
- 中文数字系统类
8
- 中文数学符号类
9
- *中文其他符号类
10
- """
11
-
12
- __author__ = "Zhiyang Zhou <[email protected]>"
13
- __data__ = "2019-05-02"
14
-
15
- from fish_speech.text.chn_text_norm.basic_constant import NUMBERING_TYPES
16
-
17
-
18
- class ChineseChar(object):
19
- """
20
- 中文字符
21
- 每个字符对应简体和繁体,
22
- e.g. 简体 = '负', 繁体 = '負'
23
- 转换时可转换为简体或繁体
24
- """
25
-
26
- def __init__(self, simplified, traditional):
27
- self.simplified = simplified
28
- self.traditional = traditional
29
- self.__repr__ = self.__str__
30
-
31
- def __str__(self):
32
- return self.simplified or self.traditional or None
33
-
34
- def __repr__(self):
35
- return self.__str__()
36
-
37
-
38
- class ChineseNumberUnit(ChineseChar):
39
- """
40
- 中文数字/数位字符
41
- 每个字符除繁简体外还有一个额外的大写字符
42
- e.g. '陆' 和 '陸'
43
- """
44
-
45
- def __init__(self, power, simplified, traditional, big_s, big_t):
46
- super(ChineseNumberUnit, self).__init__(simplified, traditional)
47
- self.power = power
48
- self.big_s = big_s
49
- self.big_t = big_t
50
-
51
- def __str__(self):
52
- return "10^{}".format(self.power)
53
-
54
- @classmethod
55
- def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False):
56
-
57
- if small_unit:
58
- return ChineseNumberUnit(
59
- power=index + 1,
60
- simplified=value[0],
61
- traditional=value[1],
62
- big_s=value[1],
63
- big_t=value[1],
64
- )
65
- elif numbering_type == NUMBERING_TYPES[0]:
66
- return ChineseNumberUnit(
67
- power=index + 8,
68
- simplified=value[0],
69
- traditional=value[1],
70
- big_s=value[0],
71
- big_t=value[1],
72
- )
73
- elif numbering_type == NUMBERING_TYPES[1]:
74
- return ChineseNumberUnit(
75
- power=(index + 2) * 4,
76
- simplified=value[0],
77
- traditional=value[1],
78
- big_s=value[0],
79
- big_t=value[1],
80
- )
81
- elif numbering_type == NUMBERING_TYPES[2]:
82
- return ChineseNumberUnit(
83
- power=pow(2, index + 3),
84
- simplified=value[0],
85
- traditional=value[1],
86
- big_s=value[0],
87
- big_t=value[1],
88
- )
89
- else:
90
- raise ValueError(
91
- "Counting type should be in {0} ({1} provided).".format(
92
- NUMBERING_TYPES, numbering_type
93
- )
94
- )
95
-
96
-
97
- class ChineseNumberDigit(ChineseChar):
98
- """
99
- 中文数字字符
100
- """
101
-
102
- def __init__(
103
- self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None
104
- ):
105
- super(ChineseNumberDigit, self).__init__(simplified, traditional)
106
- self.value = value
107
- self.big_s = big_s
108
- self.big_t = big_t
109
- self.alt_s = alt_s
110
- self.alt_t = alt_t
111
-
112
- def __str__(self):
113
- return str(self.value)
114
-
115
- @classmethod
116
- def create(cls, i, v):
117
- return ChineseNumberDigit(i, v[0], v[1], v[2], v[3])
118
-
119
-
120
- class ChineseMath(ChineseChar):
121
- """
122
- 中文数位字符
123
- """
124
-
125
- def __init__(self, simplified, traditional, symbol, expression=None):
126
- super(ChineseMath, self).__init__(simplified, traditional)
127
- self.symbol = symbol
128
- self.expression = expression
129
- self.big_s = simplified
130
- self.big_t = traditional
131
-
132
-
133
- CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath
134
-
135
-
136
- class NumberSystem(object):
137
- """
138
- 中文数字系统
139
- """
140
-
141
- pass
142
-
143
-
144
- class MathSymbol(object):
145
- """
146
- 用于中文数字系统的数学符号 (繁/简体), e.g.
147
- positive = ['正', '正']
148
- negative = ['负', '負']
149
- point = ['点', '點']
150
- """
151
-
152
- def __init__(self, positive, negative, point):
153
- self.positive = positive
154
- self.negative = negative
155
- self.point = point
156
-
157
- def __iter__(self):
158
- for v in self.__dict__.values():
159
- yield v
160
-
161
-
162
- # class OtherSymbol(object):
163
- # """
164
- # 其他符号
165
- # """
166
- #
167
- # def __init__(self, sil):
168
- # self.sil = sil
169
- #
170
- # def __iter__(self):
171
- # for v in self.__dict__.values():
172
- # yield v
 
1
+ # -*- coding: utf-8 -*-
2
+ """基本类
3
+ 中文字符类
4
+ 中文数字/数位类
5
+ 中文数字类
6
+ 中文数位类
7
+ 中文数字系统类
8
+ 中文数学符号类
9
+ *中文其他符号类
10
+ """
11
+
12
+ __author__ = "Zhiyang Zhou <[email protected]>"
13
+ __data__ = "2019-05-02"
14
+
15
+ from fish_speech.text.chn_text_norm.basic_constant import NUMBERING_TYPES
16
+
17
+
18
+ class ChineseChar(object):
19
+ """
20
+ 中文字符
21
+ 每个字符对应简体和繁体,
22
+ e.g. 简体 = '负', 繁体 = '負'
23
+ 转换时可转换为简体或繁体
24
+ """
25
+
26
+ def __init__(self, simplified, traditional):
27
+ self.simplified = simplified
28
+ self.traditional = traditional
29
+ self.__repr__ = self.__str__
30
+
31
+ def __str__(self):
32
+ return self.simplified or self.traditional or None
33
+
34
+ def __repr__(self):
35
+ return self.__str__()
36
+
37
+
38
+ class ChineseNumberUnit(ChineseChar):
39
+ """
40
+ 中文数字/数位字符
41
+ 每个字符除繁简体外还有一个额外的大写字符
42
+ e.g. '陆' 和 '陸'
43
+ """
44
+
45
+ def __init__(self, power, simplified, traditional, big_s, big_t):
46
+ super(ChineseNumberUnit, self).__init__(simplified, traditional)
47
+ self.power = power
48
+ self.big_s = big_s
49
+ self.big_t = big_t
50
+
51
+ def __str__(self):
52
+ return "10^{}".format(self.power)
53
+
54
+ @classmethod
55
+ def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False):
56
+
57
+ if small_unit:
58
+ return ChineseNumberUnit(
59
+ power=index + 1,
60
+ simplified=value[0],
61
+ traditional=value[1],
62
+ big_s=value[1],
63
+ big_t=value[1],
64
+ )
65
+ elif numbering_type == NUMBERING_TYPES[0]:
66
+ return ChineseNumberUnit(
67
+ power=index + 8,
68
+ simplified=value[0],
69
+ traditional=value[1],
70
+ big_s=value[0],
71
+ big_t=value[1],
72
+ )
73
+ elif numbering_type == NUMBERING_TYPES[1]:
74
+ return ChineseNumberUnit(
75
+ power=(index + 2) * 4,
76
+ simplified=value[0],
77
+ traditional=value[1],
78
+ big_s=value[0],
79
+ big_t=value[1],
80
+ )
81
+ elif numbering_type == NUMBERING_TYPES[2]:
82
+ return ChineseNumberUnit(
83
+ power=pow(2, index + 3),
84
+ simplified=value[0],
85
+ traditional=value[1],
86
+ big_s=value[0],
87
+ big_t=value[1],
88
+ )
89
+ else:
90
+ raise ValueError(
91
+ "Counting type should be in {0} ({1} provided).".format(
92
+ NUMBERING_TYPES, numbering_type
93
+ )
94
+ )
95
+
96
+
97
+ class ChineseNumberDigit(ChineseChar):
98
+ """
99
+ 中文数字字符
100
+ """
101
+
102
+ def __init__(
103
+ self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None
104
+ ):
105
+ super(ChineseNumberDigit, self).__init__(simplified, traditional)
106
+ self.value = value
107
+ self.big_s = big_s
108
+ self.big_t = big_t
109
+ self.alt_s = alt_s
110
+ self.alt_t = alt_t
111
+
112
+ def __str__(self):
113
+ return str(self.value)
114
+
115
+ @classmethod
116
+ def create(cls, i, v):
117
+ return ChineseNumberDigit(i, v[0], v[1], v[2], v[3])
118
+
119
+
120
+ class ChineseMath(ChineseChar):
121
+ """
122
+ 中文数位字符
123
+ """
124
+
125
+ def __init__(self, simplified, traditional, symbol, expression=None):
126
+ super(ChineseMath, self).__init__(simplified, traditional)
127
+ self.symbol = symbol
128
+ self.expression = expression
129
+ self.big_s = simplified
130
+ self.big_t = traditional
131
+
132
+
133
+ CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath
134
+
135
+
136
+ class NumberSystem(object):
137
+ """
138
+ 中文数字系统
139
+ """
140
+
141
+ pass
142
+
143
+
144
+ class MathSymbol(object):
145
+ """
146
+ 用于中文数字系统的数学符号 (繁/简体), e.g.
147
+ positive = ['正', '正']
148
+ negative = ['负', '負']
149
+ point = ['点', '點']
150
+ """
151
+
152
+ def __init__(self, positive, negative, point):
153
+ self.positive = positive
154
+ self.negative = negative
155
+ self.point = point
156
+
157
+ def __iter__(self):
158
+ for v in self.__dict__.values():
159
+ yield v
160
+
161
+
162
+ # class OtherSymbol(object):
163
+ # """
164
+ # 其他符号
165
+ # """
166
+ #
167
+ # def __init__(self, sil):
168
+ # self.sil = sil
169
+ #
170
+ # def __iter__(self):
171
+ # for v in self.__dict__.values():
172
+ # yield v
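
To make the constructors above concrete, a small sketch that builds a digit and a math symbol directly with the signatures defined in this file; the specific characters and the `operator.neg` expression are illustrative choices, not values used by the normalizer itself:

```python
import operator

from fish_speech.text.chn_text_norm.basic_class import ChineseMath, ChineseNumberDigit

# A digit pairs its numeric value with simplified/traditional and "big" (formal) written forms.
two = ChineseNumberDigit(2, "二", "二", "贰", "貳")
print(two.value, str(two), two.big_t)  # -> 2 2 貳

# A math symbol stores its written forms, a printable symbol, and an optional callable expression.
minus = ChineseMath("负", "負", "-", operator.neg)
print(minus.symbol, minus.expression(3))  # -> - -3
```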