XXXXRT666 committed on
Commit d4d21ad · 1 Parent(s): d0754c2
This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitattributes +37 -0
  2. .gitignore +196 -0
  3. .pre-commit-config.yaml +15 -0
  4. GPT_SoVITS/Accelerate/MLX/__init__.py +12 -0
  5. GPT_SoVITS/Accelerate/MLX/backends/mlx_quantized.py +181 -0
  6. GPT_SoVITS/Accelerate/MLX/backends/mlx_static.py +99 -0
  7. GPT_SoVITS/Accelerate/MLX/backends/mlx_varlen.py +103 -0
  8. GPT_SoVITS/Accelerate/MLX/sample_funcs_mlx.py +65 -0
  9. GPT_SoVITS/Accelerate/MLX/structs_mlx.py +152 -0
  10. GPT_SoVITS/Accelerate/MLX/t2s_engine_mlx.py +238 -0
  11. GPT_SoVITS/Accelerate/MLX/t2s_model_abc.py +530 -0
  12. GPT_SoVITS/Accelerate/PyTorch/__init__.py +30 -0
  13. GPT_SoVITS/Accelerate/PyTorch/backends/flash_attn_varlen_cuda_graph.py +158 -0
  14. GPT_SoVITS/Accelerate/PyTorch/backends/mps_flash_attn_varlen.py +166 -0
  15. GPT_SoVITS/Accelerate/PyTorch/backends/sage_attn_varlen_cuda_graph.py +175 -0
  16. GPT_SoVITS/Accelerate/PyTorch/backends/torch_static_cuda_graph.py +166 -0
  17. GPT_SoVITS/Accelerate/PyTorch/backends/torch_varlen.py +145 -0
  18. GPT_SoVITS/Accelerate/PyTorch/export.py +467 -0
  19. GPT_SoVITS/Accelerate/PyTorch/nn.py +69 -0
  20. GPT_SoVITS/Accelerate/PyTorch/sample_funcs.py +67 -0
  21. GPT_SoVITS/Accelerate/PyTorch/structs.py +151 -0
  22. GPT_SoVITS/Accelerate/PyTorch/t2s_engine.py +223 -0
  23. GPT_SoVITS/Accelerate/PyTorch/t2s_model_abc.py +672 -0
  24. GPT_SoVITS/Accelerate/__init__.py +30 -0
  25. GPT_SoVITS/Accelerate/logger.py +203 -0
  26. GPT_SoVITS/configs/.gitignore +1 -0
  27. GPT_SoVITS/configs/s2.json +91 -0
  28. GPT_SoVITS/configs/s2v2Pro.json +91 -0
  29. GPT_SoVITS/configs/s2v2ProPlus.json +91 -0
  30. GPT_SoVITS/eres2net/ERes2NetV2.py +252 -0
  31. GPT_SoVITS/eres2net/fusion.py +27 -0
  32. GPT_SoVITS/eres2net/kaldi.py +844 -0
  33. GPT_SoVITS/eres2net/pooling_layers.py +101 -0
  34. GPT_SoVITS/f5_tts/model/__init__.py +3 -0
  35. GPT_SoVITS/f5_tts/model/backbones/README.md +20 -0
  36. GPT_SoVITS/f5_tts/model/backbones/dit.py +193 -0
  37. GPT_SoVITS/f5_tts/model/backbones/mmdit.py +144 -0
  38. GPT_SoVITS/f5_tts/model/backbones/unett.py +218 -0
  39. GPT_SoVITS/f5_tts/model/modules.py +665 -0
  40. GPT_SoVITS/feature_extractor/__init__.py +3 -0
  41. GPT_SoVITS/feature_extractor/cnhubert.py +46 -0
  42. GPT_SoVITS/inference_webui.py +1104 -0
  43. GPT_SoVITS/module/attentions.py +658 -0
  44. GPT_SoVITS/module/attentions_onnx.py +385 -0
  45. GPT_SoVITS/module/commons.py +185 -0
  46. GPT_SoVITS/module/core_vq.py +365 -0
  47. GPT_SoVITS/module/data_utils.py +1073 -0
  48. GPT_SoVITS/module/losses.py +70 -0
  49. GPT_SoVITS/module/mel_processing.py +142 -0
  50. GPT_SoVITS/module/models.py +1411 -0
.gitattributes CHANGED
@@ -1 +1,38 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+GPT_SoVITS/text/G2PWModel/* filter=lfs diff=lfs merge=lfs -text
+GPT_SoVITS/text/G2PWModel/** filter=lfs diff=lfs merge=lfs -text
 GPT_SoVITS/text/ja_userdic/userdict.csv filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,196 @@
+.DS_Store
+.vscode
+__pycache__
+*.pyc
+env
+runtime
+.idea
+output
+logs
+SoVITS_weights*/
+GPT_weights*/
+TEMP
+weight.json
+ffmpeg*
+ffprobe*
+cfg.json
+speakers.json
+ref_audios
+tools/AP_BWE/24kto48k/*
+!tools/AP_BWE/24kto48k/readme.txt
+onnx_export
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# UV
+# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+#uv.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
.pre-commit-config.yaml ADDED
@@ -0,0 +1,15 @@
+ci:
+  autoupdate_schedule: monthly
+
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.11.7
+    hooks:
+      # Run the linter.
+      - id: ruff
+        types_or: [ python, pyi ]
+        args: [ --fix , "--exit-zero" ]
+      # Run the formatter.
+      - id: ruff-format
+        types_or: [ python, pyi ]
+        args: [ --line-length, "120", --target-version, "py310" ]
GPT_SoVITS/Accelerate/MLX/__init__.py ADDED
@@ -0,0 +1,12 @@
+import importlib.util
+import platform
+
+if importlib.util.find_spec("mlx") is not None and platform.system() == "Darwin":
+    from .sample_funcs_mlx import sample_naive as sample_naive_mlx
+    from .t2s_engine_mlx import T2SEngine as T2SEngineMLX
+
+    backends = ["mlx_static", "mlx_quantized_mxfp4", "mlx_quantized_affine", "mlx_varlen"]
+else:
+    backends = []
+
+__all__ = ["T2SEngineMLX", "sample_naive_mlx", "backends"]
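The import guard above only exposes the MLX engine when the mlx package is importable and the host is macOS; elsewhere backends stays empty. A minimal probing sketch, assuming the repository root is on sys.path (the import path is an assumption, not shown in this diff):

import platform

from GPT_SoVITS.Accelerate import MLX  # assumed package layout

if MLX.backends:
    # Apple Silicon with mlx installed: MLX backends are selectable
    print("MLX backends:", MLX.backends)
else:
    print(f"MLX unavailable on {platform.system()}; only the PyTorch backends apply")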
GPT_SoVITS/Accelerate/MLX/backends/mlx_quantized.py ADDED
@@ -0,0 +1,181 @@
+from __future__ import annotations
+
+from typing import cast
+
+import mlx.core as mx
+import mlx.nn as nn
+
+from ..structs_mlx import KVCacheQ
+from ..t2s_model_abc import (
+    AttentionABC,
+    KVCache,
+    KVCacheHND,
+    T2SDecoderABC,
+    TransformerBlockABC,
+    TransformerDecoderABC,
+)
+
+Array = mx.array
+
+
+class Attention(AttentionABC):
+    def __init__(self, n_head: int, hidden_dim: int, max_seq_length: int):
+        super().__init__(n_head, hidden_dim, max_seq_length)
+        self.kc_class = KVCacheHND
+
+    @staticmethod
+    def quantized_scaled_dot_product_attention(
+        queries: Array,
+        q_keys: tuple[Array, Array, Array],
+        q_values: tuple[Array, Array, Array],
+        scale: float,
+        mask: Array,
+        group_size: int = 32,
+        bits: int = 8,
+    ) -> Array:
+        queries *= scale
+
+        scores = mx.quantized_matmul(queries, *q_keys, transpose=True, group_size=group_size, bits=bits)
+        scores = mx.where(mask, scores, -mx.inf)
+        scores = mx.softmax(scores, axis=-1, precise=True)  # type: ignore
+        out = mx.quantized_matmul(scores, *q_values, transpose=False, group_size=group_size, bits=bits)
+
+        return out
+
+    def __call__(self, x: Array, input_pos: Array, kv_cache: KVCache | KVCacheQ, cache_idx: Array, attn_mask: Array):
+        bsz, seqlen, _ = cast(tuple[int, ...], x.shape)
+
+        q, k, v = self.in_proj(x).split(3, axis=-1)
+
+        q, k, v = map(lambda x: x.reshape(bsz, seqlen, self.n_head, self.head_dim), (q, k, v))
+
+        q, k, v = map(lambda x: x.swapaxes(1, 2), (q, k, v))
+
+        kv_cache = self.kc_class.update_cache(input_pos, k, v, kv_cache, cache_idx)
+        assert len(kv_cache) == 2
+
+        max_idx = int(input_pos.max())
+
+        q, k, v = map(lambda x: x[..., :max_idx, :], (q, *kv_cache))
+
+        mask = attn_mask[..., :max_idx]
+
+        attn = mx.fast.scaled_dot_product_attention(q, k, v, scale=self.scale, mask=mask)
+
+        attn = attn.swapaxes(1, 2).reshape(bsz, seqlen, self.hidden_dim)
+
+        attn = self.out_proj(attn)
+
+        return attn
+
+    # def __call__(self, x: Array, input_pos: Array, kv_cache: KVCache | KVCacheQ, cache_idx: Array, attn_mask: Array):
+    #     bsz, seqlen, _ = cast(tuple[int, ...], x.shape)
+
+    #     q, k, v = self.in_proj(x).split(3, axis=-1)
+
+    #     q, k, v = map(lambda x: x.reshape(bsz, seqlen, self.n_head, self.head_dim), (q, k, v))
+
+    #     q, k, v = map(lambda x: x.swapaxes(1, 2), (q, k, v))
+
+    #     kv_cache = self.kc_class.update_cache(input_pos, k, v, kv_cache, cache_idx)
+
+    #     assert len(kv_cache) == 3
+    #     (k_q, k_s, k_b), (v_q, v_s, v_b), (group_size, bits) = kv_cache
+
+    #     k_q, k_s, k_b, v_q, v_s, v_b = map(lambda x: x[..., : int(input_pos.max()), :], (k_q, k_s, k_b, v_q, v_s, v_b))
+
+    #     mask = attn_mask[..., : int(input_pos.max())]
+
+    #     attn = Attention.quantized_scaled_dot_product_attention(
+    #         q,
+    #         (k_q, k_s, k_b),
+    #         (v_q, v_s, v_b),
+    #         self.scale,
+    #         mask,
+    #         group_size,
+    #         bits,
+    #     )
+
+    #     attn = attn.swapaxes(1, 2).reshape(bsz, seqlen, self.hidden_dim)
+
+    #     output = self.out_proj(attn)
+
+    #     return output
+
+
+class TransformerBlock(TransformerBlockABC):
+    def __init__(self, n_head: int, ffn_dim: int, hidden_dim: int, max_seq_length: int, *args, **kwds) -> None:
+        super().__init__(n_head, ffn_dim, hidden_dim, max_seq_length, *args, **kwds)
+
+        self.attention = Attention(n_head, hidden_dim, max_seq_length, *args, **kwds)
+
+
+class TransformerDecoder(TransformerDecoderABC):
+    def __init__(
+        self,
+        hidden_dim: int,
+        n_layer: int,
+        n_head: int,
+        ffn_dim: int,
+        vocab_size: int,
+        max_seq_length: int,
+        max_batch_size: int,
+        *args,
+        **kwds,
+    ) -> None:
+        super().__init__(
+            hidden_dim,
+            n_layer,
+            n_head,
+            ffn_dim,
+            vocab_size,
+            max_seq_length,
+            max_batch_size,
+            *args,
+            **kwds,
+        )
+
+        self.layers = [
+            TransformerBlock(
+                n_head,
+                ffn_dim,
+                hidden_dim,
+                max_seq_length,
+                *args,
+                **kwds,
+            )
+            for _ in range(n_layer)
+        ]
+
+
+class T2SDecoder(T2SDecoderABC):
+    def __init__(
+        self,
+        config: dict,
+        max_seq_length: int = 2000,
+        max_batch_size: int = 10,
+    ) -> None:
+        super().__init__(config, max_seq_length, max_batch_size)
+
+        self.h = TransformerDecoder(
+            self.hidden_dim, self.n_layer, self.n_head, self.ffn_dim, self.vocab_size, max_seq_length, max_batch_size
+        )
+
+        self.kv_class = KVCacheHND
+        self.group_size = 32
+        self.bits = 8
+        self.mode = "affine"
+
+    def set_mode(self, mode: str):
+        assert mode in ["affine", "mxfp4"]
+        self.mode = mode
+        if self.mode == "mxfp4":
+            self.bits = 4
+        else:
+            self.bits = 8
+
+    def quantized(self):
+        nn.quantize(self, self.group_size, self.bits, mode=self.mode)
+        # for layer in self.h.layers:
+        #     nn.quantize(layer.feed_forward, self.group_size, self.bits)
+        #     nn.quantize(layer.attention, self.group_size, self.bits)
GPT_SoVITS/Accelerate/MLX/backends/mlx_static.py ADDED
@@ -0,0 +1,99 @@
+from __future__ import annotations
+
+from typing import cast
+
+import mlx.core as mx
+
+from ..structs_mlx import KVCache, KVCacheQ
+from ..t2s_model_abc import (
+    AttentionABC,
+    KVCacheHND,
+    T2SDecoderABC,
+    TransformerBlockABC,
+    TransformerDecoderABC,
+)
+
+Array = mx.array
+
+
+class Attention(AttentionABC):
+    def __init__(self, n_head: int, hidden_dim: int, max_seq_length: int):
+        super().__init__(n_head, hidden_dim, max_seq_length)
+        self.kc_class = KVCacheHND
+
+    def __call__(self, x: Array, input_pos: Array, kv_cache: KVCache | KVCacheQ, cache_idx: Array, attn_mask: Array):
+        bsz, seqlen, _ = cast(tuple[int, ...], x.shape)
+
+        q, k, v = self.in_proj(x).split(3, axis=-1)
+
+        q, k, v = map(lambda x: x.reshape(bsz, seqlen, self.n_head, self.head_dim), (q, k, v))
+
+        q, k, v = map(lambda x: x.swapaxes(1, 2), (q, k, v))
+
+        kv_cache = self.kc_class.update_cache(input_pos, k, v, kv_cache, cache_idx)
+        assert len(kv_cache) == 2
+
+        k, v = kv_cache
+
+        attn = mx.fast.scaled_dot_product_attention(q, k, v, scale=self.scale, mask=attn_mask)
+
+        attn = attn.swapaxes(1, 2).reshape(bsz, seqlen, self.hidden_dim)
+
+        attn = self.out_proj(attn)
+
+        return attn
+
+
+class TransformerBlock(TransformerBlockABC):
+    def __init__(self, n_head: int, ffn_dim: int, hidden_dim: int, max_seq_length: int) -> None:
+        super().__init__(n_head, ffn_dim, hidden_dim, max_seq_length)
+
+        self.attention = Attention(n_head, hidden_dim, max_seq_length)
+
+
+class TransformerDecoder(TransformerDecoderABC):
+    def __init__(
+        self,
+        hidden_dim: int,
+        n_layer: int,
+        n_head: int,
+        ffn_dim: int,
+        vocab_size: int,
+        max_seq_length: int,
+        max_batch_size: int,
+    ) -> None:
+        super().__init__(
+            hidden_dim,
+            n_layer,
+            n_head,
+            ffn_dim,
+            vocab_size,
+            max_seq_length,
+            max_batch_size,
+        )
+
+        self.layers = [
+            TransformerBlock(
+                n_head,
+                ffn_dim,
+                hidden_dim,
+                max_seq_length,
+            )
+            for _ in range(n_layer)
+        ]
+
+
+class T2SDecoder(T2SDecoderABC):
+    def __init__(
+        self,
+        config: dict,
+        max_seq_length: int = 2000,
+        max_batch_size: int = 10,
+    ) -> None:
+        super().__init__(config, max_seq_length, max_batch_size)
+
+        self.h = TransformerDecoder(
+            self.hidden_dim, self.n_layer, self.n_head, self.ffn_dim, self.vocab_size, max_seq_length, max_batch_size
+        )
+
+        self.kv_class = KVCacheHND
GPT_SoVITS/Accelerate/MLX/backends/mlx_varlen.py ADDED
@@ -0,0 +1,103 @@
+from __future__ import annotations
+
+from typing import cast
+
+import mlx.core as mx
+
+from ..structs_mlx import KVCache, KVCacheQ
+from ..t2s_model_abc import (
+    AttentionABC,
+    KVCacheHND,
+    T2SDecoderABC,
+    TransformerBlockABC,
+    TransformerDecoderABC,
+)
+
+Array = mx.array
+
+
+class Attention(AttentionABC):
+    def __init__(self, n_head: int, hidden_dim: int, max_seq_length: int):
+        super().__init__(n_head, hidden_dim, max_seq_length)
+        self.kc_class = KVCacheHND
+
+    def __call__(self, x: Array, input_pos: Array, kv_cache: KVCache | KVCacheQ, cache_idx: Array, attn_mask: Array):
+        bsz, seqlen, _ = cast(tuple[int, ...], x.shape)
+
+        q, k, v = self.in_proj(x).split(3, axis=-1)
+
+        q, k, v = map(lambda x: x.reshape(bsz, seqlen, self.n_head, self.head_dim), (q, k, v))
+
+        q, k, v = map(lambda x: x.swapaxes(1, 2), (q, k, v))
+
+        kv_cache = self.kc_class.update_cache(input_pos, k, v, kv_cache, cache_idx)
+        assert len(kv_cache) == 2
+
+        max_idx = int(input_pos.max())
+
+        q, k, v = map(lambda x: x[..., :max_idx, :], (q, *kv_cache))
+
+        mask = attn_mask[..., :max_idx]
+
+        attn = mx.fast.scaled_dot_product_attention(q, k, v, scale=self.scale, mask=mask)
+
+        attn = attn.swapaxes(1, 2).reshape(bsz, seqlen, self.hidden_dim)
+
+        attn = self.out_proj(attn)
+
+        return attn
+
+
+class TransformerBlock(TransformerBlockABC):
+    def __init__(self, n_head: int, ffn_dim: int, hidden_dim: int, max_seq_length: int) -> None:
+        super().__init__(n_head, ffn_dim, hidden_dim, max_seq_length)
+
+        self.attention = Attention(n_head, hidden_dim, max_seq_length)
+
+
+class TransformerDecoder(TransformerDecoderABC):
+    def __init__(
+        self,
+        hidden_dim: int,
+        n_layer: int,
+        n_head: int,
+        ffn_dim: int,
+        vocab_size: int,
+        max_seq_length: int,
+        max_batch_size: int,
+    ) -> None:
+        super().__init__(
+            hidden_dim,
+            n_layer,
+            n_head,
+            ffn_dim,
+            vocab_size,
+            max_seq_length,
+            max_batch_size,
+        )
+
+        self.layers = [
+            TransformerBlock(
+                n_head,
+                ffn_dim,
+                hidden_dim,
+                max_seq_length,
+            )
+            for _ in range(n_layer)
+        ]
+
+
+class T2SDecoder(T2SDecoderABC):
+    def __init__(
+        self,
+        config: dict,
+        max_seq_length: int = 2000,
+        max_batch_size: int = 10,
+    ) -> None:
+        super().__init__(config, max_seq_length, max_batch_size)
+
+        self.h = TransformerDecoder(
+            self.hidden_dim, self.n_layer, self.n_head, self.ffn_dim, self.vocab_size, max_seq_length, max_batch_size
+        )
+
+        self.kv_class = KVCacheHND
GPT_SoVITS/Accelerate/MLX/sample_funcs_mlx.py ADDED
@@ -0,0 +1,65 @@
+from typing import Protocol, cast
+
+import mlx.core as mx
+
+Array = mx.array
+
+
+class SampleProtocolMLX(Protocol):
+    @staticmethod
+    def __call__(
+        logits: Array,
+        previous_tokens: Array,
+        temperature: float,
+        top_k: int,
+        top_p: float,
+        repetition_penalty: float,
+    ) -> Array: ...
+
+
+class sample_naive(SampleProtocolMLX):
+    # @partial(mx.compile)
+    @staticmethod
+    def __call__(
+        logits,
+        previous_tokens,
+        temperature,
+        top_k,
+        top_p,
+        repetition_penalty,
+    ):
+        if temperature <= 1e-5:
+            probs = mx.softmax(logits, axis=-1)
+            return mx.argmax(probs, axis=-1, keepdims=True).astype(mx.int32)
+
+        if repetition_penalty != 1.0:
+            batch_idx = mx.arange(cast(tuple[int, ...], previous_tokens.shape)[0])
+            previous_tokens = previous_tokens.astype(mx.int64)
+            selected_logists = logits[batch_idx, previous_tokens]
+            selected_logists = mx.where(
+                selected_logists < 0, selected_logists * repetition_penalty, selected_logists / repetition_penalty
+            )
+            logits[batch_idx, previous_tokens] = selected_logists
+
+        if top_p < 1.0:
+            sorted_indices = mx.argsort(-logits, axis=-1)
+            sorted_logits = mx.take_along_axis(logits, sorted_indices, axis=-1)
+            cum_probs = mx.cumsum(mx.softmax(sorted_logits, axis=-1), axis=-1)
+            sorted_indices_to_remove = cum_probs > top_p
+            sorted_indices_to_remove[:, -1] = False
+            indices_to_remove = mx.zeros_like(logits).astype(mx.bool_)
+            batch_indices = mx.arange(cast(tuple[int, ...], logits.shape)[0])[:, None]
+            indices_to_remove[batch_indices, sorted_indices] = sorted_indices_to_remove
+            logits = mx.where(indices_to_remove, -mx.inf, logits)
+
+        if temperature < 1.0:
+            logits = logits / temperature
+
+        v = mx.topk(logits, top_k)
+        pivot = mx.expand_dims(v[:, 0], -1)
+        logits = mx.where(logits < pivot, -mx.inf, logits)
+
+        gumbel_noise = mx.random.gumbel(shape=cast(tuple[int, ...], logits.shape), dtype=logits.dtype)
+        idx_next = mx.argmax(logits + gumbel_noise, axis=-1, keepdims=True).astype(mx.int32)
+
+        return idx_next
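sample_naive combines repetition penalty, nucleus (top-p) filtering, top-k filtering, and Gumbel-max sampling in a single call. A hedged usage sketch with made-up shapes (batch of 1, vocabulary of 1025; the import path assumes the repo root is on sys.path):

import mlx.core as mx

from GPT_SoVITS.Accelerate.MLX.sample_funcs_mlx import sample_naive  # assumed import path

sample = sample_naive()
logits = mx.random.normal(shape=(1, 1025))          # (batch, vocab_size), dummy logits
previous_tokens = mx.zeros((1, 8), dtype=mx.int32)  # tokens generated so far
next_token = sample(
    logits,
    previous_tokens,
    temperature=1.0,
    top_k=15,
    top_p=1.0,
    repetition_penalty=1.35,
)
print(next_token.shape, next_token.dtype)           # (1, 1) int32 token id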
GPT_SoVITS/Accelerate/MLX/structs_mlx.py ADDED
@@ -0,0 +1,152 @@
+"""
+Modified From https://github.com/XXXXRT666/GPT-SoVITS
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import List, MutableSequence, Protocol, TypeAlias, cast
+
+import mlx.core as mx
+import torch
+
+from ..PyTorch.structs import T2SRequest
+from .sample_funcs_mlx import SampleProtocolMLX, sample_naive
+
+Tensor = torch.Tensor
+Array = mx.array
+
+
+@dataclass(slots=True)
+class T2SRequestMLX:
+    x: List[Array]
+    x_lens: Array
+    prompts: Array
+    bert_feature: List[Array]
+    valid_length: int
+    top_k: int = 5
+    top_p: float = 1
+    early_stop_num: int = -1
+    temperature: float = 1.0
+    repetition_penalty: float = 1.35
+
+    @classmethod
+    def from_torch(cls, request: T2SRequest) -> T2SRequestMLX:
+        x = list(map(lambda tensor: mx.array(tensor.cpu()), request.x))
+        x_lens = mx.array(request.x_lens.cpu())
+        prompts = mx.array(request.prompts.cpu())
+        bert_feature = list(map(lambda tensor: mx.array(tensor.cpu()), request.bert_feature))
+
+        return cls(
+            x,
+            x_lens,
+            prompts,
+            bert_feature,
+            request.valid_length,
+            request.top_k,
+            request.top_p,
+            request.early_stop_num,
+            request.temperature,
+            request.repetition_penalty,
+        )
+
+
+KVCache: TypeAlias = tuple[Array, Array]
+KVCacheQ: TypeAlias = tuple[tuple[Array, Array, Array], tuple[Array, Array, Array], tuple[int, int]]
+
+
+class KVCacheProtocol(Protocol):
+    @staticmethod
+    def empty(kv_cache: KVCache | KVCacheQ) -> None: ...
+
+    @staticmethod
+    def update_cache(
+        input_pos: Array, k_val: Array, v_val: Array, kv_cache: KVCache | KVCacheQ, cache_idx: Array
+    ) -> KVCache | KVCacheQ: ...
+
+    @staticmethod
+    def prefill_kv(k_val: Array, v_val: Array, kv_cache: KVCache | KVCacheQ) -> None: ...
+
+    @staticmethod
+    def init_cache(
+        batch_size: int, max_seq_length: int, n_heads: int, head_dim: int, dtype: mx.Dtype, *args, **kwds
+    ) -> KVCache | KVCacheQ: ...
+
+
+class T2SDecoderProtocol(Protocol):
+    max_seq_length: int
+    EOS: int
+    n_head: int
+
+    def embed(self, x: list[Array], y: Array, bert_features: list[Array]) -> Array: ...
+
+
+class T2SSessionMLX:
+    def __init__(
+        self,
+        decoder: T2SDecoderProtocol,
+        request_torch: T2SRequest,
+        sample_func: type[SampleProtocolMLX] = sample_naive,
+        device: mx.Device = mx.Device(mx.cpu),
+        dtype: mx.Dtype = mx.float32,
+    ):
+        with mx.stream(device):
+            request = T2SRequestMLX.from_torch(request_torch)
+
+            self.decoder = decoder
+            self.request = request
+            self.device = device
+            self.dtype = dtype
+
+            bsz = len(request.x)
+            y_len: int = cast(tuple[int, ...], request.prompts.shape)[-1]
+            self.bsz = bsz
+            self.y_len = y_len
+
+            # Cache
+            self.kv_cache: MutableSequence[KVCache | KVCacheQ]
+            self.sample = sample_func()
+
+            # Forward args
+            self.x = [i.astype(mx.int32) for i in request.x]
+            self.x_lens = request.x_lens.astype(mx.int32)
+            self.y = mx.zeros((bsz, decoder.max_seq_length)).astype(mx.int32)
+            self.y[:, : cast(tuple[int, ...], request.prompts.shape)[-1]] = request.prompts.astype(mx.int32)
+            self.bert_feature = [i.astype(dtype) for i in request.bert_feature]
+
+            self.prefill_len = self.x_lens + cast(tuple[int, ...], request.prompts.shape)[1]
+
+            self.input_pos = mx.zeros_like(self.prefill_len)
+            self.input_pos += self.prefill_len
+
+            # EOS
+            self.completed = mx.array([False] * len(self.x)).astype(mx.bool_)
+            self.y_results: List[Array] = [None] * len(self.x)  # type: ignore
+
+            self.xy_pos = decoder.embed(self.x, request.prompts, self.bert_feature)
+
+            max_len = int(self.prefill_len.max(-1))
+            attn_mask = mx.zeros(shape=(bsz, max_len, max_len), dtype=mx.bool_)
+
+            for bs in range(bsz):
+                pos = int(self.x_lens[bs])
+                seq_len = pos + y_len
+
+                attn_mask[bs, :seq_len, :pos] = True
+
+                ar_mask = ~mx.triu(
+                    x=mx.ones(
+                        shape=(
+                            y_len,
+                            y_len,
+                        ),
+                        dtype=mx.bool_,
+                    ),
+                    k=1,
+                )
+                attn_mask[bs, pos:seq_len, pos:seq_len] = ar_mask
+
+            attn_mask = mx.repeat(mx.expand_dims(attn_mask, 1), decoder.n_head, 1)
+            self.attn_mask = attn_mask
+
+            mx.eval(self.attn_mask)
GPT_SoVITS/Accelerate/MLX/t2s_engine_mlx.py ADDED
@@ -0,0 +1,238 @@
+import gc
+import os
+import time
+import traceback
+from typing import cast
+
+import mlx.core as mx
+import torch
+from rich.progress import BarColumn, Progress, TextColumn
+
+from ..logger import SpeedColumnToken, console, logger
+from ..PyTorch.structs import T2SEngineProtocol, T2SRequest, T2SResult
+from .backends import mlx_quantized, mlx_static, mlx_varlen
+from .structs_mlx import T2SSessionMLX
+from .t2s_model_abc import T2SDecoderABC
+
+Array = mx.array
+Tensor = torch.Tensor
+
+
+class T2SEngine(T2SEngineProtocol):
+    def __init__(
+        self,
+        decoder_model: T2SDecoderABC,
+        device: mx.Device | str = mx.Device(mx.cpu),
+        dtype: torch.dtype | mx.Dtype = torch.float32,
+    ) -> None:
+        if isinstance(device, str):
+            match device:
+                case "mx.cpu":
+                    device = mx.Device(mx.cpu)
+                case "mx.gpu":
+                    device = mx.Device(mx.gpu)
+
+        match dtype:
+            case torch.float32:
+                dtype = mx.float32
+            case torch.float16:
+                dtype = mx.float16
+            case torch.bfloat16:
+                dtype = mx.bfloat16
+
+        device = cast(mx.Device, device)
+        dtype = cast(mx.Dtype, dtype)
+
+        assert device.type.value in {0, 1}
+        assert dtype in {mx.float16, mx.bfloat16, mx.float32}
+
+        self.device = device
+        self.dtype = dtype
+
+        mx.set_default_device(device)
+        decoder_model.set_dtype(self.dtype)
+
+        self.decoder_model: T2SDecoderABC = decoder_model
+        self.decoder_model.compile()
+
+    def _handle_request(self, request: T2SRequest):
+        decoder = self.decoder_model
+        session = T2SSessionMLX(decoder, request, device=self.device, dtype=self.dtype)
+        batch_idx = mx.arange(session.bsz)
+
+        t1 = 0.0
+        infer_speed = 0.0
+        infer_time = 0.0
+
+        with (
+            mx.stream(session.device),
+            Progress(
+                TextColumn("[cyan]{task.description}"),
+                BarColumn(),
+                TextColumn("{task.completed}/{task.total}"),
+                SpeedColumnToken(show_speed=True),
+                console=console,
+                transient=True,
+            ) as progress,
+        ):
+            max_token = min(2000 - int(session.input_pos.max()), 1500)
+
+            task = progress.add_task("T2S Decoding", total=max_token)
+            for idx in range(1500):
+                progress.update(task, advance=1)
+                if idx == 0:
+                    session.kv_cache = decoder.init_cache(session.bsz)
+                    xy_dec = decoder.h.prefill(
+                        session.xy_pos,
+                        session.attn_mask,
+                        session.kv_cache,
+                    )  # bs, seq_len, embed_dim
+                    xy_dec = xy_dec[None, batch_idx, session.input_pos - 1]
+                else:
+                    args, kwds = decoder.pre_forward(session)
+                    xy_dec = decoder.h(
+                        session.input_pos,
+                        session.xy_pos,
+                        session.kv_cache,
+                        batch_idx,
+                        *args,
+                        **kwds,
+                    )
+
+                decoder.post_forward(idx, session)
+                logits = decoder.ar_predict_layer(xy_dec[:, -1])
+                session.input_pos += 1
+
+                if idx == 0:
+                    logits[:, -1] = -mx.inf
+
+                samples = session.sample(
+                    logits=logits,
+                    previous_tokens=session.y[:, : session.y_len + idx],
+                    top_k=request.top_k,
+                    top_p=request.top_p,
+                    repetition_penalty=request.repetition_penalty,
+                    temperature=request.temperature,
+                )
+
+                session.y[batch_idx, session.y_len + idx] = samples
+
+                argmax_token = mx.argmax(logits, axis=-1)
+                sample_token = samples.squeeze(1)
+                EOS_mask = (cast(Array, argmax_token == decoder.EOS)) | (sample_token == decoder.EOS)
+
+                newly_done_mask = EOS_mask & (~session.completed)
+                newly_done_indices = mx.where(newly_done_mask, batch_idx, -1)
+                pos = mx.where(newly_done_indices != -1, batch_idx, session.bsz)
+                pos_sorted = mx.sort(pos, axis=0)
+                valid_count = session.bsz - mx.sum(cast(Array, pos_sorted == session.bsz))
+                pos_final = pos_sorted[: int(valid_count)]
+                newly_done_indices = mx.expand_dims(newly_done_indices[pos_final], 0)
+
+                if newly_done_indices.size > 0:
+                    for i in newly_done_indices:
+                        session.y_results[int(i)] = session.y[i, session.y_len : session.y_len + idx]
+                    session.completed[newly_done_indices] = True
+
+                if mx.all(session.completed).item():
+                    if session.y[:, session.y_len :].sum() == 0:
+                        session.y_results = [mx.array([0]) for _ in range(session.bsz)]
+                        logger.error("Bad Zero Prediction")
+                    else:
+                        logger.info(
+                            f"T2S Decoding EOS {session.prefill_len.tolist().__str__().strip('[]')} -> {[cast(tuple[int, ...], i.shape)[-1] for i in session.y_results].__str__().strip('[]')}"
+                        )
+                        logger.info(f"Infer Speed: {(idx - 1) / (time.perf_counter() - t1):.2f} token/s")
+                    infer_time = time.perf_counter() - t1
+                    infer_speed = (idx - 1) / infer_time
+                    break
+
+                if (request.early_stop_num != -1 and idx >= request.early_stop_num) or idx == max_token - 1:
+                    for j in range(session.bsz):
+                        if not session.completed[j].item():
+                            session.y_results[j] = session.y[[j], session.y_len : session.y_len + 1499]
+                            session.completed[j] = True
+                    logger.error("Bad Full Prediction")
+                    logger.info(f"Infer Speed: {(idx - 1) / (time.perf_counter() - t1):.2f} token/s")
+                    infer_time = time.perf_counter() - t1
+                    infer_speed = (idx - 1) / infer_time
+                    break
+
+                y_emb = decoder.ar_audio_embedding(samples)
+                session.xy_pos = decoder.ar_audio_position(session.input_pos - session.x_lens, y_emb)
+                mx.eval(session.xy_pos, session.y)
+
+                if idx == 1:
+                    t1 = time.perf_counter()
+
+                if idx % 100 == 0:
+                    mx.clear_cache()
+
+        match session.device:
+            case mx.gpu:
+                mx.clear_cache()
+            case mx.cpu:
+                gc.collect()
+
+        result_mlx = session.y_results[: request.valid_length]
+        mx.eval(result_mlx)
+        result = [torch.tensor(k) for k in result_mlx]
+        return result, infer_speed, infer_time
+
+    def generate(self, request: T2SRequest):
+        try:
+            result, infer_speed, infer_time = self._handle_request(request)
+            t2s_result = T2SResult(result=result, infer_speed=(infer_speed, infer_time), status="Success")
+        except Exception as e:
+            t2s_result = T2SResult(status="Error", exception=e, traceback=traceback.format_exc())
+        return t2s_result
+
+    @staticmethod
+    def replace_key(state_dict: dict[str, Tensor]):
+        state_dict_mlx: list[tuple[str, Array]] = []
+        for key, value in state_dict.items():
+            key = (
+                key.replace("model.", "")
+                .replace("in_proj_", "in_proj.")
+                .replace("self_attn", "attention")
+                .replace("linear", "feed_forward.linear")
+                .replace("norm1", "attention_norm")
+                .replace("norm2", "ffn_norm")
+            )
+            value_mlx = mx.array(value)
+            state_dict_mlx.append((key, value_mlx))
+        return state_dict_mlx
+
+    @staticmethod
+    def load_decoder(weights_path: os.PathLike, max_batch_size: int = 1, backend: str = "MLX-Varlen"):
+        logger.info(f"Loading Text2Semantic Weights from {weights_path} with {backend} Backend")
+        dict_s1 = torch.load(weights_path, map_location="cpu", weights_only=False, mmap=True)
+        config = dict_s1["config"]
+        match backend:
+            case "MLX-Varlen":
+                decoder_cls: type[T2SDecoderABC] = mlx_varlen.T2SDecoder
+            case "MLX-Static":
+                decoder_cls = mlx_static.T2SDecoder
+            case "MLX-Quantized-Affine" | "MLX-Quantized-MXFP4":
+                decoder_cls = mlx_quantized.T2SDecoder
+            case _:
+                raise RuntimeError(f"Backend {backend} Not Found")
+
+        decoder: T2SDecoderABC = decoder_cls(config, max_batch_size=max_batch_size)
+        state_dict = dict_s1["weight"]
+        state_dict_mlx = T2SEngine.replace_key(state_dict)
+        decoder.load_weights(state_dict_mlx)
+        decoder.eval()
+        mx.eval(decoder)
+
+        if "Quantized" in backend and isinstance(decoder, mlx_quantized.T2SDecoder):
+            if backend == "MLX-Quantized-Affine":
+                decoder.set_mode("affine")
+            elif backend == "MLX-Quantized-MXFP4":
+                decoder.set_mode("mxfp4")
+            else:
+                raise RuntimeError(f"Quantized Backend {backend} Not Supported")
+            decoder.quantized()
+            mx.eval(decoder)
+
+        return decoder
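End to end, the MLX engine is built from a GPT checkpoint and then driven with a T2SRequest. A rough sketch under assumptions: the checkpoint path is a placeholder, the import path assumes the repo root is on sys.path, and the request construction is elided because T2SRequest lives in GPT_SoVITS/Accelerate/PyTorch/structs.py, outside this diff:

import mlx.core as mx

from GPT_SoVITS.Accelerate.MLX import T2SEngineMLX  # assumed import path

decoder = T2SEngineMLX.load_decoder(
    "GPT_weights/example-e15.ckpt",  # placeholder checkpoint path
    max_batch_size=1,
    backend="MLX-Varlen",            # or MLX-Static / MLX-Quantized-Affine / MLX-Quantized-MXFP4
)
engine = T2SEngineMLX(decoder, device="mx.gpu", dtype=mx.float16)

# result = engine.generate(t2s_request)  # t2s_request: a T2SRequest built from phonemes/BERT features
# semantic_tokens = result.result        # list[torch.Tensor] when result.status == "Success"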
GPT_SoVITS/Accelerate/MLX/t2s_model_abc.py ADDED
@@ -0,0 +1,530 @@
+from __future__ import annotations
+
+import math
+from abc import ABC, abstractmethod
+from typing import MutableSequence, cast
+
+import mlx.core as mx
+import mlx.nn as nn
+
+from .structs_mlx import KVCache, KVCacheProtocol, KVCacheQ, T2SDecoderProtocol, T2SSessionMLX
+
+Array = mx.array
+
+
+class TokenEmbedding(nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        vocab_size: int,
+    ):
+        super().__init__()
+
+        self.vocab_size = vocab_size
+        self.embedding_dim = embedding_dim
+
+        self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
+
+    @property
+    def weight(self):
+        return self.word_embeddings.weight
+
+    def embedding(self, index: int):
+        return self.word_embeddings.weight[index : index + 1]
+
+    def __call__(self, x: Array):
+        x = self.word_embeddings(x)
+        return x
+
+
+class SinePositionalEmbedding(nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        scale: bool = False,
+        max_batch_size: int = 10,
+        max_seq_len: int = 2000,
+    ):
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
+        self.alpha = mx.ones(1)
+        self.max_batch_size = max_batch_size
+        self.max_seq_len = max_seq_len
+
+        self.reverse = False
+        self._pe = mx.zeros((max_batch_size, max_seq_len, embedding_dim))
+        self.compute_pe()
+
+    def compute_pe(self):
+        """Reset the positional encodings."""
+
+        if self.reverse:
+            position = mx.expand_dims(mx.arange(self.max_seq_len - 1, -1, -1.0), axis=1)
+        else:
+            position = mx.expand_dims(mx.arange(self.max_seq_len), axis=1)
+        div_term = mx.exp(
+            mx.arange(
+                0,
+                self.embedding_dim,
+                2,
+            )
+            * -(math.log(10000.0) / self.embedding_dim)
+        )
+        pe = self._pe
+        pe[:, :, 0::2] = mx.sin(position * div_term)
+        pe[:, :, 1::2] = mx.cos(position * div_term)
+
+    def __call__(self, input_pos: Array, x: Array):
+        """
+        Args:
+            input_pos (Array): [batch_size, ]
+            x (Array): [batch_size, 1, embed_dim]
+
+        Returns:
+            embedded_x (Array): [batch_size, 1, embed_dim]
+        """
+
+        batch_size = cast(tuple[int, ...], x.shape)[0]
+        pe_values = self._pe[mx.arange(batch_size), input_pos - 1]  # (batch_size, embed_dim)
+
+        return x * self.x_scale + self.alpha * mx.expand_dims(pe_values, 1)  # (batch_size, 1, embed_dim)
+
+    def prefill(self, x: Array):
+        """
+        Args:
+            x (Array): [batch_size, seq_len, embed_dim]
+
+        Returns:
+            embedded_x (Array): [batch_size, seq_len, embed_dim]
+        """
+        pe_values = self._pe[:, : cast(tuple[int, ...], x.shape)[-2]]
+        return x * self.x_scale + self.alpha * pe_values
+
+
+class KVCacheHND(KVCacheProtocol):
+    @staticmethod
+    def empty(kv_cache):
+        assert len(kv_cache) == 2
+        k_cache, v_cache = kv_cache
+
+        k_cache[:] = 0
+        v_cache[:] = 0
+
+    @staticmethod
+    def update_cache(input_pos, k_val, v_val, kv_cache, cache_idx):
+        # input_pos: [B, ], k_val: [B, H, 1, D]
+        assert len(kv_cache) == 2
+        k_out, v_out = kv_cache
+        ip0 = input_pos - 1
+
+        k_out[cache_idx, :, ip0, None] = k_val
+        v_out[cache_idx, :, ip0, None] = v_val
+
+        return k_out, v_out
+
+    @staticmethod
+    def prefill_kv(k_val, v_val, kv_cache):
+        # k_val: [B, S, H, D]
+        assert len(kv_cache) == 2
+        k_cache, v_cache = kv_cache
+
+        k_cache[..., : cast(tuple[int, ...], k_val.shape)[1], :] = k_val.swapaxes(1, 2)
+        v_cache[..., : cast(tuple[int, ...], v_val.shape)[1], :] = v_val.swapaxes(1, 2)
+
+    @staticmethod
+    def init_cache(batch_size: int, max_seq_length: int, n_heads: int, head_dim: int, dtype: mx.Dtype) -> KVCache:
+        cache_shape = (batch_size, n_heads, max_seq_length, head_dim)
+
+        return (mx.zeros(cache_shape, dtype=dtype), mx.zeros(cache_shape, dtype=dtype))
+
+
+class KVCacheHNDQuantized(KVCacheProtocol):
+    @staticmethod
+    def _el_per_int(bits: int) -> int:
+        return 32 // bits
+
+    @staticmethod
+    def _packed_dim(head_dim: int, bits: int = 8) -> int:
+        el_per_int = KVCacheHNDQuantized._el_per_int(bits)
+        if head_dim % el_per_int != 0:
+            raise ValueError(f"{head_dim=} is not divisible by {el_per_int=} ({bits=})")
+        return head_dim // el_per_int
+
+    @staticmethod
+    def _group_count(head_dim: int, group_size: int = 32) -> int:
+        assert group_size in {32, 64, 128}
+        if head_dim % group_size != 0:
+            raise ValueError(f"{head_dim} is not divisible by {group_size=}")
+        return head_dim // group_size
+
+    @staticmethod
+    def empty(kv_cache) -> None:
+        assert len(kv_cache) == 3
+        (k_q, k_s, k_b), (v_q, v_s, v_b), (_, __) = kv_cache
+
+        k_q[:] = 0
+        k_s[:] = 0
+        k_b[:] = 0
+        v_q[:] = 0
+        v_s[:] = 0
+        v_b[:] = 0
+
+    @staticmethod
+    def update_cache(
+        input_pos,
+        k_val,
+        v_val,
+        kv_cache,
+        cache_idx,
+    ):
+        # input_pos: [B, ], k_val: [B, H, 1, D]
+
+        assert len(kv_cache) == 3
+        (k_q_out, k_s_out, k_b_out), (v_q_out, v_s_out, v_b_out), (group_size, bits) = kv_cache
+
+        k_q, k_s, k_b = mx.quantize(k_val, group_size=group_size, bits=bits)
+        v_q, v_s, v_b = mx.quantize(v_val, group_size=group_size, bits=bits)
+
+        ip0 = input_pos - 1
+
+        k_q_out[cache_idx, :, ip0, None] = k_q
+        k_s_out[cache_idx, :, ip0, None] = k_s
+        k_b_out[cache_idx, :, ip0, None] = k_b
+
+        v_q_out[cache_idx, :, ip0, None] = v_q
+        v_s_out[cache_idx, :, ip0, None] = v_s
+        v_b_out[cache_idx, :, ip0, None] = v_b
+
+        return (k_q_out, k_s_out, k_b_out), (v_q_out, v_s_out, v_b_out), (group_size, bits)
+
+    @staticmethod
+    def prefill_kv(
+        k_val,
+        v_val,
+        kv_cache,
+    ) -> None:
+        assert len(kv_cache) == 3
+        (k_q_out, k_s_out, k_b_out), (v_q_out, v_s_out, v_b_out), (group_size, bits) = kv_cache
+
+        S = cast(tuple[int, ...], k_val.shape)[1]
+
+        k_sw = k_val.swapaxes(1, 2)
+        v_sw = v_val.swapaxes(1, 2)
+
+        k_q, k_s, k_b = mx.quantize(k_sw, group_size=group_size, bits=bits)
+        v_q, v_s, v_b = mx.quantize(v_sw, group_size=group_size, bits=bits)
+
+        k_q_out[..., :S, :] = k_q
+        k_s_out[..., :S, :] = k_s
+        k_b_out[..., :S, :] = k_b
+
+        v_q_out[..., :S, :] = v_q
+        v_s_out[..., :S, :] = v_s
+        v_b_out[..., :S, :] = v_b
+
+    @staticmethod
+    def init_cache(
+        batch_size: int,
+        max_seq_length: int,
+        n_heads: int,
+        head_dim: int,
+        dtype: mx.Dtype,
+        *,
+        group_size: int = 32,
+        bits: int = 8,
+    ) -> KVCacheQ:
+        packed_dim = KVCacheHNDQuantized._packed_dim(head_dim, bits=bits)
+        group_cnt = KVCacheHNDQuantized._group_count(head_dim, group_size=group_size)
+
+        packed_shape = (batch_size, n_heads, max_seq_length, packed_dim)
+        group_shape = (batch_size, n_heads, max_seq_length, group_cnt)
+
+        k_q = mx.zeros(packed_shape, dtype=mx.uint32)
+        k_s = mx.zeros(group_shape, dtype=dtype)
+        k_b = mx.zeros(group_shape, dtype=dtype)
+
+        v_q = mx.zeros(packed_shape, dtype=mx.uint32)
+        v_s = mx.zeros(group_shape, dtype=dtype)
+        v_b = mx.zeros(group_shape, dtype=dtype)
+
+        return (k_q, k_s, k_b), (v_q, v_s, v_b), (group_size, bits)
+
+
+class AttentionABC(ABC, nn.Module):
+    def __init__(self, n_head: int, hidden_dim: int, max_seq_length: int, *args, **kwds):
+        super().__init__()
+
+        self.n_head = n_head
+        self.hidden_dim = hidden_dim
+        assert hidden_dim % n_head == 0
+        self.head_dim = hidden_dim // n_head
+
+        self.max_seq_length = max_seq_length
+
+        # key, query, value projections for all heads, but in a batch
+        self.in_proj = nn.Linear(hidden_dim, hidden_dim * 3, bias=True)
+        self.out_proj = nn.Linear(hidden_dim, hidden_dim, bias=True)
+
+        self.scale = 1 / math.sqrt(self.head_dim)
+
+        self.kc_class: KVCacheProtocol
+
+    @abstractmethod
+    def __call__(
+        self, x: Array, input_pos: Array, kv_cache: KVCache | KVCacheQ, cache_idx: Array, attn_mask: Array
+    ) -> Array: ...
+
+    def prefill(self, x: Array, kv_cache: KVCache | KVCacheQ, attn_mask: Array):
+        bsz, seqlen, _ = cast(tuple[int, ...], x.shape)
+
+        q, k, v = self.in_proj(x).split(3, axis=-1)
+
+        q, k, v = map(lambda x: x.reshape(bsz, seqlen, self.n_head, self.head_dim), (q, k, v))
+
+        self.kc_class.prefill_kv(k, v, kv_cache)
+
+        q, k, v = map(lambda x: x.swapaxes(1, 2), (q, k, v))
+
+        attn = mx.fast.scaled_dot_product_attention(q, k, v, mask=attn_mask, scale=self.scale)
+
+        attn = mx.nan_to_num(attn)
+
+        attn = attn.swapaxes(1, 2).reshape(1, -1, self.hidden_dim)
+
+        output = self.out_proj(attn)
+
+        return output
+
+
+class FeedForward(nn.Module):
+    def __init__(self, dim: int, hidden_dim: int) -> None:
+        super().__init__()
+
+        self.linear1 = nn.Linear(dim, hidden_dim, bias=True)
+        self.linear2 = nn.Linear(hidden_dim, dim, bias=True)
+
+    def __call__(self, x: Array):
+        return self.linear2(nn.relu(self.linear1(x)))
+
+
+class TransformerBlockABC(nn.Module):
+    def __init__(self, n_head: int, ffn_dim: int, hidden_dim: int, max_seq_length: int, *args, **kwds) -> None:
+        super().__init__()
+
+        self.hidden_dim = hidden_dim
+        self.max_seq_length = max_seq_length
+
+        self.attention: AttentionABC
+
+        self.feed_forward = FeedForward(hidden_dim, ffn_dim)
+        self.attention_norm = nn.LayerNorm(self.hidden_dim)
+        self.ffn_norm = nn.LayerNorm(self.hidden_dim)
+
+    def __call__(self, x: Array, input_pos: Array, kv_cache: KVCache | KVCacheQ, cache_idx: Array, attn_mask: Array):
+        h = self.attention_norm(
+            x
+            + self.attention(
+                x,
+                input_pos,
+                kv_cache,
+                cache_idx,
+                attn_mask,
+            )
+        )
+        out = self.ffn_norm(h + self.feed_forward(h))
+        return out
+
+    def prefill(self, x: Array, attn_mask: Array, kv_cache: KVCache | KVCacheQ):
+        h = self.attention_norm(
+            x
+            + self.attention.prefill(
+                x,
+                kv_cache,
+                attn_mask,
+            )
+        )
+        out = self.ffn_norm(h + self.feed_forward(h))
+
+        return out
+
+
+class TransformerDecoderABC(nn.Module):
+    def __init__(
+        self,
+        hidden_dim: int,
+        n_layer: int,
+        n_head: int,
+        ffn_dim: int,
+        vocab_size: int,
+        max_seq_length: int,
+        max_batch_size: int,
+        *args,
+        **kwds,
+    ) -> None:
+        super().__init__()
+
+        self.hidden_dim = hidden_dim
+        self.n_head = n_head
+        assert hidden_dim % n_head == 0
+
+        self.head_dim = hidden_dim // n_head
+        self.vocab_size = vocab_size
+
+        self.n_layer = n_layer
+
+        self.layers: MutableSequence[TransformerBlockABC]
+
+        self.max_seq_length = max_seq_length
+        self.max_batch_size = max_batch_size
+
+    def __call__(
+        self,
+        input_pos: Array,
+        x: Array,
+        kv_caches: MutableSequence[KVCache | KVCacheQ],
+        cache_idx: Array,
+        *args,
+        **kwds,
+    ):
+        for layer, kv_cache in zip(self.layers, kv_caches):
+            x = layer(
+                x,
+                input_pos,
+                kv_cache,
+                cache_idx,
+                *args,
+                **kwds,
+            )
+
+        return x
+
+    def prefill(self, x: Array, mask: Array, kv_caches: MutableSequence[KVCache | KVCacheQ]):
+        for layer, kv_cache in zip(self.layers, kv_caches):
+            x = layer.prefill(
+                x,
+                mask,
+                kv_cache,
+            )
+        return x
+
+
+class T2SDecoderABC(nn.Module, T2SDecoderProtocol):
+    def __init__(
+        self,
+        config: dict,
+        max_seq_length: int = 2000,
+        max_batch_size: int = 10,
+    ) -> None:
+        super().__init__()
+
+        hidden_dim: int = config["model"]["hidden_dim"]
+        embedding_dim: int = config["model"]["embedding_dim"]
+        n_head: int = config["model"]["head"]
+        n_layer: int = config["model"]["n_layer"]
+        vocab_size: int = config["model"]["vocab_size"]
+        phoneme_vocab_size: int = config["model"]["phoneme_vocab_size"]
+        EOS: int = config["model"]["EOS"]
+        ffn_dim: int = hidden_dim * 4
+
+        self.n_layer = int(n_layer)
+        self.hidden_dim = int(hidden_dim)
+        self.n_head = int(n_head)
+        assert hidden_dim % n_head == 0
+
+        self.head_dim = int(hidden_dim // n_head)
+        self.embedding_dim = int(embedding_dim)
+        self.ffn_dim = int(ffn_dim)
+        self.vocab_size = int(vocab_size)
+        self.phoneme_vocab_size = int(phoneme_vocab_size)
+        self.max_seq_length = max_seq_length
+        self.max_batch_size = max_batch_size
+        self.EOS = EOS
+        assert self.EOS == self.vocab_size - 1
+
+        self.bert_proj = nn.Linear(1024, self.embedding_dim)
+        self.ar_predict_layer = nn.Linear(self.hidden_dim, self.vocab_size, bias=False)
+        self.h: TransformerDecoderABC
+
+        self.ar_text_embedding = TokenEmbedding(self.embedding_dim, self.phoneme_vocab_size)
+        self.ar_text_position = SinePositionalEmbedding(
+            self.embedding_dim,
+            scale=False,
+            max_batch_size=max_batch_size,
+            max_seq_len=max_seq_length,
+        )
+        self.ar_audio_embedding = TokenEmbedding(self.embedding_dim, self.vocab_size)
+        self.ar_audio_position = SinePositionalEmbedding(
+            self.embedding_dim,
+            scale=False,
+            max_batch_size=max_batch_size,
+            max_seq_len=max_seq_length,
+        )
+
+        self.kv_class: KVCacheProtocol
+
+    def init_cache(self, bsz: int = 0, *args, **kwds) -> MutableSequence[KVCache | KVCacheQ]:
+        bsz = bsz or self.h.max_batch_size
+        assert bsz <= self.h.max_batch_size
+        seq_lens = self.h.max_seq_length
+        dtype = self.bert_proj.bias.dtype
+        cache: MutableSequence[KVCache | KVCacheQ] = [
+            self.kv_class.init_cache(bsz, seq_lens, self.n_head, self.head_dim, dtype, *args, **kwds)
+            for _ in range(self.n_layer)
+        ]
+        mx.eval(cache)
+        return cache
+
+    def embed(
+        self,
+        x: list[Array],
+        y: Array,
+        bert_features: list[Array],
+    ):
+        x_len: list[int] = [cast(tuple[int, ...], i.shape)[0] for i in x]
+        x_len_max = max(x_len)
+        xy_pos = mx.zeros((len(x), x_len_max + cast(tuple[int, ...], y.shape)[1], self.embedding_dim)).astype(
+            bert_features[0].dtype
+        )
+
+        bert_features = list(map(lambda x: x.swapaxes(0, 1), bert_features))
+
+        y_len = cast(tuple[int, ...], y.shape)[1]
+        y_emb = self.ar_audio_embedding(y)
+        y_pos = self.ar_audio_position.prefill(y_emb)
+
+        for bs, (x_, len_, bert_feature) in enumerate(zip(x, x_len, bert_features)):
+            x_emb = self.ar_text_embedding(x_)
+            bert = self.bert_proj(bert_feature)
+            x_emb = x_emb + bert
+            x_pos = self.ar_text_position.prefill(mx.expand_dims(x_emb, 0))
+            xy_pos[[bs], :len_] = x_pos
+            xy_pos[[bs], len_ : len_ + y_len] = y_pos
+
+        mx.eval(xy_pos)
+        return xy_pos
+
+    def compile(self):
+        setattr(self.h, "__call__", mx.compile(self.h.__call__))
+        # setattr(self.h, "prefill", mx.compile(self.h.prefill, shapeless=True))
+
+    def pre_forward(self, session: T2SSessionMLX):
+        attn_mask = session.attn_mask
+        return list(), dict(attn_mask=attn_mask)
+
+    def post_forward(self, idx: int, session: T2SSessionMLX) -> None:
+        if idx == 0:
+            prefill_len = session.prefill_len
+            bsz = session.bsz
+
+            range_tensor = mx.arange(self.max_seq_length).reshape(1, 1, 1, self.max_seq_length)
+            prefill_len_expanded = prefill_len.reshape(bsz, 1, 1, 1)
+            attn_mask = range_tensor < prefill_len_expanded
+            attn_mask = mx.repeat(attn_mask, self.n_head, 1)
+
+            session.attn_mask = attn_mask
+
+        attn_mask = session.attn_mask
+        input_pos = session.input_pos
+        attn_mask[mx.arange(session.bsz), :, :, input_pos] = True
+        mx.eval(attn_mask)
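KVCacheHND stores keys and values in (batch, head, seq, head_dim) layout, while KVCacheHNDQuantized keeps them as packed uint32 words plus per-group scales and biases. A small sketch of the resulting shapes, using illustrative sizes rather than a real model config (import path assumed):

import mlx.core as mx

from GPT_SoVITS.Accelerate.MLX.t2s_model_abc import KVCacheHND, KVCacheHNDQuantized  # assumed path

B, S, H, D = 2, 2000, 16, 32                       # illustrative batch / seq / heads / head_dim
k_cache, v_cache = KVCacheHND.init_cache(B, S, H, D, mx.float16)
print(k_cache.shape)                               # (2, 16, 2000, 32)

(k_q, k_s, k_b), (v_q, v_s, v_b), (group_size, bits) = KVCacheHNDQuantized.init_cache(
    B, S, H, D, mx.float16, group_size=32, bits=8
)
print(k_q.shape, k_s.shape)                        # (2, 16, 2000, 8) packed words, (2, 16, 2000, 1) group scales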
GPT_SoVITS/Accelerate/PyTorch/__init__.py ADDED
@@ -0,0 +1,30 @@
+import importlib.util
+
+import torch
+
+from .sample_funcs import sample_naive
+from .structs import T2SRequest, T2SResult
+from .t2s_engine import T2SEngine as T2SEngineTorch
+
+torch.set_grad_enabled(False)
+
+backends = ["torch_varlen"]
+if torch.cuda.is_available():
+    backends.append("torch_static_cuda_graph")
+    # if importlib.util.find_spec("sageattention") is not None:
+    #     for i in range(torch.cuda.device_count()):
+    #         major, minor = torch.cuda.get_device_capability(i)
+    #         sm_version = major + minor / 10.0
+    #         if sm_version >= 7.0:
+    #             backends.append("sage_attn_varlen_cuda_graph")
+    if importlib.util.find_spec("flash_attn") is not None:
+        for i in range(torch.cuda.device_count()):
+            major, minor = torch.cuda.get_device_capability(i)
+            sm_version = major + minor / 10.0
+            if sm_version >= 7.5:
+                backends.append("flash_attn_varlen_cuda_graph")
+# if torch.mps.is_available():
+#     backends.append("mps_flash_attn_varlen")
+
+
+__all__ = ["T2SEngineTorch", "T2SRequest", "sample_naive", "T2SResult", "backends"]
GPT_SoVITS/Accelerate/PyTorch/backends/flash_attn_varlen_cuda_graph.py ADDED
@@ -0,0 +1,158 @@
1
+ """
2
+ Modified From https://github.com/XXXXRT666/GPT-SoVITS
3
+ """
4
+
5
+ from typing import Dict, List, Tuple
6
+
7
+ import kernels
8
+ import torch
9
+
10
+ from .. import nn
11
+ from ..structs import T2SSession
12
+ from ..t2s_model_abc import (
13
+ AttentionABC,
14
+ CUDAGraphCacheABC,
15
+ FeedForward,
16
+ KVCacheNHD,
17
+ KVCacheProtocol,
18
+ T2SDecoderABC,
19
+ TransformerBlockABC,
20
+ TransformerDecoderABC,
21
+ )
22
+
23
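+ # Resolve a flash-attention kernel: prefer flash_attn_interface, then the flash_attn package, and finally a prebuilt kernel fetched through the kernels hub.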
+ flash_attn_kernel = None
24
+ try:
25
+ import flash_attn_interface as flash_attn # type: ignore
26
+
27
+ flash_attn_kernel = flash_attn.flash_attn_with_kvcache
28
+ except ModuleNotFoundError:
29
+ try:
30
+ import flash_attn # type: ignore
31
+
32
+ flash_attn_kernel = flash_attn.flash_attn_with_kvcache
33
+
34
+ except ModuleNotFoundError:
35
+ pass
36
+
37
+ if flash_attn_kernel is None:
38
+ flash_attn_kernel = kernels.get_kernel("kernels-community/flash-attn").flash_attn_with_kvcache
39
+
40
+
41
+ Tensor = torch.Tensor
42
+
43
+
44
+ class Attention(AttentionABC):
45
+ def __init__(self, n_head, hidden_dim, max_seq_length):
46
+ super().__init__(n_head, hidden_dim, max_seq_length)
47
+
48
+ self.in_proj = nn.Linear(hidden_dim, hidden_dim * 3, bias=True)
49
+ self.out_proj = nn.Linear(hidden_dim, hidden_dim, bias=True)
50
+
51
+ def __call__(self, x: Tensor, input_pos: Tensor, kv_cache: KVCacheProtocol, *args, **kwds) -> Tensor:
52
+ bsz, seqlen, _ = x.shape
53
+
54
+ q, k, v = self.in_proj(x).chunk(3, dim=-1)
55
+
56
+ q = q.view(bsz, seqlen, self.n_head, self.head_dim)
57
+ k = k.view(bsz, seqlen, self.n_head, self.head_dim)
58
+ v = v.view(bsz, seqlen, self.n_head, self.head_dim)
59
+
60
+ attn: Tensor = flash_attn_kernel( # type: ignore
61
+ q, kv_cache.k_cache, kv_cache.v_cache, k, v, cache_seqlens=input_pos - 1
62
+ )
63
+
64
+ attn = attn.view(bsz, seqlen, self.hidden_dim)
65
+
66
+ attn = self.out_proj(attn)
67
+
68
+ return attn
69
+
70
+
71
+ class TransformerBlock(TransformerBlockABC):
72
+ def __init__(self, n_head, ffn_dim, hidden_dim, max_seq_length) -> None:
73
+ super().__init__(n_head, ffn_dim, hidden_dim, max_seq_length)
74
+
75
+ self.attention = Attention(n_head, hidden_dim, max_seq_length)
76
+ self.feed_forward = FeedForward(hidden_dim, ffn_dim)
77
+ self.attention_norm = nn.LayerNorm([self.hidden_dim])
78
+ self.ffn_norm = nn.LayerNorm([self.hidden_dim])
79
+
80
+
81
+ class TransformerDecoder(TransformerDecoderABC):
82
+ def __init__(
83
+ self,
84
+ hidden_dim,
85
+ n_layer,
86
+ n_head,
87
+ ffn_dim,
88
+ vocab_size,
89
+ max_seq_length,
90
+ max_batch_size,
91
+ ) -> None:
92
+ super().__init__(hidden_dim, n_layer, n_head, ffn_dim, vocab_size, max_seq_length, max_batch_size)
93
+
94
+ self.layers = nn.ModuleList( # type: ignore
95
+ TransformerBlock(n_head, ffn_dim, hidden_dim, max_seq_length) for _ in range(n_layer)
96
+ )
97
+
98
+
99
+ class T2SDecoder(T2SDecoderABC):
100
+ def __init__(
101
+ self,
102
+ config,
103
+ max_seq_length=2000,
104
+ max_batch_size=10,
105
+ ) -> None:
106
+ assert torch.cuda.is_available()
107
+ super().__init__(config, max_seq_length, max_batch_size)
108
+
109
+ self.bert_proj = nn.Linear(1024, self.embedding_dim)
110
+ self.ar_predict_layer = nn.Linear(self.hidden_dim, self.vocab_size, bias=False)
111
+ self.h: TransformerDecoderABC = TransformerDecoder(
112
+ self.hidden_dim, self.n_layer, self.n_head, self.ffn_dim, self.vocab_size, max_seq_length, max_batch_size
113
+ )
114
+
115
+ self.kv_class = KVCacheNHD
116
+
117
+ def post_forward(self, idx: int, session: T2SSession) -> None:
118
+ return super().post_forward(idx, session)
119
+
120
+ def pre_forward(self, session: T2SSession) -> Tuple[List, Dict]:
121
+ return super().pre_forward(session)
122
+
123
+
124
+ class CUDAGraphCache(CUDAGraphCacheABC):
125
+ def __init__(
126
+ self,
127
+ decoder: T2SDecoder,
128
+ ) -> None:
129
+ self.is_applicable = True
130
+ super().__init__(decoder)
131
+
132
+ def release_graph(self, session: T2SSession):
133
+ if session.id == self.id:
134
+ self.assigned = False
135
+ else:
136
+ del session.graph, session.xy_pos_, session.xy_dec_, session.input_pos, session.kv_cache
137
+
138
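+ # Hand the pre-captured CUDA graph and its static buffers to this session; the KV caches are synced into the graph-owned storage so replay sees the session's state.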
+ def get_cache_graph(self, session: T2SSession):
139
+ assert self.graph
140
+ session.graph = self.graph
141
+ session.stream = self.stream
142
+
143
+ session.xy_pos_ = self.xy_pos
144
+ session.xy_dec_ = self.xy_dec
145
+ session.input_pos = self.input_pos.copy_(session.input_pos)
146
+
147
+ for cache, cache_ in zip(self.kv_cache, session.kv_cache):
148
+ cache.sync_cache(cache_)
149
+
150
+ def capture_new_graph(self, session: T2SSession):
151
+ session.xy_pos_ = self.xy_pos.clone()
152
+ session.xy_dec_ = self.xy_dec.clone()
153
+ session.input_pos = self.input_pos.clone().copy_(session.input_pos)
154
+
155
+ args, kwds = self.decoder.pre_forward(session)
156
+ graph = self.decoder.capture(self.input_pos, self.xy_pos, self.xy_dec, self.kv_cache, *args, **kwds)
157
+ session.graph = graph
158
+ session.stream = torch.cuda.Stream() # type: ignore
GPT_SoVITS/Accelerate/PyTorch/backends/mps_flash_attn_varlen.py ADDED
@@ -0,0 +1,166 @@
1
+ import torch
2
+ from torch.nn import functional as F
3
+
4
+ from .. import nn
5
+ from ..structs import KVCacheProtocol, T2SSession
6
+ from ..t2s_model_abc import (
7
+ AttentionABC,
8
+ CUDAGraphCacheABC,
9
+ FeedForward,
10
+ KVCacheHND,
11
+ T2SDecoderABC,
12
+ TransformerBlockABC,
13
+ TransformerDecoderABC,
14
+ )
15
+
16
+ Tensor = torch.Tensor
17
+
18
+
19
+ class Attention(AttentionABC):
20
+ def __init__(self, n_head, hidden_dim, max_seq_length):
21
+ super().__init__(n_head, hidden_dim, max_seq_length)
22
+
23
+ # key, query, value projections for all heads, but in a batch
24
+ self.in_proj = nn.Linear(hidden_dim, hidden_dim * 3, bias=True)
25
+ self.out_proj = nn.Linear(hidden_dim, hidden_dim, bias=True)
26
+
27
+ def __call__(self, x: Tensor, input_pos: Tensor, kv_cache: KVCacheProtocol, attn_mask: Tensor):
28
+ bsz, seqlen, _ = x.shape
29
+
30
+ q, k, v = self.in_proj(x).chunk(3, dim=-1)
31
+
32
+ q = q.view(bsz, seqlen, self.n_head, self.head_dim)
33
+ k = k.view(bsz, seqlen, self.n_head, self.head_dim)
34
+ v = v.view(bsz, seqlen, self.n_head, self.head_dim)
35
+
36
+ q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
37
+
38
+ k, v = kv_cache.update(input_pos, k, v)
39
+
40
+ attn = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
41
+
42
+ attn = attn.transpose(1, 2).contiguous().view(bsz, seqlen, self.hidden_dim)
43
+
44
+ attn = self.out_proj(attn)
45
+
46
+ return attn
47
+
48
+
49
+ class TransformerBlock(TransformerBlockABC):
50
+ def __init__(self, n_head: int, ffn_dim: int, hidden_dim: int, max_seq_length: int) -> None:
51
+ super().__init__(n_head, ffn_dim, hidden_dim, max_seq_length)
52
+
53
+ self.attention = Attention(n_head, hidden_dim, max_seq_length)
54
+ self.feed_forward = FeedForward(hidden_dim, ffn_dim)
55
+ self.attention_norm = nn.LayerNorm([self.hidden_dim])
56
+ self.ffn_norm = nn.LayerNorm([self.hidden_dim])
57
+
58
+
59
+ class TransformerDecoder(TransformerDecoderABC):
60
+ def __init__(
61
+ self,
62
+ hidden_dim,
63
+ n_layer,
64
+ n_head,
65
+ ffn_dim,
66
+ vocab_size,
67
+ max_seq_length,
68
+ max_batch_size,
69
+ ) -> None:
70
+ super().__init__(hidden_dim, n_layer, n_head, ffn_dim, vocab_size, max_seq_length, max_batch_size)
71
+
72
+ self.layers = nn.ModuleList( # type: ignore
73
+ TransformerBlock(n_head, ffn_dim, hidden_dim, max_seq_length) for _ in range(n_layer)
74
+ )
75
+
76
+
77
+ class T2SDecoder(T2SDecoderABC):
78
+ def __init__(
79
+ self,
80
+ config,
81
+ max_seq_length=2000,
82
+ max_batch_size=10,
83
+ ) -> None:
84
+ super().__init__(config, max_seq_length, max_batch_size)
85
+
86
+ self.bert_proj = nn.Linear(1024, self.embedding_dim)
87
+ self.ar_predict_layer = nn.Linear(self.hidden_dim, self.vocab_size, bias=False)
88
+ self.h: TransformerDecoderABC = TransformerDecoder(
89
+ self.hidden_dim, self.n_layer, self.n_head, self.ffn_dim, self.vocab_size, max_seq_length, max_batch_size
90
+ )
91
+
92
+ self.kv_class = KVCacheHND
93
+
94
+ def pre_forward(self, session: T2SSession):
95
+ attn_mask = session.attn_mask
96
+ return list(), dict(attn_mask=attn_mask)
97
+
98
+ def post_forward(self, idx: int, session: T2SSession) -> None:
99
+ if idx == 0:
100
+ prefill_len = session.prefill_len
101
+ bsz = session.bsz
102
+
103
+ range_tensor = torch.arange(self.max_seq_length).view(1, 1, 1, self.max_seq_length)
104
+ prefill_len_expanded = prefill_len.view(bsz, 1, 1, 1)
105
+ attn_mask = range_tensor < prefill_len_expanded
106
+ attn_mask = attn_mask.expand(-1, self.n_head, -1, -1)
107
+
108
+ session.attn_mask = attn_mask
109
+
110
+ attn_mask = session.attn_mask
111
+ input_pos = session.input_pos
112
+ attn_mask[torch.arange(session.bsz), :, :, input_pos] = True
113
+
114
+
115
+ class CUDAGraphCache(CUDAGraphCacheABC):
116
+ def __init__(
117
+ self,
118
+ decoder,
119
+ ) -> None:
120
+ self.is_applicable = False
121
+ super().__init__(decoder)
122
+ if torch.cuda.is_available():
123
+ self.attn_mask = (
124
+ torch.randint(0, 2, (decoder.max_batch_size, decoder.n_head, 1, decoder.max_seq_length))
125
+ .bool()
126
+ .to(self.device, self.dtype)
127
+ )
128
+
129
+ def release_graph(self, session: T2SSession):
130
+ if session.id == self.id:
131
+ self.assigned = False
132
+ else:
133
+ del (
134
+ session.graph,
135
+ session.xy_pos_,
136
+ session.xy_dec_,
137
+ session.input_pos,
138
+ session.kv_cache,
139
+ session.attn_mask,
140
+ )
141
+
142
+ def get_cache_graph(self, session: T2SSession):
143
+ assert self.graph
144
+ session.graph = self.graph
145
+ session.stream = self.stream
146
+
147
+ session.xy_pos_ = self.xy_pos
148
+ session.xy_dec_ = self.xy_dec
149
+ session.input_pos = self.input_pos.copy_(session.input_pos)
150
+
151
+ session.attn_mask = self.attn_mask
152
+
153
+ for cache, cache_ in zip(self.kv_cache, session.kv_cache):
154
+ cache.sync_cache(cache_)
155
+
156
+ def capture_new_graph(self, session: T2SSession):
157
+ session.xy_pos_ = self.xy_pos.clone()
158
+ session.xy_dec_ = self.xy_dec.clone()
159
+ session.input_pos = self.input_pos.clone().copy_(session.input_pos)
160
+
161
+ session.attn_mask = self.attn_mask.clone().copy_(session.attn_mask)
162
+
163
+ args, kwds = self.decoder.pre_forward(session)
164
+ graph = self.decoder.capture(self.input_pos, self.xy_pos, self.xy_dec, self.kv_cache, *args, **kwds)
165
+ session.graph = graph
166
+ session.stream = torch.cuda.Stream() # type: ignore
GPT_SoVITS/Accelerate/PyTorch/backends/sage_attn_varlen_cuda_graph.py ADDED
@@ -0,0 +1,175 @@
1
+ import sageattention # type: ignore
2
+ import torch
3
+
4
+ from .. import nn
5
+ from ..structs import T2SSession
6
+ from ..t2s_model_abc import (
7
+ AttentionABC,
8
+ CUDAGraphCacheABC,
9
+ FeedForward,
10
+ KVCacheHND,
11
+ KVCacheProtocol,
12
+ T2SDecoderABC,
13
+ TransformerBlockABC,
14
+ TransformerDecoderABC,
15
+ )
16
+
17
+ Tensor = torch.Tensor
18
+
19
+
20
+ class Attention(AttentionABC):
21
+ def __init__(self, n_head, hidden_dim, max_seq_length):
22
+ super().__init__(n_head, hidden_dim, max_seq_length)
23
+
24
+ # key, query, value projections for all heads, but in a batch
25
+ self.in_proj = nn.Linear(hidden_dim, hidden_dim * 3, bias=True)
26
+ self.out_proj = nn.Linear(hidden_dim, hidden_dim, bias=True)
27
+
28
+ def __call__(
29
+ self,
30
+ x: Tensor,
31
+ input_pos: Tensor,
32
+ kv_cache: KVCacheProtocol,
33
+ cu_seqlens_q: Tensor,
34
+ cu_seqlens_kv: Tensor,
35
+ ) -> Tensor:
36
+ bsz, seqlen, _ = x.shape
37
+
38
+ q, k, v = self.in_proj(x).chunk(3, dim=-1)
39
+
40
+ q = q.view(bsz, seqlen, self.n_head, self.head_dim)
41
+ k = k.view(bsz, seqlen, self.n_head, self.head_dim)
42
+ v = v.view(bsz, seqlen, self.n_head, self.head_dim)
43
+
44
+ q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
45
+
46
+ k, v = kv_cache.update(input_pos, k, v)
47
+
48
+ attn: Tensor = sageattention.sageattn_varlen(
49
+ q,
50
+ k,
51
+ v,
52
+ cu_seqlens_q=cu_seqlens_q,
53
+ cu_seqlens_kv=cu_seqlens_kv,
54
+ max_seqlen_q=1,
55
+ max_seqlen_k=self.max_seq_length,
56
+ )
57
+
58
+ attn = attn.transpose(1, 2).contiguous().view(bsz, seqlen, self.hidden_dim)
59
+
60
+ attn = self.out_proj(attn)
61
+
62
+ return attn
63
+
64
+
65
+ class TransformerBlock(TransformerBlockABC):
66
+ def __init__(self, n_head, ffn_dim, hidden_dim, max_seq_length) -> None:
67
+ super().__init__(n_head, ffn_dim, hidden_dim, max_seq_length)
68
+
69
+ self.attention = Attention(n_head, hidden_dim, max_seq_length)
70
+ self.feed_forward = FeedForward(hidden_dim, ffn_dim)
71
+ self.attention_norm = nn.LayerNorm([self.hidden_dim])
72
+ self.ffn_norm = nn.LayerNorm([self.hidden_dim])
73
+
74
+
75
+ class TransformerDecoder(TransformerDecoderABC):
76
+ def __init__(
77
+ self,
78
+ hidden_dim,
79
+ n_layer,
80
+ n_head,
81
+ ffn_dim,
82
+ vocab_size,
83
+ max_seq_length,
84
+ max_batch_size,
85
+ ) -> None:
86
+ super().__init__(hidden_dim, n_layer, n_head, ffn_dim, vocab_size, max_seq_length, max_batch_size)
87
+
88
+ self.layers = nn.ModuleList( # type: ignore
89
+ TransformerBlock(n_head, ffn_dim, hidden_dim, max_seq_length) for _ in range(n_layer)
90
+ )
91
+
92
+
93
+ class T2SDecoder(T2SDecoderABC):
94
+ def __init__(
95
+ self,
96
+ config,
97
+ max_seq_length=2000,
98
+ max_batch_size=10,
99
+ ) -> None:
100
+ super().__init__(config, max_seq_length, max_batch_size)
101
+
102
+ self.bert_proj = nn.Linear(1024, self.embedding_dim)
103
+ self.ar_predict_layer = nn.Linear(self.hidden_dim, self.vocab_size, bias=False)
104
+ self.h: TransformerDecoderABC = TransformerDecoder(
105
+ self.hidden_dim, self.n_layer, self.n_head, self.ffn_dim, self.vocab_size, max_seq_length, max_batch_size
106
+ )
107
+
108
+ self.kv_class = KVCacheHND
109
+
110
+ def pre_forward(self, session: T2SSession) -> tuple[list[Tensor], dict[str, Tensor]]:
111
+ return list(), dict(cu_seqlens_q=session.cu_seqlens_q, cu_seqlens_kv=session.cu_seqlens_kv)
112
+
113
+ def post_forward(self, idx: int, session: T2SSession):
114
+ if idx == 0:
115
+ session.cu_seqlens_q = torch.arange(0, session.bsz + 1, dtype=torch.int32)
116
+ session.cu_seqlens_kv = torch.cat([torch.tensor(0, dtype=torch.int32), session.input_pos])
117
+ else:
118
+ cu_seqlens_q = session.cu_seqlens_q
119
+ cu_seqlens_kv = session.cu_seqlens_kv
120
+ cu_seqlens_kv.add_(cu_seqlens_q)
121
+
122
+
123
+ class CUDAGraphCache(CUDAGraphCacheABC):
124
+ def __init__(
125
+ self,
126
+ decoder: T2SDecoder,
127
+ ) -> None:
128
+ self.is_applicable = False
129
+ super().__init__(decoder)
130
+
131
+ if torch.cuda.is_available():
132
+ self.cu_seqlens_q = torch.arange(0, decoder.max_batch_size + 1, dtype=torch.int32).to(self.device)
133
+ self.cu_seqlens_kv = torch.cat([torch.tensor(0, dtype=torch.int32), self.input_pos]).to(self.device)
134
+
135
+ def release_graph(self, session: T2SSession):
136
+ if session.id == self.id:
137
+ self.assigned = False
138
+ else:
139
+ del (
140
+ session.graph,
141
+ session.xy_pos_,
142
+ session.xy_dec_,
143
+ session.input_pos,
144
+ session.kv_cache,
145
+ session.cu_seqlens_q,
146
+ session.cu_seqlens_kv,
147
+ )
148
+
149
+ def get_cache_graph(self, session: T2SSession):
150
+ assert self.graph
151
+ session.graph = self.graph
152
+ session.stream = self.stream
153
+
154
+ session.xy_pos_ = self.xy_pos
155
+ session.xy_dec_ = self.xy_dec
156
+ session.input_pos = self.input_pos.copy_(session.input_pos)
157
+
158
+ session.cu_seqlens_q = self.cu_seqlens_q
159
+ session.cu_seqlens_kv = self.cu_seqlens_kv
160
+
161
+ for cache, cache_ in zip(self.kv_cache, session.kv_cache):
162
+ cache.sync_cache(cache_)
163
+
164
+ def capture_new_graph(self, session: T2SSession):
165
+ session.xy_pos_ = self.xy_pos.clone()
166
+ session.xy_dec_ = self.xy_dec.clone()
167
+ session.input_pos = self.input_pos.clone().copy_(session.input_pos)
168
+
169
+ session.cu_seqlens_q = self.cu_seqlens_q.clone().copy_(session.cu_seqlens_q)
170
+ session.cu_seqlens_kv = self.cu_seqlens_kv.clone().copy_(session.cu_seqlens_kv)
171
+
172
+ args, kwds = self.decoder.pre_forward(session)
173
+ graph = self.decoder.capture(self.input_pos, self.xy_pos, self.xy_dec, self.kv_cache, *args, **kwds)
174
+ session.graph = graph
175
+ session.stream = torch.cuda.Stream() # type: ignore
GPT_SoVITS/Accelerate/PyTorch/backends/torch_static_cuda_graph.py ADDED
@@ -0,0 +1,166 @@
1
+ import torch
2
+ from torch.nn import functional as F
3
+
4
+ from .. import nn
5
+ from ..structs import KVCacheProtocol, T2SSession
6
+ from ..t2s_model_abc import (
7
+ AttentionABC,
8
+ CUDAGraphCacheABC,
9
+ FeedForward,
10
+ KVCacheHND,
11
+ T2SDecoderABC,
12
+ TransformerBlockABC,
13
+ TransformerDecoderABC,
14
+ )
15
+
16
+ Tensor = torch.Tensor
17
+
18
+
19
+ class Attention(AttentionABC):
20
+ def __init__(self, n_head, hidden_dim, max_seq_length):
21
+ super().__init__(n_head, hidden_dim, max_seq_length)
22
+
23
+ # key, query, value projections for all heads, but in a batch
24
+ self.in_proj = nn.Linear(hidden_dim, hidden_dim * 3, bias=True)
25
+ self.out_proj = nn.Linear(hidden_dim, hidden_dim, bias=True)
26
+
27
+ def __call__(self, x: Tensor, input_pos: Tensor, kv_cache: KVCacheProtocol, attn_mask: Tensor):
28
+ bsz, seqlen, _ = x.shape
29
+
30
+ q, k, v = self.in_proj(x).chunk(3, dim=-1)
31
+
32
+ q = q.view(bsz, seqlen, self.n_head, self.head_dim)
33
+ k = k.view(bsz, seqlen, self.n_head, self.head_dim)
34
+ v = v.view(bsz, seqlen, self.n_head, self.head_dim)
35
+
36
+ q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
37
+
38
+ k, v = kv_cache.update(input_pos, k, v)
39
+
40
+ attn = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
41
+
42
+ attn = attn.transpose(1, 2).contiguous().view(bsz, seqlen, self.hidden_dim)
43
+
44
+ attn = self.out_proj(attn)
45
+
46
+ return attn
47
+
48
+
49
+ class TransformerBlock(TransformerBlockABC):
50
+ def __init__(self, n_head: int, ffn_dim: int, hidden_dim: int, max_seq_length: int) -> None:
51
+ super().__init__(n_head, ffn_dim, hidden_dim, max_seq_length)
52
+
53
+ self.attention = Attention(n_head, hidden_dim, max_seq_length)
54
+ self.feed_forward = FeedForward(hidden_dim, ffn_dim)
55
+ self.attention_norm = nn.LayerNorm([self.hidden_dim])
56
+ self.ffn_norm = nn.LayerNorm([self.hidden_dim])
57
+
58
+
59
+ class TransformerDecoder(TransformerDecoderABC):
60
+ def __init__(
61
+ self,
62
+ hidden_dim,
63
+ n_layer,
64
+ n_head,
65
+ ffn_dim,
66
+ vocab_size,
67
+ max_seq_length,
68
+ max_batch_size,
69
+ ) -> None:
70
+ super().__init__(hidden_dim, n_layer, n_head, ffn_dim, vocab_size, max_seq_length, max_batch_size)
71
+
72
+ self.layers = nn.ModuleList( # type: ignore
73
+ TransformerBlock(n_head, ffn_dim, hidden_dim, max_seq_length) for _ in range(n_layer)
74
+ )
75
+
76
+
77
+ class T2SDecoder(T2SDecoderABC):
78
+ def __init__(
79
+ self,
80
+ config,
81
+ max_seq_length=2000,
82
+ max_batch_size=10,
83
+ ) -> None:
84
+ super().__init__(config, max_seq_length, max_batch_size)
85
+
86
+ self.bert_proj = nn.Linear(1024, self.embedding_dim)
87
+ self.ar_predict_layer = nn.Linear(self.hidden_dim, self.vocab_size, bias=False)
88
+ self.h: TransformerDecoderABC = TransformerDecoder(
89
+ self.hidden_dim, self.n_layer, self.n_head, self.ffn_dim, self.vocab_size, max_seq_length, max_batch_size
90
+ )
91
+
92
+ self.kv_class = KVCacheHND
93
+
94
+ def pre_forward(self, session: T2SSession):
95
+ attn_mask = session.attn_mask
96
+ return list(), dict(attn_mask=attn_mask)
97
+
98
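+ # Build the prefill attention mask on step 0, then mark each newly decoded position as attendable on every subsequent step.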
+ def post_forward(self, idx: int, session: T2SSession) -> None:
99
+ if idx == 0:
100
+ prefill_len = session.prefill_len
101
+ bsz = session.bsz
102
+
103
+ range_tensor = torch.arange(self.max_seq_length).view(1, 1, 1, self.max_seq_length)
104
+ prefill_len_expanded = prefill_len.view(bsz, 1, 1, 1)
105
+ attn_mask = range_tensor < prefill_len_expanded
106
+ attn_mask = attn_mask.expand(-1, self.n_head, -1, -1)
107
+
108
+ session.attn_mask = attn_mask
109
+
110
+ attn_mask = session.attn_mask
111
+ input_pos = session.input_pos
112
+ attn_mask[torch.arange(session.bsz), :, :, input_pos] = True
113
+
114
+
115
+ class CUDAGraphCache(CUDAGraphCacheABC):
116
+ def __init__(
117
+ self,
118
+ decoder,
119
+ ) -> None:
120
+ self.is_applicable = True
121
+ super().__init__(decoder)
122
+ if torch.cuda.is_available():
123
+ self.attn_mask = (
124
+ torch.randint(0, 2, (decoder.max_batch_size, decoder.n_head, 1, decoder.max_seq_length))
125
+ .bool()
126
+ .to(self.device, self.dtype)
127
+ )
128
+
129
+ def release_graph(self, session: T2SSession):
130
+ if session.id == self.id:
131
+ self.assigned = False
132
+ else:
133
+ del (
134
+ session.graph,
135
+ session.xy_pos_,
136
+ session.xy_dec_,
137
+ session.input_pos,
138
+ session.kv_cache,
139
+ session.attn_mask,
140
+ )
141
+
142
+ def get_cache_graph(self, session: T2SSession):
143
+ assert self.graph
144
+ session.graph = self.graph
145
+ session.stream = self.stream
146
+
147
+ session.xy_pos_ = self.xy_pos
148
+ session.xy_dec_ = self.xy_dec
149
+ session.input_pos = self.input_pos.copy_(session.input_pos)
150
+
151
+ session.attn_mask = self.attn_mask
152
+
153
+ for cache, cache_ in zip(self.kv_cache, session.kv_cache):
154
+ cache.sync_cache(cache_)
155
+
156
+ def capture_new_graph(self, session: T2SSession):
157
+ session.xy_pos_ = self.xy_pos.clone()
158
+ session.xy_dec_ = self.xy_dec.clone()
159
+ session.input_pos = self.input_pos.clone().copy_(session.input_pos)
160
+
161
+ session.attn_mask = self.attn_mask.clone().copy_(session.attn_mask)
162
+
163
+ args, kwds = self.decoder.pre_forward(session)
164
+ graph = self.decoder.capture(self.input_pos, self.xy_pos, self.xy_dec, self.kv_cache, *args, **kwds)
165
+ session.graph = graph
166
+ session.stream = torch.cuda.Stream() # type: ignore
GPT_SoVITS/Accelerate/PyTorch/backends/torch_varlen.py ADDED
@@ -0,0 +1,145 @@
1
+ from typing import NoReturn
2
+
3
+ import torch
4
+ from torch.nn import functional as F
5
+
6
+ from .. import nn
7
+ from ..structs import KVCacheProtocol, T2SSession
8
+ from ..t2s_model_abc import (
9
+ AttentionABC,
10
+ CUDAGraphCacheABC,
11
+ FeedForward,
12
+ KVCacheHNDVarlen,
13
+ T2SDecoderABC,
14
+ TransformerBlockABC,
15
+ TransformerDecoderABC,
16
+ )
17
+
18
+ Tensor = torch.Tensor
19
+
20
+
21
+ class Attention(AttentionABC):
22
+ def __init__(self, n_head, hidden_dim, max_seq_length):
23
+ super().__init__(n_head, hidden_dim, max_seq_length)
24
+
25
+ # key, query, value projections for all heads, but in a batch
26
+ self.in_proj = nn.Linear(hidden_dim, hidden_dim * 3, bias=True)
27
+ self.out_proj = nn.Linear(hidden_dim, hidden_dim, bias=True)
28
+
29
+ def __call__(self, x: Tensor, input_pos: Tensor, kv_cache: KVCacheProtocol, attn_mask: Tensor):
30
+ bsz, seqlen, _ = x.shape
31
+
32
+ q, k, v = self.in_proj(x).chunk(3, dim=-1)
33
+
34
+ q = q.view(bsz, seqlen, self.n_head, self.head_dim)
35
+ k = k.view(bsz, seqlen, self.n_head, self.head_dim)
36
+ v = v.view(bsz, seqlen, self.n_head, self.head_dim)
37
+
38
+ q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
39
+
40
+ k, v = kv_cache.update(input_pos, k, v)
41
+
42
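+ # Varlen optimisation: attend only over the longest filled prefix of the cache instead of the full max_seq_length buffer.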
+ max_idx = input_pos.max()
43
+
44
+ q, k, v = map(lambda x: x[..., :max_idx, :], (q, k, v))
45
+
46
+ mask = attn_mask[..., :max_idx]
47
+
48
+ attn = F.scaled_dot_product_attention(q, k, v, mask)
49
+
50
+ attn = attn.transpose(1, 2).contiguous().view(bsz, seqlen, self.hidden_dim)
51
+
52
+ attn = self.out_proj(attn)
53
+
54
+ return attn
55
+
56
+
57
+ class TransformerBlock(TransformerBlockABC):
58
+ def __init__(self, n_head: int, ffn_dim: int, hidden_dim: int, max_seq_length: int) -> None:
59
+ super().__init__(n_head, ffn_dim, hidden_dim, max_seq_length)
60
+
61
+ self.attention = Attention(n_head, hidden_dim, max_seq_length)
62
+ self.feed_forward = FeedForward(hidden_dim, ffn_dim)
63
+ self.attention_norm = nn.LayerNorm([self.hidden_dim])
64
+ self.ffn_norm = nn.LayerNorm([self.hidden_dim])
65
+
66
+
67
+ class TransformerDecoder(TransformerDecoderABC):
68
+ def __init__(
69
+ self,
70
+ hidden_dim,
71
+ n_layer,
72
+ n_head,
73
+ ffn_dim,
74
+ vocab_size,
75
+ max_seq_length,
76
+ max_batch_size,
77
+ ) -> None:
78
+ super().__init__(hidden_dim, n_layer, n_head, ffn_dim, vocab_size, max_seq_length, max_batch_size)
79
+
80
+ self.layers = nn.ModuleList( # type: ignore
81
+ TransformerBlock(n_head, ffn_dim, hidden_dim, max_seq_length) for _ in range(n_layer)
82
+ )
83
+
84
+
85
+ class T2SDecoder(T2SDecoderABC):
86
+ def __init__(
87
+ self,
88
+ config,
89
+ max_seq_length=2000,
90
+ max_batch_size=10,
91
+ ) -> None:
92
+ super().__init__(config, max_seq_length, max_batch_size)
93
+
94
+ self.bert_proj = nn.Linear(1024, self.embedding_dim)
95
+ self.ar_predict_layer = nn.Linear(self.hidden_dim, self.vocab_size, bias=False)
96
+ self.h: TransformerDecoderABC = TransformerDecoder(
97
+ self.hidden_dim, self.n_layer, self.n_head, self.ffn_dim, self.vocab_size, max_seq_length, max_batch_size
98
+ )
99
+
100
+ self.kv_class = KVCacheHNDVarlen
101
+
102
+ def capture(
103
+ self,
104
+ *args,
105
+ **kwds,
106
+ ) -> NoReturn:
107
+ raise NotImplementedError("Cuda Graph Is Not Supported For Varlen Model")
108
+
109
+ def pre_forward(self, session: T2SSession):
110
+ attn_mask = session.attn_mask
111
+ return list(), dict(attn_mask=attn_mask)
112
+
113
+ def post_forward(self, idx: int, session: T2SSession) -> None:
114
+ if idx == 0:
115
+ prefill_len = session.prefill_len
116
+ bsz = session.bsz
117
+
118
+ range_tensor = torch.arange(self.max_seq_length).view(1, 1, 1, self.max_seq_length)
119
+ prefill_len_expanded = prefill_len.view(bsz, 1, 1, 1)
120
+ attn_mask = range_tensor < prefill_len_expanded
121
+ attn_mask = attn_mask.expand(-1, self.n_head, -1, -1)
122
+
123
+ session.attn_mask = attn_mask
124
+
125
+ attn_mask = session.attn_mask
126
+ input_pos = session.input_pos
127
+ attn_mask[torch.arange(session.bsz), :, :, input_pos] = True
128
+
129
+
130
+ class CUDAGraphCache(CUDAGraphCacheABC):
131
+ def __init__(
132
+ self,
133
+ decoder,
134
+ ) -> None:
135
+ self.is_applicable = False
136
+ super().__init__(decoder)
137
+
138
+ def release_graph(self, session: T2SSession):
139
+ raise NotImplementedError("Cuda Graph Is Not Supported For Varlen Model")
140
+
141
+ def get_cache_graph(self, session: T2SSession):
142
+ raise NotImplementedError("Cuda Graph Is Not Supported For Varlen Model")
143
+
144
+ def capture_new_graph(self, session: T2SSession):
145
+ raise NotImplementedError("Cuda Graph Is Not Supported For Varlen Model")
GPT_SoVITS/Accelerate/PyTorch/export.py ADDED
@@ -0,0 +1,467 @@
1
+ import enum
2
+ import os
3
+ import os.path as osp
4
+ import time
5
+ from pathlib import Path
6
+ from typing import MutableSequence, TypeAlias
7
+
8
+ import torch
9
+ import typer
10
+ from torch.export import Dim
11
+ from torch.nn import functional as F
12
+
13
+ from ..logger import logger
14
+ from . import nn
15
+ from .t2s_model_abc import AttentionABC, FeedForward, T2SDecoderABC, TransformerBlockABC, TransformerDecoderABC
16
+
17
+ Tensor = torch.Tensor
18
+
19
+ KVCache: TypeAlias = tuple[Tensor, Tensor]
20
+
21
+ app = typer.Typer(
22
+ context_settings={"help_option_names": ["-h", "--help"]},
23
+ add_completion=False,
24
+ )
25
+
26
+
27
+ class Stage(str, enum.Enum):
28
+ embed = "embed"
29
+ decode = "decode"
30
+
31
+
32
+ class KVCacheONNX:
33
+ @staticmethod
34
+ def empty(kv_cache):
35
+ assert len(kv_cache) == 2
36
+ k_cache, v_cache = kv_cache
37
+
38
+ k_cache[:] = 0
39
+ v_cache[:] = 0
40
+
41
+ @staticmethod
42
+ def update_cache(
43
+ input_pos: Tensor, k_val: Tensor, v_val: Tensor, kv_cache: tuple[Tensor, Tensor], cache_idx: Tensor
44
+ ):
45
+ # input_pos: [B, ], k_val: [B, H, 1, D]
46
+ k_out, v_out = kv_cache
47
+ ip0 = input_pos - 1
48
+
49
+ k_out[cache_idx, :, ip0, None] = k_val
50
+ v_out[cache_idx, :, ip0, None] = v_val
51
+
52
+ return k_out, v_out
53
+
54
+ @staticmethod
55
+ def prefill_kv(k_val: Tensor, v_val: Tensor, kv_cache: tuple[Tensor, Tensor]):
56
+ # k_val: [B, S, H, D]
57
+ k_cache, v_cache = kv_cache
58
+
59
+ k_cache[..., : k_val.shape[1], :] = k_val.transpose(1, 2)
60
+ v_cache[..., : v_val.shape[1], :] = v_val.transpose(1, 2)
61
+
62
+ @staticmethod
63
+ def init_cache(batch_size: int, max_seq_length: int, n_heads: int, head_dim: int, dtype: torch.dtype):
64
+ cache_shape = (batch_size, n_heads, max_seq_length, head_dim)
65
+
66
+ return (torch.zeros(cache_shape, dtype=dtype), torch.zeros(cache_shape, dtype=dtype))
67
+
68
+
69
+ class AttentionONNX(AttentionABC):
70
+ def __init__(self, n_heads: int, head_dim: int, max_seq_length: int):
71
+ super().__init__(n_heads, head_dim, max_seq_length)
72
+
73
+ self.in_proj = nn.Linear(self.hidden_dim, self.hidden_dim * 3, bias=True)
74
+ self.out_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=True)
75
+
76
+ def __call__(self, *args, **kwds): # type: ignore
77
+ pass
78
+
79
+ def onnx_prefill(self, x: Tensor, kv_cache: KVCache, attn_mask: Tensor) -> Tensor:
80
+ bsz, seqlen, _ = x.shape
81
+
82
+ torch._check(attn_mask.size(-2) == x.size(-2))
83
+
84
+ q, k, v = self.in_proj(x.unsqueeze(0)).chunk(3, dim=-1)
85
+
86
+ q, k, v = map(lambda x: x.contiguous().view(bsz, seqlen, self.n_head, self.head_dim), (q, k, v))
87
+
88
+ KVCacheONNX.prefill_kv(k, v, kv_cache)
89
+
90
+ q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
91
+
92
+ attn = F.scaled_dot_product_attention(q, k, v, attn_mask)
93
+
94
+ attn = attn.transpose(1, 2).contiguous().view(1, -1, self.hidden_dim)
95
+
96
+ output = self.out_proj(attn)
97
+
98
+ return output
99
+
100
+ def onnx_decode(self, x: Tensor, input_pos: Tensor, kv_cache: KVCache, cache_idx: Tensor, attn_mask: Tensor):
101
+ bsz, seqlen, _ = x.shape
102
+
103
+ torch._check(attn_mask.size(-2) == 1)
104
+
105
+ q, k, v = self.in_proj(x).chunk(3, dim=-1)
106
+
107
+ q, k, v = map(lambda x: x.reshape(bsz, seqlen, self.n_head, self.head_dim), (q, k, v))
108
+
109
+ q, k, v = map(lambda x: x.swapaxes(1, 2), (q, k, v))
110
+
111
+ kv_cache = KVCacheONNX.update_cache(input_pos, k, v, kv_cache, cache_idx)
112
+
113
+ max_idx = int(input_pos.max())
114
+
115
+ q, k, v = map(lambda x: x[..., :max_idx, :], (q, *kv_cache))
116
+
117
+ mask = attn_mask[..., :max_idx]
118
+
119
+ attn = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
120
+
121
+ attn = attn.swapaxes(1, 2).reshape(bsz, seqlen, self.hidden_dim)
122
+
123
+ attn = self.out_proj(attn)
124
+
125
+ return attn
126
+
127
+
128
+ class TransformerBlockONNX(TransformerBlockABC):
129
+ def __init__(self, n_head: int, ffn_dim: int, hidden_dim: int, max_seq_length: int) -> None:
130
+ super().__init__(n_head, ffn_dim, hidden_dim, max_seq_length)
131
+
132
+ self.attention: AttentionONNX = AttentionONNX(n_head, hidden_dim, max_seq_length) # type: ignore
133
+ self.feed_forward = FeedForward(hidden_dim, ffn_dim)
134
+ self.attention_norm = nn.LayerNorm(self.hidden_dim)
135
+ self.ffn_norm = nn.LayerNorm(self.hidden_dim)
136
+
137
+ def onnx_prefill(self, x: Tensor, attn_mask: Tensor, kv_cache: KVCache):
138
+ h = self.attention_norm(
139
+ x
140
+ + self.attention.onnx_prefill(
141
+ x,
142
+ kv_cache,
143
+ attn_mask,
144
+ )
145
+ )
146
+ out = self.ffn_norm(h + self.feed_forward(h))
147
+
148
+ return out
149
+
150
+ def onnx_decode(self, x: Tensor, input_pos: Tensor, kv_cache: KVCache, cache_idx: Tensor, attn_mask: Tensor):
151
+ h = self.attention_norm(
152
+ x
153
+ + self.attention.onnx_decode(
154
+ x,
155
+ input_pos,
156
+ kv_cache,
157
+ cache_idx,
158
+ attn_mask,
159
+ )
160
+ )
161
+ out = self.ffn_norm(h + self.feed_forward(h))
162
+ return out
163
+
164
+
165
+ class TransformerDecoderONNX(TransformerDecoderABC):
166
+ def __init__(
167
+ self,
168
+ hidden_dim: int,
169
+ n_layer: int,
170
+ n_head: int,
171
+ ffn_dim: int,
172
+ vocab_size: int,
173
+ max_seq_length: int,
174
+ max_batch_size: int,
175
+ ) -> None:
176
+ super().__init__(hidden_dim, n_layer, n_head, ffn_dim, vocab_size, max_seq_length, max_batch_size)
177
+
178
+ self.layers: MutableSequence[TransformerBlockONNX] = nn.ModuleList( # type: ignore
179
+ TransformerBlockONNX(n_head, ffn_dim, hidden_dim, max_seq_length) for _ in range(n_layer)
180
+ )
181
+
182
+ def onnx_prefill(self, x: Tensor, mask: Tensor, *kv_caches: KVCache):
183
+ for layer, kv_cache in zip(self.layers, kv_caches):
184
+ x = layer.onnx_prefill(
185
+ x,
186
+ mask,
187
+ kv_cache,
188
+ )
189
+ return x
190
+
191
+ def onnx_decode(
192
+ self,
193
+ input_pos: Tensor,
194
+ x: Tensor,
195
+ cache_idx: Tensor,
196
+ attn_mask: Tensor,
197
+ *kv_caches: KVCache,
198
+ ):
199
+ for layer, kv_cache in zip(self.layers, kv_caches):
200
+ x = layer.onnx_decode(
201
+ x,
202
+ input_pos,
203
+ kv_cache,
204
+ cache_idx,
205
+ attn_mask,
206
+ )
207
+
208
+ return x
209
+
210
+
211
+ class T2SDecoderONNX(T2SDecoderABC):
212
+ def __init__(self, config: dict, max_seq_length: int = 2000, max_batch_size: int = 10) -> None:
213
+ super().__init__(config, max_seq_length, max_batch_size)
214
+
215
+ self.bert_proj = nn.Linear(1024, self.embedding_dim)
216
+ self.ar_predict_layer = nn.Linear(self.hidden_dim, self.vocab_size, bias=False)
217
+
218
+ self.h = TransformerDecoderONNX(
219
+ self.hidden_dim, self.n_layer, self.n_head, self.ffn_dim, self.vocab_size, max_seq_length, max_batch_size
220
+ )
221
+
222
+ def pre_forward(self, session) -> tuple[list[Tensor], dict[str, Tensor]]:
223
+ return super().pre_forward(session)
224
+
225
+ def post_forward(self, idx: int, session) -> None:
226
+ return super().post_forward(idx, session)
227
+
228
+ def embed_onnx_(
229
+ self,
230
+ x: Tensor,
231
+ x_len: Tensor,
232
+ y: torch.Tensor,
233
+ bert_features: Tensor,
234
+ ):
235
+ B = x.shape[0]
236
+ D = self.embedding_dim
237
+ T_TOTAL = 500
238
+ xy_pos = torch.zeros((B, T_TOTAL, D)).to(bert_features[0].dtype)
239
+
240
+ bert_features = bert_features.transpose(1, 2)
241
+
242
+ y_len = y.shape[1]
243
+ y_emb = self.ar_audio_embedding(y)
244
+ y_pos = self.ar_audio_position.prefill(y_emb)
245
+
246
+ for bs, x_, len_, bert_feature in zip(torch.arange(x.shape[0]), x, x_len, bert_features):
247
+ x_emb = self.ar_text_embedding(x_[:len_])
248
+
249
+ bert = self.bert_proj(bert_feature[:len_])
250
+
251
+ x_emb = x_emb + bert
+ x_pos = self.ar_text_position.prefill(x_emb.unsqueeze(0))
+
+ xy_pos[None, bs, :len_] = x_pos
+ xy_pos[None, bs, len_ : len_ + y_len] = y_pos
263
+
264
+ return xy_pos[: x.shape[0]], x_len
265
+
266
+ def embed_onnx(
267
+ self,
268
+ x: torch.Tensor, # [B, Tx]
269
+ x_len: torch.Tensor, # [B]
270
+ y: torch.Tensor, # [1, Ty, D]
271
+ bert_features: torch.Tensor, # [B, 1024, Tx]
272
+ ):
273
+ # [B, 1024, Tx] -> [B, Tx, 1024]
274
+ bert_features = bert_features.transpose(1, 2)
275
+
276
+ Ty = y.shape[1]
277
+ Tx = x.shape[1]
278
+ B = x.shape[0]
279
+ D = self.embedding_dim
280
+ T_TOTAL = 500
281
+
282
+ # mask_x: [B, Tx], True where column j < x_len[i]
283
+ col = torch.arange(Tx, device=x.device).unsqueeze(0) # [1, Tx]
284
+ mask_x = col < x_len.view(-1, 1) # [B, Tx]
285
+ mask_x3 = mask_x.unsqueeze(-1) # [B, Tx, 1]
286
+
287
+ torch._check((Ty >= 0) and (Ty <= 250), "y_len out of range")
288
+ torch._check((Tx >= 0) and (Tx <= 250), "x_len out of range")
289
+
290
+ y_emb = self.ar_audio_embedding(y) # [1, Ty, D]
291
+ y_pos = self.ar_audio_position.prefill(y_emb) # [1, Ty, D]
292
+
293
+ x_emb_full = self.ar_text_embedding(x) # [B, Tx, D]
294
+ bert_full = self.bert_proj(bert_features) # [B, Tx, D]
+
+ x_sum_full = x_emb_full + bert_full # [B, Tx, D]
+ x_pos_full = self.ar_text_position.prefill(x_sum_full) # [B, Tx, D]
+
+ xy_pos = torch.zeros((B, T_TOTAL, D), dtype=x_pos_full.dtype, device=x_pos_full.device)
+
+ # Only positions within each item's text length receive the positioned text embeddings.
+ xy_pos[:, :Tx, :] = torch.where(
+ mask_x3,
+ x_pos_full.to(xy_pos.dtype),
+ xy_pos[:, :Tx, :],
+ )
312
+
313
+ # Place the audio-prompt (y) embeddings starting at offset x_len for each batch item
314
+ # [Ty] Index: offsets + [0..Ty-1]
315
+ offsets = x_len.clamp(min=0, max=T_TOTAL - Ty) # [B]
316
+ idx_y = offsets.unsqueeze(1) + torch.arange(Ty, device=x_pos_full.device) # [B, Ty]
317
+ # scatter to dim=1
318
+ # expand index to [B, Ty, D]
319
+ idx_y3 = idx_y.unsqueeze(-1).expand(B, Ty, D)
320
+ y_pos_b = y_pos.expand(B, Ty, D).to(xy_pos.dtype) # [B, Ty, D]
321
+ xy_pos = xy_pos.scatter(1, idx_y3, y_pos_b)
322
+
323
+ return xy_pos, x_len
324
+
325
+
326
+ def torchscript_export(model: T2SDecoderONNX, stage="embed"):
327
+ if stage == "embed":
328
+ x = torch.randint(1, 600, (model.max_batch_size, 50))
329
+ x_len = torch.randint(30, 50, (model.max_batch_size,))
330
+ y = torch.randint(1, 600, (1, 200))
331
+ bert_features = torch.rand((model.max_batch_size, 1024, 50))
332
+
333
+ x_len[-1] = 50
334
+
335
+ mask = torch.arange(x_len.max().item(), device=x.device).unsqueeze(0) < x_len.unsqueeze(1)
336
+
337
+ x = x * mask
338
+ bert_features = bert_features * mask.unsqueeze(1)
339
+
340
+ try:
341
+ a, c = model.embed_onnx_(x, x_len, y, bert_features)
342
+ b, d = model.embed_onnx(x, x_len, y, bert_features)
343
347
+ assert torch.allclose(a, b, atol=1e-6, rtol=1e-8), (a - b).square().mean()
348
+
349
+ setattr(model, "forward", model.embed_onnx)
350
+ scripted_model = torch.jit.script(model, example_inputs=[(x, x_len, y, bert_features)])
351
+
352
+ onnx_program = torch.onnx.export(
353
+ scripted_model,
354
+ (x, x_len, y, bert_features),
355
+ input_names=["text", "text_len", "prompt", "bert_features"],
356
+ output_names=["xy_pos", "input_pos"],
357
+ dynamic_axes={
358
+ "text": {0: "Batch_Size", 1: "Sequence_Length_X"},
359
+ "prompt": {0: "Batch_Size", 1: "Sequence_Length_Y"},
360
+ "bert_features": {0: "Batch_Size", 1: "Sequence_Length_X"},
361
+ },
362
+ opset_version=21,
363
+ training=False,
364
+ do_constant_folding=True,
365
+ external_data=False,
366
+ )
367
+ assert onnx_program
368
+ onnx_program.save("onnx_export/AR_Embedding_TorchScript.onnx")
369
+
370
+ except Exception:
371
+ logger.bind(show_locals=False).exception("")
372
+
373
+
374
+ def dynamo_export(model: T2SDecoderONNX, stage="embed"):
375
+ if stage == "embed":
376
+ x = torch.randint(1, 600, (model.max_batch_size, 50))
377
+ x_len = torch.randint(30, 50, (model.max_batch_size,))
378
+ y = torch.randint(1, 600, (1, 200))
379
+ bert_features = torch.rand((model.max_batch_size, 1024, 50))
380
+
381
+ x_len[-1] = 50
382
+
383
+ mask = torch.arange(x_len.max().item(), device=x.device).unsqueeze(0) < x_len.unsqueeze(1)
384
+
385
+ x = x * mask
386
+ bert_features = (bert_features.transpose(1, 2) * mask.unsqueeze(-1)).transpose(1, 2)
387
+
388
+ dynamic_shapes = [
389
+ {
390
+ 0: Dim("Batch_Size", min=1, max=4),
391
+ 1: Dim("Sequence_Length_X", min=1, max=50),
392
+ },
393
+ {
394
+ 0: Dim("Batch_Size", min=1, max=4),
395
+ },
396
+ {
397
+ 1: Dim("Sequence_Length_Y", min=1, max=250),
398
+ },
399
+ {
400
+ 0: Dim("Batch_Size", min=1, max=4),
401
+ 2: Dim("Sequence_Length_X", min=1, max=50),
402
+ },
403
+ ]
404
+ try:
405
+ a = model.embed_onnx_(x, x_len, y, bert_features)[0]
406
+ b = model.embed_onnx(x, x_len, y, bert_features)[0]
407
409
+ assert torch.allclose(a, b, atol=1e-6, rtol=1e-8), (a - b).square().mean()
410
+
411
+ setattr(model, "forward", model.embed_onnx)
412
+ onnx_program = torch.onnx.export(
413
+ model,
414
+ (x, x_len, y, bert_features),
415
+ input_names=["text", "text_len", "prompt", "bert_features"],
416
+ output_names=["xy_pos", "input_pos"],
417
+ dynamo=True,
418
+ dynamic_shapes=dynamic_shapes,
419
+ opset_version=21,
420
+ training=False,
421
+ do_constant_folding=True,
422
+ external_data=False,
423
+ )
424
+ assert onnx_program
425
+ onnx_program.save("onnx_export/AR_Embedding_Dynamo.onnx")
426
+ except Exception:
427
+ logger.bind(show_locals=False).exception("")
428
+
429
+
430
+ @app.command()
431
+ def export(
432
+ ckpt_path: Path = typer.Option(
433
+ ...,
434
+ "--ckpt-path",
435
+ file_okay=True,
436
+ dir_okay=False,
437
+ exists=True,
438
+ readable=True,
439
+ show_default=False,
440
+ help="AR Checkpoint",
441
+ ),
442
+ dynamo: bool = typer.Option(False, is_flag=True, flag_value=True, help="Use Torch Dynamo"),
443
+ stages: list[Stage] = typer.Option([Stage.embed], "--stages", help="Stage to export"),
444
+ ):
445
+ os.makedirs("onnx_export", exist_ok=True)
446
+ dict_s1 = torch.load(ckpt_path, "cpu", mmap=True)
447
+ config = dict_s1["config"]
448
+ model = T2SDecoderONNX(config, 2000, 4)
449
+ state_dict = dict_s1["weight"]
450
+ model.load_state_dict(state_dict)
451
+
452
+ for stage in stages:
453
+ if dynamo:
454
+ dynamo_export(model, stage)
455
+ else:
456
+ torchscript_export(model, stage)
457
+
458
+
459
+ def get_prog_name() -> str:
460
+ script_rel = ".".join(["GPT_SoVITS", "Accelerate", "PyTorch", osp.basename(__file__).removesuffix(".py")])
461
+ return f"python -s -m {script_rel}"
462
+
463
+
464
+ if __name__ == "__main__":
465
+ t = time.perf_counter()
466
+ app(prog_name=get_prog_name())
467
+ logger.info(f"Exec Time: {time.perf_counter() - t:.2f} secs")
GPT_SoVITS/Accelerate/PyTorch/nn.py ADDED
@@ -0,0 +1,69 @@
1
+ """
2
+ Enhanced Type Hint nn.Module
3
+ Modified From https://github.com/labmlai/labml/blob/master/helpers/labml_helpers/module.py
4
+ """
5
+
6
+ from typing import Any
7
+
8
+ import torch.nn
9
+ from torch.nn import (
10
+ functional as functional,
11
+ )
12
+ from torch.nn import (
13
+ utils as utils,
14
+ )
15
+ from torch.nn.modules import * # type: ignore # noqa: F403
16
+ from torch.nn.parameter import (
17
+ Parameter as Parameter,
18
+ )
19
+
20
+ Tensor = torch.Tensor
21
+
22
+
23
+ class Module(torch.nn.Module):
24
+ r"""
25
+ Wraps ``torch.nn.Module`` to overload ``__call__`` instead of
26
+ ``forward`` for better type checking.
27
+
28
+ `PyTorch Github issue for clarification <https://github.com/pytorch/pytorch/issues/44605>`_
29
+ """
30
+
31
+ def _forward_unimplemented(self, *input: Any) -> None:
32
+ # To stop PyTorch from giving abstract methods warning
33
+ pass
34
+
35
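+ # Subclasses that define __call__ get it re-installed as forward, so nn.Module's dispatch still works while type checkers see the precise call signature.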
+ def __init_subclass__(cls, **kwargs):
36
+ if cls.__dict__.get("__call__", None) is None:
37
+ return
38
+
39
+ setattr(cls, "forward", cls.__dict__["__call__"])
40
+ delattr(cls, "__call__")
41
+
42
+ @property
43
+ def device(self) -> torch.device:
44
+ params = self.parameters()
45
+ try:
46
+ sample_param = next(params)
47
+ return sample_param.device
48
+ except StopIteration:
49
+ raise RuntimeError(f"Unable to determine device of {self.__class__.__name__}") from None
50
+
51
+
52
+ class Linear(torch.nn.Linear):
53
+ def __call__(self, input: Tensor) -> Tensor:
54
+ return super().__call__(input)
55
+
56
+
57
+ class Dropout(torch.nn.Dropout):
58
+ def __call__(self, input: Tensor) -> Tensor:
59
+ return super().__call__(input)
60
+
61
+
62
+ class Embedding(torch.nn.Embedding):
63
+ def __call__(self, input: Tensor) -> Tensor:
64
+ return super().__call__(input)
65
+
66
+
67
+ class LayerNorm(torch.nn.LayerNorm):
68
+ def __call__(self, input: Tensor) -> Tensor:
69
+ return super().__call__(input)
GPT_SoVITS/Accelerate/PyTorch/sample_funcs.py ADDED
@@ -0,0 +1,67 @@
1
+ from typing import Protocol
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+
6
+ Tensor = torch.Tensor
7
+
8
+
9
+ class SampleProtocol(Protocol):
10
+ @staticmethod
11
+ def __call__(
12
+ logits: Tensor,
13
+ previous_tokens: Tensor,
14
+ temperature: float,
15
+ top_k: int,
16
+ top_p: float,
17
+ repetition_penalty: float,
18
+ ) -> Tensor: ...
19
+
20
+
21
+ class sample_naive(SampleProtocol):
22
+ @staticmethod
23
+ def __call__(
24
+ logits: Tensor,
25
+ previous_tokens: Tensor,
26
+ temperature: float,
27
+ top_k: int,
28
+ top_p: float,
29
+ repetition_penalty: float,
30
+ ):
31
+ if temperature <= 1e-5:
32
+ probs = F.softmax(logits, dim=-1)
33
+ return torch.argmax(probs, dim=-1, keepdim=True).to(dtype=torch.int32)
34
+
35
+ if repetition_penalty != 1.0:
36
+ previous_tokens = previous_tokens.long()
37
+ score = torch.gather(logits, dim=1, index=previous_tokens)
38
+ score = torch.where(
39
+ score < 0,
40
+ score * repetition_penalty,
41
+ score / repetition_penalty,
42
+ )
43
+ logits.scatter_(dim=1, index=previous_tokens, src=score)
44
+
45
+ if top_p < 1.0:
46
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
47
+ cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
48
+ cum_probs[cum_probs > 1] = 1
49
+ sorted_indices_to_remove = cum_probs > top_p
50
+ sorted_indices_to_remove[:, 0] = False # keep at least one option
51
+ indices_to_remove = sorted_indices_to_remove.scatter(
52
+ dim=1, index=sorted_indices, src=sorted_indices_to_remove
53
+ )
54
+ logits = logits.masked_fill(indices_to_remove, -float("Inf"))
55
+
56
+ if temperature < 1.0:
57
+ logits /= temperature
58
+
59
+ v, _ = torch.topk(logits, top_k)
60
+ pivot = v[:, -1].unsqueeze(-1)
61
+ logits = torch.where(logits < pivot, -float("Inf"), logits)
62
+
63
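+ # Gumbel-max style sampling: dividing the probabilities by Exponential(1) noise and taking the argmax draws from the categorical distribution without torch.multinomial.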
+ probs = F.softmax(logits, dim=-1)
64
+ q = -torch.log(torch.rand_like(probs))
65
+ idx_next = torch.argmax(probs / q, dim=-1, keepdim=True).to(dtype=torch.int32)
66
+
67
+ return idx_next
GPT_SoVITS/Accelerate/PyTorch/structs.py ADDED
@@ -0,0 +1,151 @@
1
+ """
2
+ Modified From https://github.com/XXXXRT666/GPT-SoVITS
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+ from typing import Literal, MutableSequence, Optional, Protocol
9
+
10
+ import torch
11
+
12
+ from .sample_funcs import SampleProtocol, sample_naive
13
+
14
+ Tensor = torch.Tensor
15
+
16
+
17
+ @dataclass
18
+ class T2SResult:
19
+ result: list[Tensor] | None = None
20
+ infer_speed: tuple[float, float] = (0.0, 0.0)
21
+ status: Literal["Success", "Error"] = "Success"
22
+ exception: Optional[Exception] = None
23
+ traceback: Optional[str] = None
24
+
25
+
26
+ @dataclass
27
+ class T2SRequest:
28
+ x: list[torch.Tensor]
29
+ x_lens: Tensor
30
+ prompts: torch.Tensor
31
+ bert_feature: list[Tensor]
32
+ valid_length: int
33
+ top_k: int = 5
34
+ top_p: float = 1
35
+ early_stop_num: int = -1
36
+ temperature: float = 1.0
37
+ repetition_penalty: float = 1.35
38
+ use_cuda_graph: bool = False
39
+ debug: bool = False
40
+
41
+
42
+ class KVCacheProtocol(Protocol):
43
+ k_cache: Tensor
44
+ v_cache: Tensor
45
+
46
+ def __init__(self, batch_size: int, max_seq_length: int, n_heads: int, head_dim: int) -> None: ...
47
+
48
+ def empty(self) -> None: ...
49
+
50
+ def update(self, input_pos: Tensor, k_val: Tensor, v_val: Tensor, *args, **kwds) -> tuple[Tensor, Tensor]: ...
51
+
52
+ def prefill_kv(self, k_val: Tensor, v_val: Tensor) -> None: ...
53
+
54
+ def sync_cache(self, kv_cache: KVCacheProtocol) -> None: ...
55
+
56
+
57
+ class T2SDecoderProtocol(Protocol):
58
+ max_seq_length: int
59
+ EOS: int
60
+ n_head: int
61
+
62
+ @property
63
+ def device(self) -> torch.device: ...
64
+
65
+ def embed(self, x: list[Tensor], y: Tensor, bert_features: list[Tensor]) -> Tensor: ...
66
+
67
+
68
+ class T2SEngineProtocol(Protocol):
69
+ def _handle_request(self, request: T2SRequest) -> tuple[list[Tensor], float, float]: ...
70
+
71
+ def generate(self, request: T2SRequest) -> T2SResult: ...
72
+
73
+
74
+ class T2SSession:
75
+ def __init__(
76
+ self,
77
+ decoder: T2SDecoderProtocol,
78
+ request: T2SRequest,
79
+ sample_func: type[SampleProtocol] = sample_naive,
80
+ device: torch.device = torch.device("cpu"),
81
+ dtype: torch.dtype = torch.float32,
82
+ ):
83
+ with device:
84
+ self.decoder = decoder
85
+ self.request = request
86
+ self.device = device
87
+ self.dtype = dtype
88
+
89
+ bsz = len(request.x)
90
+ y_len = request.prompts.size(-1)
91
+ self.bsz = bsz
92
+ self.y_len = y_len
93
+ request.prompts = request.prompts.to(device, torch.int32)
94
+
95
+ # Cache
96
+ self.kv_cache: MutableSequence[KVCacheProtocol]
97
+ self.sample = sample_func()
98
+
99
+ # Forward args
100
+ self.x = [i.to(device) for i in request.x]
101
+ self.x_lens = request.x_lens.to(torch.int32)
102
+ self.y = torch.zeros((bsz, decoder.max_seq_length)).to(torch.int32)
103
+ self.y[:, : request.prompts.shape[-1]] = request.prompts
104
+ self.bert_feature = [i.to(device, dtype) for i in request.bert_feature]
105
+
106
+ self.prefill_len = self.x_lens + request.prompts.size(1)
107
+
108
+ self.input_pos = torch.zeros_like(self.prefill_len)
109
+ self.input_pos.add_(self.prefill_len)
110
+
111
+ # CUDA Graph
112
+ self.stream: Optional[torch.cuda.Stream] = None
113
+ self.graph: Optional[torch.cuda.CUDAGraph] = None
114
+ self.xy_pos_: Tensor
115
+ self.xy_dec_: Tensor
116
+
117
+ # EOS
118
+ self.completed = torch.Tensor([False] * len(self.x)).bool().to(device)
119
+ self.y_results: list[Tensor] = [None] * len(self.x) # type: ignore
120
+
121
+ self.xy_pos = decoder.embed(self.x, request.prompts, self.bert_feature)
122
+
123
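+ # Prefill mask: every position may attend to the text prompt, while the audio-prompt block is lower-triangular (causal) within itself.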
+ max_len = int(self.prefill_len.max().item())
124
+ attn_mask = torch.zeros(size=(bsz, max_len, max_len), dtype=torch.bool)
125
+
126
+ for bs in range(bsz):
127
+ pos = int(self.x_lens[bs])
128
+ seq_len = pos + y_len
129
+
130
+ attn_mask[bs, :seq_len, :pos] = True
131
+
132
+ ar_mask = ~torch.triu(
133
+ input=torch.ones(
134
+ size=(
135
+ y_len,
136
+ y_len,
137
+ ),
138
+ dtype=torch.bool,
139
+ ),
140
+ diagonal=1,
141
+ )
142
+ attn_mask[bs, pos:seq_len, pos:seq_len] = ar_mask
143
+
144
+ self.attn_mask = attn_mask
145
+ self.attn_mask = attn_mask.unsqueeze(0).expand(-1, decoder.n_head, -1, -1)
146
+
147
+ self.id: int = -1
148
+
149
+ # Sage Attn & Transformer Engine Impl
150
+ self.cu_seqlens_q: Tensor
151
+ self.cu_seqlens_kv: Tensor
GPT_SoVITS/Accelerate/PyTorch/t2s_engine.py ADDED
@@ -0,0 +1,223 @@
1
+ import contextlib
2
+ import gc
3
+ import os
4
+ import sys
5
+ import time
6
+ import traceback
7
+ from importlib import import_module
8
+
9
+ import torch
10
+ from rich.progress import BarColumn, Progress, TextColumn
11
+
12
+ from ..logger import SpeedColumnToken, console, logger
13
+ from .structs import T2SEngineProtocol, T2SRequest, T2SResult, T2SSession
14
+ from .t2s_model_abc import (
15
+ CUDAGraphCacheABC,
16
+ T2SDecoderABC,
17
+ TorchProfiler,
18
+ )
19
+
20
+
21
+ class T2SEngine(T2SEngineProtocol):
22
+ def __init__(
23
+ self,
24
+ decoder_model: T2SDecoderABC,
25
+ device: torch.device = torch.device("cpu"),
26
+ dtype: torch.dtype = torch.float32,
27
+ ) -> None:
28
+ assert device.type in {"cpu", "cuda", "mps", "xpu", "mtia"}
29
+ assert dtype in {torch.float16, torch.bfloat16, torch.float32}
30
+
31
+ self.device = device
32
+ self.dtype = dtype
33
+
34
+ self.decoder_model: T2SDecoderABC = decoder_model.to(self.device, self.dtype)
35
+
36
+ self.graphcache: CUDAGraphCacheABC = self.init_cache()
37
+
38
+ def _handle_request(self, request: T2SRequest):
39
+ with self.device:
40
+ decoder = self.decoder_model
41
+ session = T2SSession(decoder, request, device=self.device, dtype=self.dtype)
42
+ batch_idx = torch.arange(session.bsz)
43
+
44
+ t1 = 0.0
45
+ infer_speed = 0.0
46
+ infer_time = 0.0
47
+
48
+ torch_profiler = TorchProfiler(request.debug)
49
+ with (
50
+ torch_profiler.profiler(),
51
+ Progress(
52
+ TextColumn("[cyan]{task.description}"),
53
+ BarColumn(),
54
+ TextColumn("{task.completed}/{task.total} tokens"),
55
+ SpeedColumnToken(show_speed=True),
56
+ console=console,
57
+ transient=True,
58
+ ) as progress,
59
+ ):
60
+ max_token = int(min(2000 - session.input_pos.max(), 1500))
61
+ task = progress.add_task("T2S Decoding", total=max_token)
62
+
63
+ for idx in range(max_token):
64
+ progress.update(task, advance=1)
65
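+ # Step 0 runs the full prefill over text + audio prompt; subsequent steps decode one token at a time, optionally replaying a captured CUDA graph.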
+ if idx == 0:
66
+ session.kv_cache = decoder.init_cache(session.bsz)
67
+ xy_dec = decoder.h.prefill(session.xy_pos, session.kv_cache, session.attn_mask)
68
+ xy_dec = xy_dec[None, batch_idx, session.input_pos - 1]
69
+ else:
70
+ if (
71
+ request.use_cuda_graph
72
+ and session.graph is None
73
+ and self.graphcache.is_applicable
74
+ and torch.cuda.is_available()
75
+ ):
76
+ self.graphcache.assign_graph(session)
77
+
78
+ with torch_profiler.record("AR"):
79
+ if session.graph:
80
+ assert session.stream
81
+ session.stream.wait_stream(torch.cuda.default_stream())
82
+ with torch.cuda.stream(session.stream):
83
+ session.xy_pos_.copy_(session.xy_pos)
84
+ session.graph.replay()
85
+ xy_dec = session.xy_dec_.clone()
86
+ else:
87
+ args, kwds = decoder.pre_forward(session)
88
+ xy_dec = decoder.h(
89
+ session.input_pos,
90
+ session.xy_pos,
91
+ session.kv_cache,
92
+ *args,
93
+ **kwds,
94
+ )
95
+
96
+ with torch.cuda.stream(session.stream) if session.stream is not None else contextlib.nullcontext():
97
+ decoder.post_forward(idx, session)
98
+ logits = decoder.ar_predict_layer(xy_dec[:, -1])
99
+
100
+ if idx == 0:
101
+ logits[:, -1] = float("-inf")
102
+
103
+ with torch_profiler.record("Sampling"):
104
+ samples = session.sample(
105
+ logits=logits,
106
+ previous_tokens=session.y[:, : session.y_len + idx],
107
+ top_k=request.top_k,
108
+ top_p=request.top_p,
109
+ repetition_penalty=request.repetition_penalty,
110
+ temperature=request.temperature,
111
+ )
112
+ session.y[batch_idx, session.y_len + idx] = samples
113
+ session.input_pos.add_(1)
114
+
115
+ with torch_profiler.record("EOS"):
116
+ argmax_token = torch.argmax(logits, dim=-1)
117
+ sample_token = samples.squeeze(1)
118
+ EOS_mask = (argmax_token == decoder.EOS) | (sample_token == decoder.EOS)
119
+
120
+ newly_done_mask = EOS_mask & (~session.completed)
121
+ newly_done_indices = newly_done_mask.nonzero()
122
+
123
+ if newly_done_indices.numel() > 0:
124
+ for i in newly_done_indices:
125
+ session.y_results[i] = session.y[i, session.y_len : session.y_len + idx]
126
+ session.completed[newly_done_indices] = True
127
+
128
+ if torch.all(session.completed).item():
129
+ if session.y[:, session.y_len :].sum() == 0:
130
+ session.y_results = [torch.tensor(0) for _ in range(session.bsz)]
131
+ logger.error("Bad Zero Prediction")
132
+ else:
133
+ logger.info(
134
+ f"T2S Decoding EOS {session.prefill_len.tolist().__str__().strip('[]')} -> {[i.size(-1) for i in session.y_results].__str__().strip('[]')}"
135
+ )
136
+ logger.info(f"Infer Speed: {(idx - 1) / (time.perf_counter() - t1):.2f} token/s")
137
+ infer_time = time.perf_counter() - t1
138
+ infer_speed = (idx - 1) / infer_time
139
+ break
140
+
141
+ if (request.early_stop_num != -1 and idx >= request.early_stop_num) or idx == max_token - 1:
142
+ for i in range(session.bsz):
143
+ if not session.completed[i].item():
144
+ session.y_results[i] = session.y[i, session.y_len : session.y_len + 1499]
145
+ session.completed[i] = True
146
+ logger.error("Bad Full Prediction")
147
+ break
148
+
149
+ with torch_profiler.record("NextPos"):
150
+ y_emb = decoder.ar_audio_embedding(samples)
151
+ session.xy_pos = decoder.ar_audio_position(session.input_pos - session.x_lens, y_emb)
152
+
153
+ if idx == 1:
154
+ torch_profiler.start()
155
+ t1 = time.perf_counter()
156
+
157
+ if idx == 51:
158
+ torch_profiler.end()
159
+
160
+ if idx % 100 == 0:
161
+ match session.device.type:
162
+ case "cuda":
163
+ torch.cuda.empty_cache()
164
+ case "mps":
165
+ torch.mps.empty_cache()
166
+ case "xpu":
167
+ torch.xpu.empty_cache()
168
+ case "mtia":
169
+ torch.mtia.empty_cache()
170
+
171
+ match session.device.type:
172
+ case "cuda":
173
+ if session.stream is not None:
174
+ torch.cuda.current_stream().wait_stream(session.stream)
175
+ torch.cuda.empty_cache()
176
+ case "mps":
177
+ torch.mps.empty_cache()
178
+ case "xpu":
179
+ torch.xpu.empty_cache()
180
+ case "mtia":
181
+ torch.mtia.empty_cache()
182
+ case "cpu":
183
+ gc.collect()
184
+
185
+ torch_profiler.end()
186
+ if request.use_cuda_graph and torch.cuda.is_available():
187
+ self.graphcache.release_graph(session)
188
+
189
+ return session.y_results[: request.valid_length], infer_speed, infer_time
190
+
191
+ def generate(self, request: T2SRequest):
192
+ try:
193
+ result, infer_speed, infer_time = self._handle_request(request)
194
+ t2s_result = T2SResult(result=result, infer_speed=(infer_speed, infer_time), status="Success")
195
+ except Exception as e:
196
+ t2s_result = T2SResult(status="Error", exception=e, traceback=traceback.format_exc())
197
+ return t2s_result
198
+
199
+ @staticmethod
200
+ def load_decoder(weights_path: os.PathLike, max_batch_size: int = 1, backend: str = "Flash-Attn-Varlen-CUDAGraph"):
201
+ logger.info(f"Loading Text2Semantic Weights from {weights_path} with {backend} Backend")
202
+ module_path = f".backends.{backend.lower().replace('-', '_').replace('cudagraph', 'cuda_graph')}"
203
+ decoder_cls_name = "T2SDecoder"
204
+ decoder_mod = import_module(module_path, package=__package__)
205
+ decoder_cls: type[T2SDecoderABC] = getattr(decoder_mod, decoder_cls_name)
206
+ dict_s1 = torch.load(weights_path, map_location="cpu", weights_only=False, mmap=True)
207
+ config = dict_s1["config"]
208
+ decoder: T2SDecoderABC = decoder_cls(config, max_batch_size=max_batch_size)
209
+ state_dict = dict_s1["weight"]
210
+ decoder.load_state_dict(state_dict)
211
+
212
+ return decoder.eval()
213
+
214
+ def init_cache(self):
215
+ assert self.decoder_model
216
+
217
+ module_name = self.decoder_model.__class__.__module__
218
+ module = sys.modules.get(module_name)
219
+ assert module
220
+
221
+ target_class: type[CUDAGraphCacheABC] = getattr(module, "CUDAGraphCache")
222
+
223
+ return target_class(self.decoder_model)
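A hedged usage sketch of the engine defined above: load_decoder reads an s1 checkpoint (a dict with "config" and "weight" keys) and selects a backend module by name, and generate wraps _handle_request into a T2SResult. The checkpoint path and the commented-out request fields are assumptions; the exact T2SRequest signature is defined in structs.py.

    import torch
    from GPT_SoVITS.Accelerate.PyTorch.t2s_engine import T2SEngine

    # "Torch-Varlen" maps to .backends.torch_varlen via the name transform in load_decoder
    decoder = T2SEngine.load_decoder("pretrained/s1.ckpt", max_batch_size=1, backend="Torch-Varlen")
    engine = T2SEngine(decoder, device=torch.device("cpu"), dtype=torch.float32)

    # request = T2SRequest(...)          # phoneme ids, prompt semantics, BERT features, sampling params
    # result = engine.generate(request)  # returns T2SResult; result.status is "Success" or "Error"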
GPT_SoVITS/Accelerate/PyTorch/t2s_model_abc.py ADDED
@@ -0,0 +1,672 @@
1
+ """
2
+ Modified From https://github.com/XXXXRT666/GPT-SoVITS
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import math
8
+ import os
9
+ import random
10
+ from abc import ABC, abstractmethod
11
+ from contextlib import nullcontext
12
+ from typing import MutableSequence
13
+
14
+ import torch
15
+ import torch._inductor.config
16
+ import torch.nn.functional as F
17
+ from torch.cuda.graphs import CUDAGraph
18
+ from torch.profiler import ProfilerAction, tensorboard_trace_handler
19
+
20
+ from . import nn
21
+ from .structs import KVCacheProtocol, T2SDecoderProtocol, T2SSession
22
+
23
+ Tensor = torch.Tensor
24
+
25
+
26
+ class TokenEmbedding(nn.Module):
27
+ def __init__(
28
+ self,
29
+ embedding_dim: int,
30
+ vocab_size: int,
31
+ ):
32
+ super().__init__()
33
+
34
+ self.vocab_size = vocab_size
35
+ self.embedding_dim = embedding_dim
36
+
37
+ self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
38
+
39
+ @property
40
+ def weight(self) -> Tensor:
41
+ return self.word_embeddings.weight
42
+
43
+ def embedding(self, index: int) -> Tensor:
44
+ return self.word_embeddings.weight[index : index + 1]
45
+
46
+ def __call__(self, x: Tensor):
47
+ x = self.word_embeddings(x)
48
+ return x
49
+
50
+
51
+ class SinePositionalEmbedding(nn.Module):
52
+ def __init__(
53
+ self,
54
+ embedding_dim: int,
55
+ scale: bool = False,
56
+ alpha: bool = False,
57
+ max_batch_size: int = 10,
58
+ max_seq_len: int = 2000,
59
+ ):
60
+ super().__init__()
61
+ self.embedding_dim = embedding_dim
62
+ self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
63
+ self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
64
+ self.max_batch_size = max_batch_size
65
+ self.max_seq_len = max_seq_len
66
+
67
+ self.reverse = False
68
+ self.register_buffer("pe", torch.zeros(max_batch_size, max_seq_len, embedding_dim), persistent=False)
69
+ self.pe: torch.Tensor
70
+ self.compute_pe()
71
+
72
+ def compute_pe(self):
73
+ """Reset the positional encodings."""
74
+ if self.reverse:
75
+ position = torch.arange(self.max_seq_len - 1, -1, -1.0, dtype=torch.float32).unsqueeze(1)
76
+ else:
77
+ position = torch.arange(self.max_seq_len, dtype=torch.float32).unsqueeze(1)
78
+ div_term = torch.exp(
79
+ torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) * -(math.log(10000.0) / self.embedding_dim)
80
+ )
81
+ pe = self.pe
82
+ pe[:, :, 0::2] = torch.sin(position * div_term)
83
+ pe[:, :, 1::2] = torch.cos(position * div_term)
84
+
85
+ def __call__(self, input_pos: Tensor, x: Tensor) -> Tensor:
86
+ """
87
+ Args:
88
+ input_pos (Tensor): [batch_size, ]
89
+ x (Tensor): [batch_size, 1, embed_dim]
90
+
91
+ Returns:
92
+ embedded_x (Tensor): [batch_size, 1, embed_dim]
93
+ """
94
+
95
+ batch_size = x.shape[0]
96
+ pe_values = self.pe[torch.arange(batch_size), input_pos - 1] # (batch_size, embed_dim)
97
+
98
+ return x * self.x_scale + self.alpha * pe_values.unsqueeze(1) # (batch_size, 1, embed_dim)
99
+
100
+ def prefill(self, x: Tensor) -> Tensor:
101
+ """
102
+ Args:
103
+ x (Tensor): [batch_size, seq_len, embed_dim]
104
+
105
+ Returns:
106
+ embedded_x (Tensor): [batch_size, seq_len, embed_dim]
107
+ """
108
+
109
+ batch_size = x.shape[0]
110
+ pe_values = self.pe[:batch_size, : x.shape[-2]]
111
+ return x * self.x_scale + self.alpha * pe_values
112
+
113
+
114
+ class KVCacheABC(nn.Module, ABC, KVCacheProtocol):
115
+ def __init__(self, batch_size: int, max_seq_length: int, n_heads: int, head_dim: int) -> None:
116
+ super().__init__()
117
+
118
+ self.n_head = n_heads
119
+ self.head_dim = head_dim
120
+ self.batch_size = batch_size
121
+ self.max_seq_length = max_seq_length
122
+
123
+ self.k_cache: Tensor
124
+ self.v_cache: Tensor
125
+
126
+ def empty(self):
127
+ self.k_cache.zero_()
128
+ self.v_cache.zero_()
129
+
130
+ @abstractmethod
131
+ def update(self, input_pos: Tensor, k_val: Tensor, v_val: Tensor, *args, **kwds) -> tuple[Tensor, Tensor]: ...
132
+
133
+ @abstractmethod
134
+ def prefill_kv(self, k_val: Tensor, v_val: Tensor) -> None: ...
135
+
136
+ def sync_cache(self, kv_cache: KVCacheProtocol):
137
+ self.k_cache.copy_(kv_cache.k_cache)
138
+ self.v_cache.copy_(kv_cache.v_cache)
139
+
140
+
141
+ class KVCacheNHD(KVCacheABC):
142
+ def __init__(self, batch_size, max_seq_length, n_heads, head_dim):
143
+ super().__init__(batch_size, max_seq_length, n_heads, head_dim)
144
+
145
+ assert batch_size > 0
146
+ cache_shape = (batch_size, max_seq_length, n_heads, head_dim)
147
+
148
+ self.register_buffer("k_cache", torch.zeros(size=cache_shape), persistent=False)
149
+ self.register_buffer("v_cache", torch.zeros(size=cache_shape), persistent=False)
150
+
151
+ def update(self, input_pos: Tensor, k_val: Tensor, v_val: Tensor):
152
+ # input_pos: [B, ], k_val: [B, 1, H, D]
153
+
154
+ index = (
155
+ (input_pos - 1)
156
+ .unsqueeze(-1)
157
+ .unsqueeze(-1)
158
+ .unsqueeze(-1)
159
+ .expand(
160
+ -1,
161
+ -1,
162
+ self.n_head,
163
+ self.head_dim,
164
+ )
165
+ .to(torch.int64)
166
+ ) # (bs, 1, num_head, head_dim)
167
+
168
+ k_out = self.k_cache
169
+ v_out = self.v_cache
170
+ k_out.scatter_(1, index, k_val)
171
+ v_out.scatter_(1, index, v_val)
172
+
173
+ return k_out, v_out
174
+
175
+ def empty(self):
176
+ self.k_cache.zero_()
177
+ self.v_cache.zero_()
178
+
179
+ def prefill_kv(self, k_val: Tensor, v_val: Tensor):
180
+ # input_pos: int, k_val: [B, S, H, D]
181
+
182
+ self.k_cache[:, : k_val.shape[1]] = k_val
183
+ self.v_cache[:, : v_val.shape[1]] = v_val
184
+
185
+
186
+ class KVCacheHND(KVCacheABC):
187
+ def __init__(self, batch_size, max_seq_length, n_heads, head_dim):
188
+ super().__init__(batch_size, max_seq_length, n_heads, head_dim)
189
+
190
+ cache_shape = (batch_size, n_heads, max_seq_length, head_dim)
191
+
192
+ self.register_buffer("k_cache", torch.zeros(size=cache_shape), persistent=False)
193
+ self.register_buffer("v_cache", torch.zeros(size=cache_shape), persistent=False)
194
+
195
+ def update(self, input_pos: Tensor, k_val: Tensor, v_val: Tensor):
196
+ # input_pos: [B, ], k_val: [B, H, 1, D]
197
+
198
+ index = (
199
+ (input_pos - 1)
200
+ .unsqueeze(-1)
201
+ .unsqueeze(-1)
202
+ .unsqueeze(-1)
203
+ .expand(
204
+ -1,
205
+ self.n_head,
206
+ -1,
207
+ self.head_dim,
208
+ )
209
+ .to(torch.int64)
210
+ ) # (bs, num_head, 1, head_dim)
211
+
212
+ k_out = self.k_cache
213
+ v_out = self.v_cache
214
+ k_out.scatter_(2, index, k_val)
215
+ v_out.scatter_(2, index, v_val)
216
+
217
+ return k_out, v_out
218
+
219
+ def empty(self):
220
+ self.k_cache.zero_()
221
+ self.v_cache.zero_()
222
+
223
+ def prefill_kv(self, k_val: Tensor, v_val: Tensor):
224
+ # input_pos: int, k_val: [B, S, H, D]
225
+
226
+ self.k_cache[..., : k_val.shape[1], :] = k_val.transpose(1, 2)
227
+ self.v_cache[..., : v_val.shape[1], :] = v_val.transpose(1, 2)
228
+
229
+
230
+ class KVCacheHNDVarlen(KVCacheABC):
231
+ def __init__(self, batch_size, max_seq_length, n_heads, head_dim):
232
+ super().__init__(batch_size, max_seq_length, n_heads, head_dim)
233
+
234
+ cache_shape = (batch_size, n_heads, max_seq_length, head_dim)
235
+ self.cache_idx: Tensor
236
+
237
+ self.register_buffer("cache_idx", torch.arange(batch_size), persistent=False)
238
+ self.register_buffer("k_cache", torch.zeros(size=cache_shape), persistent=False)
239
+ self.register_buffer("v_cache", torch.zeros(size=cache_shape), persistent=False)
240
+
241
+ def update(self, input_pos: Tensor, k_val: Tensor, v_val: Tensor):
242
+ # input_pos: [B, ], k_val: [B, H, 1, D]
243
+
244
+ k_out = self.k_cache
245
+ v_out = self.v_cache
246
+
247
+ ip0 = input_pos - 1
248
+
249
+ k_out[self.cache_idx, :, ip0, None] = k_val
250
+ v_out[self.cache_idx, :, ip0, None] = v_val
251
+
252
+ return k_out, v_out
253
+
254
+ def empty(self):
255
+ self.k_cache.zero_()
256
+ self.v_cache.zero_()
257
+
258
+ def prefill_kv(self, k_val: Tensor, v_val: Tensor):
259
+ # input_pos: int, k_val: [B, S, H, D]
260
+
261
+ self.k_cache[..., : k_val.shape[1], :] = k_val.transpose(1, 2)
262
+ self.v_cache[..., : v_val.shape[1], :] = v_val.transpose(1, 2)
263
+
264
+
265
+ class AttentionABC(nn.Module, ABC):
266
+ def __init__(self, n_head: int, hidden_dim: int, max_seq_length: int):
267
+ super().__init__()
268
+
269
+ self.n_head = n_head
270
+ self.hidden_dim = hidden_dim
271
+ assert hidden_dim % n_head == 0
272
+ self.head_dim = hidden_dim // n_head
273
+
274
+ self.max_seq_length = max_seq_length
275
+
276
+ # key, query, value projections for all heads, but in a batch
277
+ self.in_proj: nn.Linear
278
+ self.out_proj: nn.Linear
279
+
280
+ self._register_load_state_dict_pre_hook(self.load_hook)
281
+
282
+ def load_hook(self, state_dict: dict[str, Tensor], prefix, *args):
283
+ keys_to_modify = [key for key in state_dict if "in_proj_" in key]
284
+ for key in keys_to_modify:
285
+ new_key = key.replace("in_proj_", "in_proj.") # in_proj_ -> in_proj.
286
+ state_dict[new_key] = state_dict.pop(key)
287
+
288
+ @abstractmethod
289
+ def __call__(self, x: Tensor, input_pos: Tensor, kv_cache: KVCacheProtocol, *args, **kwds) -> Tensor: ...
290
+
291
+ def prefill(self, x: Tensor, kv_cache: KVCacheProtocol, attn_mask: Tensor) -> Tensor:
292
+ bsz, seqlen, _ = x.shape
293
+
294
+ q, k, v = self.in_proj(x).chunk(3, dim=-1)
295
+
296
+ q, k, v = map(lambda x: x.contiguous().view(bsz, seqlen, self.n_head, self.head_dim), (q, k, v))
297
+
298
+ kv_cache.prefill_kv(k, v)
299
+
300
+ q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
301
+
302
+ attn = F.scaled_dot_product_attention(q, k, v, attn_mask)
303
+
304
+ attn = attn.transpose(1, 2).contiguous().view(1, -1, self.hidden_dim)
305
+
306
+ output = self.out_proj(attn)
307
+
308
+ return output
309
+
310
+
311
+ class FeedForward(nn.Module):
312
+ def __init__(self, dim: int, hidden_dim: int) -> None:
313
+ super().__init__()
314
+
315
+ self.linear1 = nn.Linear(dim, hidden_dim, bias=True)
316
+ self.linear2 = nn.Linear(hidden_dim, dim, bias=True)
317
+
318
+ def __call__(self, x: Tensor):
319
+ return self.linear2(F.relu(self.linear1(x), inplace=True))
320
+
321
+
322
+ class TransformerBlockABC(nn.Module, ABC):
323
+ def __init__(self, n_head: int, ffn_dim: int, hidden_dim: int, max_seq_length: int) -> None:
324
+ super().__init__()
325
+
326
+ self.hidden_dim = hidden_dim
327
+ self.max_seq_length = max_seq_length
328
+
329
+ self.attention: AttentionABC
330
+ self.feed_forward: FeedForward
331
+ self.attention_norm: nn.LayerNorm
332
+ self.ffn_norm: nn.LayerNorm
333
+
334
+ self._register_load_state_dict_pre_hook(self.load_hook)
335
+
336
+ def load_hook(self, state_dict: dict[str, Tensor], prefix, *args):
337
+ for key in list(state_dict.keys()):
338
+ new_key = (
339
+ key.replace("self_attn", "attention")
340
+ .replace("linear", "feed_forward.linear")
341
+ .replace("norm1", "attention_norm")
342
+ .replace("norm2", "ffn_norm")
343
+ )
344
+ state_dict[new_key] = state_dict.pop(key)
345
+
346
+ def __call__(self, x: Tensor, input_pos: Tensor, kv_cache: KVCacheProtocol, *args, **kwds):
347
+ h = self.attention_norm(
348
+ x
349
+ + self.attention(
350
+ x,
351
+ input_pos,
352
+ kv_cache,
353
+ *args,
354
+ **kwds,
355
+ )
356
+ )
357
+ out = self.ffn_norm(h + self.feed_forward(h))
358
+ return out
359
+
360
+ def prefill(
361
+ self,
362
+ x: Tensor,
363
+ kv_cache: KVCacheProtocol,
364
+ attn_mask: Tensor,
365
+ ) -> Tensor:
366
+ h = self.attention_norm(
367
+ x
368
+ + self.attention.prefill(
369
+ x,
370
+ kv_cache,
371
+ attn_mask,
372
+ )
373
+ )
374
+ out = self.ffn_norm(h + self.feed_forward(h))
375
+ return out
376
+
377
+
378
+ class TransformerDecoderABC(nn.Module, ABC):
379
+ def __init__(
380
+ self,
381
+ hidden_dim: int,
382
+ n_layer: int,
383
+ n_head: int,
384
+ ffn_dim: int,
385
+ vocab_size: int,
386
+ max_seq_length: int,
387
+ max_batch_size: int,
388
+ ) -> None:
389
+ super().__init__()
390
+
391
+ self.hidden_dim = hidden_dim
392
+ self.n_head = n_head
393
+ assert hidden_dim % n_head == 0
394
+
395
+ self.head_dim = hidden_dim // n_head
396
+ self.vocab_size = vocab_size
397
+
398
+ self.n_layer = n_layer
399
+
400
+ self.layers: MutableSequence[TransformerBlockABC]
401
+
402
+ self.max_seq_length = max_seq_length
403
+ self.max_batch_size = max_batch_size
404
+
405
+ def __call__(self, input_pos: Tensor, x: Tensor, kv_caches: MutableSequence[KVCacheProtocol], *args, **kwds):
406
+ for layer, kv_cache in zip(self.layers, kv_caches):
407
+ x = layer(x, input_pos, kv_cache, *args, **kwds)
408
+ return x
409
+
410
+ def prefill(self, x: Tensor, kv_caches: MutableSequence[KVCacheProtocol], attn_mask: Tensor):
411
+ for layer, kv_cache in zip(self.layers, kv_caches):
412
+ x = layer.prefill(x, kv_cache, attn_mask)
413
+ return x
414
+
415
+
416
+ class T2SDecoderABC(nn.Module, ABC, T2SDecoderProtocol):
417
+ def __init__(
418
+ self,
419
+ config: dict,
420
+ max_seq_length: int = 2000,
421
+ max_batch_size: int = 10,
422
+ ) -> None:
423
+ super().__init__()
424
+
425
+ hidden_dim: int = config["model"]["hidden_dim"]
426
+ embedding_dim: int = config["model"]["embedding_dim"]
427
+ n_head: int = config["model"]["head"]
428
+ n_layer: int = config["model"]["n_layer"]
429
+ vocab_size: int = config["model"]["vocab_size"]
430
+ phoneme_vocab_size: int = config["model"]["phoneme_vocab_size"]
431
+ EOS: int = config["model"]["EOS"]
432
+ ffn_dim: int = hidden_dim * 4
433
+
434
+ self.n_layer = int(n_layer)
435
+ self.hidden_dim = int(hidden_dim)
436
+ self.n_head = int(n_head)
437
+ assert hidden_dim % n_head == 0
438
+
439
+ self.head_dim = int(hidden_dim // n_head)
440
+ self.embedding_dim = int(embedding_dim)
441
+ self.ffn_dim = int(ffn_dim)
442
+ self.vocab_size = int(vocab_size)
443
+ self.phoneme_vocab_size = int(phoneme_vocab_size)
444
+ self.max_seq_length = max_seq_length
445
+ self.max_batch_size = max_batch_size
446
+ self.EOS = EOS
447
+ assert self.EOS == self.vocab_size - 1
448
+
449
+ self.bert_proj: nn.Linear
450
+ self.ar_predict_layer: nn.Linear
451
+ self.h: TransformerDecoderABC
452
+
453
+ self.kv_class: type[KVCacheABC]
454
+
455
+ self.GraphCache: CUDAGraphCacheABC | None
456
+
457
+ self.ar_text_embedding = TokenEmbedding(self.embedding_dim, self.phoneme_vocab_size)
458
+ self.ar_text_position = SinePositionalEmbedding(
459
+ self.embedding_dim,
460
+ scale=False,
461
+ alpha=True,
462
+ max_batch_size=max_batch_size,
463
+ max_seq_len=max_seq_length,
464
+ )
465
+ self.ar_audio_embedding = TokenEmbedding(self.embedding_dim, self.vocab_size)
466
+ self.ar_audio_position = SinePositionalEmbedding(
467
+ self.embedding_dim,
468
+ scale=False,
469
+ alpha=True,
470
+ max_batch_size=max_batch_size,
471
+ max_seq_len=max_seq_length,
472
+ )
473
+
474
+ self._register_load_state_dict_pre_hook(self.load_hook)
475
+
476
+ def load_hook(self, state_dict: dict[str, Tensor], prefix, *args):
477
+ model_keys = [key for key in state_dict if key.startswith("model.")]
478
+ for key in model_keys:
479
+ new_key = key[len("model.") :]
480
+ state_dict[new_key] = state_dict.pop(key)
481
+
482
+ def init_cache(self, bsz: int = 0) -> MutableSequence[KVCacheProtocol]:
483
+ bsz = bsz or self.h.max_batch_size
484
+ assert bsz <= self.h.max_batch_size
485
+ seq_lens = self.h.max_seq_length
486
+ dtype = self.bert_proj.bias.dtype
487
+ kvclass = self.kv_class
488
+
489
+ return nn.ModuleList(
490
+ [kvclass(bsz, seq_lens, self.n_head, self.head_dim) for _ in range(self.n_layer)],
491
+ ).to(self.device, dtype) # type: ignore
492
+
493
+ def embed(
494
+ self,
495
+ x: list[torch.Tensor],
496
+ y: torch.Tensor,
497
+ bert_features: list[torch.Tensor],
498
+ ):
499
+ x_len: list[int] = [i.shape[0] for i in x]
500
+ x_len_max = max(x_len)
501
+ xy_pos = torch.zeros((len(x), x_len_max + y.shape[1], self.embedding_dim)).to(bert_features[0].dtype)
502
+
503
+ bert_features = list(map(lambda x: x.transpose(0, 1), bert_features))
504
+
505
+ y_len = y.shape[1]
506
+ y_emb = self.ar_audio_embedding(y)
507
+ y_pos = self.ar_audio_position.prefill(y_emb)
508
+
509
+ for bs, (x_, len_, bert_feature) in enumerate(zip(x, x_len, bert_features)):
510
+ x_emb = self.ar_text_embedding(x_)
511
+ bert = self.bert_proj(bert_feature)
512
+ x_emb = x_emb + bert
513
+ x_pos = self.ar_text_position.prefill(x_emb.unsqueeze(0))
514
+ xy_pos[[bs], :len_] = x_pos
515
+ xy_pos[[bs], len_ : len_ + y_len] = y_pos
516
+
517
+ return xy_pos
518
+
519
+ def compile(self, *args, **kwds):
520
+ # Experimental features to reduce compilation times, will be on by default in future
521
+ torch._inductor.config.triton.cudagraph_skip_dynamic_graphs = True
522
+ torch._inductor.config.coordinate_descent_tuning = True
523
+ torch._inductor.config.triton.unique_kernel_names = True
524
+ torch._inductor.config.fx_graph_cache = True
525
+ torch._inductor.config.triton.cudagraph_trees = True
526
+ torch._inductor.config.triton.cudagraph_support_input_mutation = True
527
+ self.h.compile(fullgraph=True, mode="reduce-overhead")
528
+
529
+ def capture(
530
+ self, input_pos: Tensor, x: Tensor, x_dec: Tensor, kv_caches: MutableSequence[KVCacheProtocol], *args, **kwds
531
+ ) -> CUDAGraph:
532
+ assert torch.cuda.is_available()
533
+ s = torch.cuda.Stream()
534
+ s.wait_stream(torch.cuda.current_stream())
535
+
536
+ graph = torch.cuda.CUDAGraph()
537
+
538
+ with torch.cuda.stream(s):
539
+ for _ in range(5):
540
+ self.h(input_pos, x, kv_caches, *args, **kwds)
541
+ torch.cuda.current_stream().wait_stream(s)
542
+
543
+ with torch.cuda.graph(graph):
544
+ x_dec.copy_(self.h(input_pos, x, kv_caches, *args, **kwds))
545
+ torch.cuda.synchronize()
546
+
547
+ return graph
548
+
549
+ @abstractmethod
550
+ def pre_forward(self, session: T2SSession) -> tuple[list[Tensor], dict[str, Tensor]]:
551
+ return list(), dict()
552
+
553
+ @abstractmethod
554
+ def post_forward(self, idx: int, session: T2SSession) -> None:
555
+ return
556
+
557
+
558
+ class CUDAGraphCacheABC(ABC):
559
+ def __init__(
560
+ self,
561
+ decoder: T2SDecoderABC,
562
+ ) -> None:
563
+ self.is_applicable: bool
564
+
565
+ if torch.cuda.is_available() and self.is_applicable:
566
+ self.device: torch.device = decoder.device
567
+ self.dtype = decoder.bert_proj.bias.dtype
568
+
569
+ self.assigned: bool = False
570
+
571
+ self.decoder: T2SDecoderABC = decoder
572
+ self.kv_cache: MutableSequence[KVCacheProtocol] = decoder.init_cache(decoder.max_batch_size)
573
+ self.xy_pos = torch.rand(size=(decoder.max_batch_size, 1, decoder.embedding_dim), device=self.device).to(
574
+ self.dtype
575
+ )
576
+ self.xy_dec = self.xy_pos.clone()
577
+
578
+ self.input_pos = torch.tensor([10] * decoder.max_batch_size, device=self.device).int()
579
+ self.graph: torch.cuda.CUDAGraph | None = None
580
+ self.stream: torch.cuda.Stream | None
581
+
582
+ self.id: int = random.randint(1, 2**32 - 1)
583
+
584
+ def assign_graph(self, session: T2SSession):
585
+ if self.graph is None:
586
+ args, kwds = self.decoder.pre_forward(session)
587
+ graph = self.decoder.capture(self.input_pos, self.xy_pos, self.xy_dec, self.kv_cache, *args, **kwds)
588
+ self.graph = graph
589
+ self.stream = torch.cuda.Stream()
590
+
591
+ if self.assigned is False:
592
+ self.get_cache_graph(session)
593
+ session.id = self.id
594
+ self.assigned = True
595
+ else:
596
+ self.capture_new_graph(session)
597
+
598
+ @abstractmethod
599
+ def release_graph(self, session: T2SSession): ...
600
+
601
+ @abstractmethod
602
+ def get_cache_graph(self, session: T2SSession):
603
+ pass
604
+
605
+ @abstractmethod
606
+ def capture_new_graph(self, session: T2SSession):
607
+ pass
608
+
609
+
610
+ class TorchProfiler:
611
+ def __init__(self, debug: bool, log_dir: str = "./profiler") -> None:
612
+ self.debug = debug
613
+ self.log_dir = log_dir
614
+ self.__profiler: torch.profiler.profile
615
+
616
+ if self.debug and not os.path.exists(self.log_dir):
617
+ os.makedirs(self.log_dir)
618
+
619
+ self.tensorboard_handler = tensorboard_trace_handler(self.log_dir)
620
+
621
+ def profiler_callback(self, prof: torch.profiler.profile):
622
+ print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=30))
623
+ print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=30))
624
+ self.tensorboard_handler(prof)
625
+
626
+ @staticmethod
627
+ def three_step_schedule(step: int) -> ProfilerAction:
628
+ if step == 0:
629
+ return ProfilerAction.NONE
630
+ elif step == 1:
631
+ return ProfilerAction.RECORD
632
+ elif step == 2:
633
+ return ProfilerAction.RECORD_AND_SAVE
634
+ else:
635
+ return ProfilerAction.NONE
636
+
637
+ def start(self):
638
+ if not self.debug:
639
+ return
640
+ assert self.__profiler is not None
641
+ self.__profiler.step()
642
+
643
+ def end(self):
644
+ if not self.debug:
645
+ return
646
+ assert self.__profiler is not None
647
+ self.__profiler.step()
648
+
649
+ def profiler(self):
650
+ if self.debug:
651
+ activities_list = [torch.profiler.ProfilerActivity.CPU]
652
+ if torch.cuda.is_available():
653
+ activities_list.append(torch.profiler.ProfilerActivity.CUDA)
654
+
655
+ self.__profiler = torch.profiler.profile(
656
+ activities=activities_list,
657
+ record_shapes=True,
658
+ with_stack=True,
659
+ with_modules=True,
660
+ profile_memory=True,
661
+ schedule=self.three_step_schedule,
662
+ on_trace_ready=self.profiler_callback,
663
+ )
664
+ return self.__profiler
665
+ else:
666
+ return nullcontext()
667
+
668
+ def record(self, func_name: str):
669
+ if self.debug:
670
+ return torch.profiler.record_function(func_name)
671
+ else:
672
+ return nullcontext()
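A small sketch of the KV-cache layout used above, assuming the local nn wrappers behave like torch.nn: KVCacheNHD.update writes the single new key/value produced at position input_pos into slot input_pos - 1 of a (B, S, H, D) buffer via scatter_.

    import torch
    from GPT_SoVITS.Accelerate.PyTorch.t2s_model_abc import KVCacheNHD

    cache = KVCacheNHD(batch_size=2, max_seq_length=16, n_heads=4, head_dim=8)
    input_pos = torch.tensor([5, 9])              # 1-based positions of the tokens just produced
    k = torch.randn(2, 1, 4, 8)                   # [B, 1, H, D]
    v = torch.randn(2, 1, 4, 8)
    k_out, v_out = cache.update(input_pos, k, v)
    assert torch.equal(k_out[0, 4], k[0, 0])      # request 0 landed in slot 4
    assert torch.equal(k_out[1, 8], k[1, 0])      # request 1 landed in slot 8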
GPT_SoVITS/Accelerate/__init__.py ADDED
@@ -0,0 +1,30 @@
1
+ from . import MLX, PyTorch
2
+ from .logger import console, logger, tb
3
+ from .PyTorch import T2SEngineTorch, T2SRequest, T2SResult
4
+ from .PyTorch.structs import T2SEngineProtocol
5
+
6
+ backends = PyTorch.backends + MLX.backends
7
+
8
+ backends = [
9
+ b.replace("_", "-")
10
+ .title()
11
+ .replace("Mlx", "MLX")
12
+ .replace("Mps", "MPS")
13
+ .replace("Cuda", "CUDA")
14
+ .replace("Mxfp4", "MXFP4")
15
+ for b in backends
16
+ ]
17
+
18
+
19
+ __all__ = [
20
+ "T2SEngineTorch",
21
+ "T2SRequest",
22
+ "T2SResult",
23
+ "backends",
24
+ "MLX",
25
+ "PyTorch",
26
+ "logger",
27
+ "console",
28
+ "tb",
29
+ "T2SEngineProtocol",
30
+ ]
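The list comprehension above turns backend module names into display names, and T2SEngine.load_decoder applies the inverse transform to recover the module path. A worked example, assuming PyTorch.backends lists module names such as "flash_attn_varlen_cuda_graph" (only the replacements relevant to this name are shown):

    name = "flash_attn_varlen_cuda_graph"
    display = name.replace("_", "-").title().replace("Cuda", "CUDA")
    # display == "Flash-Attn-Varlen-CUDA-Graph"
    module = f".backends.{display.lower().replace('-', '_').replace('cudagraph', 'cuda_graph')}"
    # module == ".backends.flash_attn_varlen_cuda_graph"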
GPT_SoVITS/Accelerate/logger.py ADDED
@@ -0,0 +1,203 @@
1
+ import sys
2
+ from typing import Optional
3
+
4
+ from loguru import logger
5
+ from rich.console import Console, JustifyMethod
6
+ from rich.highlighter import Highlighter
7
+ from rich.logging import RichHandler
8
+ from rich.progress import Task, TextColumn
9
+ from rich.style import StyleType
10
+ from rich.table import Column
11
+ from rich.text import Text
12
+ from rich.traceback import Traceback, install
13
+
14
+ console = Console(stderr=False)
15
+ install(console=console)
16
+
17
+
18
+ def loguru_format(record):
19
+ level = record["level"].name
20
+ color = {
21
+ "DEBUG": "green",
22
+ "INFO": "blue",
23
+ "WARNING": "yellow",
24
+ "ERROR": "red",
25
+ "CRITICAL": "bright_red",
26
+ }.get(level, "white")
27
+
28
+ return f"[bold {color}][{level}][/bold {color}] " + "{message}"
29
+
30
+
31
+ handler_with_locals = RichHandler(
32
+ console=console,
33
+ show_time=False,
34
+ show_path=False,
35
+ rich_tracebacks=True,
36
+ tracebacks_show_locals=True,
37
+ show_level=False,
38
+ markup=True,
39
+ )
40
+ handler_without_locals = RichHandler(
41
+ console=console,
42
+ show_time=False,
43
+ show_path=False,
44
+ rich_tracebacks=True,
45
+ tracebacks_show_locals=False,
46
+ show_level=False,
47
+ markup=True,
48
+ )
49
+
50
+
51
+ def local_filter(r):
52
+ return r["extra"].get("show_locals", True)
53
+
54
+
55
+ logger.remove()
56
+ logger.add(handler_with_locals, format=loguru_format, filter=local_filter)
57
+ logger.add(handler_without_locals, format=loguru_format, filter=lambda x: not local_filter(x))
58
+
59
+
60
+ class SpeedColumnToken(TextColumn):
61
+ """Show the task speed in tokens per second (falls back to the percentage format when show_speed is False).
62
+
63
+ Args:
64
+ text_format (str, optional): Format for percentage display. Defaults to "[progress.percentage]{task.percentage:>3.0f}%".
65
+ text_format_no_percentage (str, optional): Format if percentage is unknown. Defaults to "".
66
+ style (StyleType, optional): Style of output. Defaults to "none".
67
+ justify (JustifyMethod, optional): Text justification. Defaults to "left".
68
+ markup (bool, optional): Enable markup. Defaults to True.
69
+ highlighter (Optional[Highlighter], optional): Highlighter to apply to output. Defaults to None.
70
+ table_column (Optional[Column], optional): Table Column to use. Defaults to None.
71
+ show_speed (bool, optional): Show the task speed instead of the percentage. Defaults to True.
72
+ """
73
+
74
+ def __init__(
75
+ self,
76
+ text_format: str = "[progress.percentage]{task.percentage:>3.0f}%",
77
+ text_format_no_percentage: str = "",
78
+ style: StyleType = "none",
79
+ justify: JustifyMethod = "left",
80
+ markup: bool = True,
81
+ highlighter: Optional[Highlighter] = None,
82
+ table_column: Optional[Column] = None,
83
+ show_speed: bool = True,
84
+ ) -> None:
85
+ self.text_format_no_percentage = text_format_no_percentage
86
+ self.show_speed = show_speed
87
+ super().__init__(
88
+ text_format=text_format,
89
+ style=style,
90
+ justify=justify,
91
+ markup=markup,
92
+ highlighter=highlighter,
93
+ table_column=table_column,
94
+ )
95
+
96
+ @classmethod
97
+ def render_speed(cls, speed: Optional[float]) -> Text:
98
+ """Render the speed in tokens per second.
99
+
100
+ Args:
101
+ speed (Optional[float]): Task speed, in tokens per second, or None if unknown.
102
+
103
+ Returns:
104
+ Text: Text object containing the task speed.
105
+ """
106
+ if speed is None:
107
+ return Text("", style="progress.percentage")
108
+ return Text(f"{speed:.1f} token/s", style="progress.percentage")
109
+
110
+ def render(self, task: Task) -> Text:
111
+ if self.show_speed:
112
+ return self.render_speed(task.finished_speed or task.speed)
113
+ text_format = self.text_format_no_percentage if task.total is None else self.text_format
114
+ _text = text_format.format(task=task)
115
+ if self.markup:
116
+ text = Text.from_markup(_text, style=self.style, justify=self.justify)
117
+ else:
118
+ text = Text(_text, style=self.style, justify=self.justify)
119
+ if self.highlighter:
120
+ self.highlighter.highlight(text)
121
+ return text
122
+
123
+
124
+ class SpeedColumnIteration(TextColumn):
125
+ """Show the task speed in iterations per second (falls back to the percentage format when show_speed is False).
126
+
127
+ Args:
128
+ text_format (str, optional): Format for percentage display. Defaults to "[progress.percentage]{task.percentage:>3.0f}%".
129
+ text_format_no_percentage (str, optional): Format if percentage is unknown. Defaults to "".
130
+ style (StyleType, optional): Style of output. Defaults to "none".
131
+ justify (JustifyMethod, optional): Text justification. Defaults to "left".
132
+ markup (bool, optional): Enable markup. Defaults to True.
133
+ highlighter (Optional[Highlighter], optional): Highlighter to apply to output. Defaults to None.
134
+ table_column (Optional[Column], optional): Table Column to use. Defaults to None.
135
+ show_speed (bool, optional): Show the task speed instead of the percentage. Defaults to True.
136
+ """
137
+
138
+ def __init__(
139
+ self,
140
+ text_format: str = "[progress.percentage]{task.percentage:>3.0f}%",
141
+ text_format_no_percentage: str = "",
142
+ style: StyleType = "none",
143
+ justify: JustifyMethod = "left",
144
+ markup: bool = True,
145
+ highlighter: Optional[Highlighter] = None,
146
+ table_column: Optional[Column] = None,
147
+ show_speed: bool = True,
148
+ ) -> None:
149
+ self.text_format_no_percentage = text_format_no_percentage
150
+ self.show_speed = show_speed
151
+ super().__init__(
152
+ text_format=text_format,
153
+ style=style,
154
+ justify=justify,
155
+ markup=markup,
156
+ highlighter=highlighter,
157
+ table_column=table_column,
158
+ )
159
+
160
+ @classmethod
161
+ def render_speed(cls, speed: Optional[float]) -> Text:
162
+ """Render the speed in iterations per second.
163
+
164
+ Args:
165
+ speed (Optional[float]): Task speed, in iterations per second, or None if unknown.
166
+
167
+ Returns:
168
+ Text: Text object containing the task speed.
169
+ """
170
+ if speed is None:
171
+ return Text("", style="progress.percentage")
172
+ return Text(f"{speed:.1f} it/s", style="progress.percentage")
173
+
174
+ def render(self, task: Task) -> Text:
175
+ if self.show_speed:
176
+ return self.render_speed(task.finished_speed or task.speed)
177
+ text_format = self.text_format_no_percentage if task.total is None else self.text_format
178
+ _text = text_format.format(task=task)
179
+ if self.markup:
180
+ text = Text.from_markup(_text, style=self.style, justify=self.justify)
181
+ else:
182
+ text = Text(_text, style=self.style, justify=self.justify)
183
+ if self.highlighter:
184
+ self.highlighter.highlight(text)
185
+ return text
186
+
187
+
188
+ def tb(show_locals: bool = True):
189
+ exc_type, exc_value, exc_tb = sys.exc_info()
190
+ assert exc_type
191
+ assert exc_value
192
+ tb = Traceback.from_exception(exc_type, exc_value, exc_tb, show_locals=show_locals)
193
+
194
+ return tb
195
+
196
+
197
+ __all__ = ["logger", "console", "tb", "SpeedColumnToken", "SpeedColumnIteration"]
198
+
199
+ if __name__ == "__main__":
200
+ try:
201
+ raise RuntimeError()
202
+ except Exception:
203
+ logger.bind(show_locals=False).exception("TEST")
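For reference, this is how the helpers above are consumed in t2s_engine.py earlier in this commit: a rich Progress bar with the token-speed column, plus the loguru logger routed through the shared console. A minimal standalone sketch:

    import time
    from rich.progress import BarColumn, Progress, TextColumn
    from GPT_SoVITS.Accelerate.logger import SpeedColumnToken, console, logger

    with Progress(
        TextColumn("[cyan]{task.description}"),
        BarColumn(),
        TextColumn("{task.completed}/{task.total} tokens"),
        SpeedColumnToken(show_speed=True),
        console=console,
        transient=True,
    ) as progress:
        task = progress.add_task("T2S Decoding", total=100)
        for _ in range(100):
            progress.update(task, advance=1)
            time.sleep(0.01)
    logger.info("Decoding finished")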
GPT_SoVITS/configs/.gitignore ADDED
@@ -0,0 +1 @@
1
+ *.yaml
GPT_SoVITS/configs/s2.json ADDED
@@ -0,0 +1,91 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 100,
4
+ "eval_interval": 500,
5
+ "seed": 1234,
6
+ "epochs": 100,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 32,
14
+ "fp16_run": true,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 20480,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "text_low_lr_rate": 0.4,
22
+ "grad_ckpt": false
23
+ },
24
+ "data": {
25
+ "max_wav_value": 32768.0,
26
+ "sampling_rate": 32000,
27
+ "filter_length": 2048,
28
+ "hop_length": 640,
29
+ "win_length": 2048,
30
+ "n_mel_channels": 128,
31
+ "mel_fmin": 0.0,
32
+ "mel_fmax": null,
33
+ "add_blank": true,
34
+ "n_speakers": 300,
35
+ "cleaned_text": true
36
+ },
37
+ "model": {
38
+ "inter_channels": 192,
39
+ "hidden_channels": 192,
40
+ "filter_channels": 768,
41
+ "n_heads": 2,
42
+ "n_layers": 6,
43
+ "kernel_size": 3,
44
+ "p_dropout": 0.1,
45
+ "resblock": "1",
46
+ "resblock_kernel_sizes": [
47
+ 3,
48
+ 7,
49
+ 11
50
+ ],
51
+ "resblock_dilation_sizes": [
52
+ [
53
+ 1,
54
+ 3,
55
+ 5
56
+ ],
57
+ [
58
+ 1,
59
+ 3,
60
+ 5
61
+ ],
62
+ [
63
+ 1,
64
+ 3,
65
+ 5
66
+ ]
67
+ ],
68
+ "upsample_rates": [
69
+ 10,
70
+ 8,
71
+ 2,
72
+ 2,
73
+ 2
74
+ ],
75
+ "upsample_initial_channel": 512,
76
+ "upsample_kernel_sizes": [
77
+ 16,
78
+ 16,
79
+ 8,
80
+ 2,
81
+ 2
82
+ ],
83
+ "n_layers_q": 3,
84
+ "use_spectral_norm": false,
85
+ "gin_channels": 512,
86
+ "semantic_frame_rate": "25hz",
87
+ "freeze_quantizer": true
88
+ },
89
+ "s2_ckpt_dir": "logs/s2/big2k1",
90
+ "content_module": "cnhubert"
91
+ }
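A quick sanity check on this config (the same relation holds for the v2Pro and v2ProPlus variants below): the vocoder's total upsampling factor, the product of upsample_rates, equals the STFT hop_length, so each mel frame decodes to exactly one hop of 32 kHz audio.

    import json
    import math

    cfg = json.load(open("GPT_SoVITS/configs/s2.json"))
    upsample = math.prod(cfg["model"]["upsample_rates"])   # 10 * 8 * 2 * 2 * 2 = 640
    assert upsample == cfg["data"]["hop_length"]           # 640 samples per frame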
GPT_SoVITS/configs/s2v2Pro.json ADDED
@@ -0,0 +1,91 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 100,
4
+ "eval_interval": 500,
5
+ "seed": 1234,
6
+ "epochs": 100,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 32,
14
+ "fp16_run": true,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 20480,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "text_low_lr_rate": 0.4,
22
+ "grad_ckpt": false
23
+ },
24
+ "data": {
25
+ "max_wav_value": 32768.0,
26
+ "sampling_rate": 32000,
27
+ "filter_length": 2048,
28
+ "hop_length": 640,
29
+ "win_length": 2048,
30
+ "n_mel_channels": 128,
31
+ "mel_fmin": 0.0,
32
+ "mel_fmax": null,
33
+ "add_blank": true,
34
+ "n_speakers": 300,
35
+ "cleaned_text": true
36
+ },
37
+ "model": {
38
+ "inter_channels": 192,
39
+ "hidden_channels": 192,
40
+ "filter_channels": 768,
41
+ "n_heads": 2,
42
+ "n_layers": 6,
43
+ "kernel_size": 3,
44
+ "p_dropout": 0.0,
45
+ "resblock": "1",
46
+ "resblock_kernel_sizes": [
47
+ 3,
48
+ 7,
49
+ 11
50
+ ],
51
+ "resblock_dilation_sizes": [
52
+ [
53
+ 1,
54
+ 3,
55
+ 5
56
+ ],
57
+ [
58
+ 1,
59
+ 3,
60
+ 5
61
+ ],
62
+ [
63
+ 1,
64
+ 3,
65
+ 5
66
+ ]
67
+ ],
68
+ "upsample_rates": [
69
+ 10,
70
+ 8,
71
+ 2,
72
+ 2,
73
+ 2
74
+ ],
75
+ "upsample_initial_channel": 512,
76
+ "upsample_kernel_sizes": [
77
+ 16,
78
+ 16,
79
+ 8,
80
+ 2,
81
+ 2
82
+ ],
83
+ "n_layers_q": 3,
84
+ "use_spectral_norm": false,
85
+ "gin_channels": 1024,
86
+ "semantic_frame_rate": "25hz",
87
+ "freeze_quantizer": true
88
+ },
89
+ "s2_ckpt_dir": "logs/s2/big2k1",
90
+ "content_module": "cnhubert"
91
+ }
GPT_SoVITS/configs/s2v2ProPlus.json ADDED
@@ -0,0 +1,91 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 100,
4
+ "eval_interval": 500,
5
+ "seed": 1234,
6
+ "epochs": 100,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 32,
14
+ "fp16_run": true,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 20480,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "text_low_lr_rate": 0.4,
22
+ "grad_ckpt": false
23
+ },
24
+ "data": {
25
+ "max_wav_value": 32768.0,
26
+ "sampling_rate": 32000,
27
+ "filter_length": 2048,
28
+ "hop_length": 640,
29
+ "win_length": 2048,
30
+ "n_mel_channels": 128,
31
+ "mel_fmin": 0.0,
32
+ "mel_fmax": null,
33
+ "add_blank": true,
34
+ "n_speakers": 300,
35
+ "cleaned_text": true
36
+ },
37
+ "model": {
38
+ "inter_channels": 192,
39
+ "hidden_channels": 192,
40
+ "filter_channels": 768,
41
+ "n_heads": 2,
42
+ "n_layers": 6,
43
+ "kernel_size": 3,
44
+ "p_dropout": 0.0,
45
+ "resblock": "1",
46
+ "resblock_kernel_sizes": [
47
+ 3,
48
+ 7,
49
+ 11
50
+ ],
51
+ "resblock_dilation_sizes": [
52
+ [
53
+ 1,
54
+ 3,
55
+ 5
56
+ ],
57
+ [
58
+ 1,
59
+ 3,
60
+ 5
61
+ ],
62
+ [
63
+ 1,
64
+ 3,
65
+ 5
66
+ ]
67
+ ],
68
+ "upsample_rates": [
69
+ 10,
70
+ 8,
71
+ 2,
72
+ 2,
73
+ 2
74
+ ],
75
+ "upsample_initial_channel": 768,
76
+ "upsample_kernel_sizes": [
77
+ 20,
78
+ 16,
79
+ 8,
80
+ 2,
81
+ 2
82
+ ],
83
+ "n_layers_q": 3,
84
+ "use_spectral_norm": false,
85
+ "gin_channels": 1024,
86
+ "semantic_frame_rate": "25hz",
87
+ "freeze_quantizer": true
88
+ },
89
+ "s2_ckpt_dir": "logs/s2/big2k1",
90
+ "content_module": "cnhubert"
91
+ }
GPT_SoVITS/eres2net/ERes2NetV2.py ADDED
@@ -0,0 +1,252 @@
1
+ # Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
2
+ # Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
3
+
4
+ """
5
+ To further improve the short-duration feature extraction capability of ERes2Net, we expand the channel dimension
6
+ within each stage. However, this modification also increases the number of model parameters and computational complexity.
7
+ To alleviate this problem, we propose an improved ERes2NetV2 by pruning redundant structures, ultimately reducing
8
+ both the model parameters and its computational cost.
9
+ """
10
+
11
+ import math
12
+
13
+ import torch
14
+ import torch.nn as nn
15
+ import torch.nn.functional as F
16
+
17
+ from . import pooling_layers as pooling_layers
18
+ from .fusion import AFF
19
+
20
+
21
+ class ReLU(nn.Hardtanh):
22
+ def __init__(self, inplace=False):
23
+ super(ReLU, self).__init__(0, 20, inplace)
24
+
25
+ def __repr__(self):
26
+ inplace_str = "inplace" if self.inplace else ""
27
+ return self.__class__.__name__ + " (" + inplace_str + ")"
28
+
29
+
30
+ class BasicBlockERes2NetV2(nn.Module):
31
+ def __init__(self, in_planes, planes, stride=1, baseWidth=26, scale=2, expansion=2):
32
+ super(BasicBlockERes2NetV2, self).__init__()
33
+ width = int(math.floor(planes * (baseWidth / 64.0)))
34
+ self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
35
+ self.bn1 = nn.BatchNorm2d(width * scale)
36
+ self.nums = scale
37
+ self.expansion = expansion
38
+
39
+ convs = []
40
+ bns = []
41
+ for i in range(self.nums):
42
+ convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
43
+ bns.append(nn.BatchNorm2d(width))
44
+ self.convs = nn.ModuleList(convs)
45
+ self.bns = nn.ModuleList(bns)
46
+ self.relu = ReLU(inplace=True)
47
+
48
+ self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
49
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion)
50
+ self.shortcut = nn.Sequential()
51
+ if stride != 1 or in_planes != self.expansion * planes:
52
+ self.shortcut = nn.Sequential(
53
+ nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
54
+ nn.BatchNorm2d(self.expansion * planes),
55
+ )
56
+ self.stride = stride
57
+ self.width = width
58
+ self.scale = scale
59
+
60
+ def forward(self, x):
61
+ residual = x
62
+
63
+ out = self.conv1(x)
64
+ out = self.bn1(out)
65
+ out = self.relu(out)
66
+ spx = torch.split(out, self.width, 1)
67
+ for i in range(self.nums):
68
+ if i == 0:
69
+ sp = spx[i]
70
+ else:
71
+ sp = sp + spx[i]
72
+ sp = self.convs[i](sp)
73
+ sp = self.relu(self.bns[i](sp))
74
+ if i == 0:
75
+ out = sp
76
+ else:
77
+ out = torch.cat((out, sp), 1)
78
+
79
+ out = self.conv3(out)
80
+ out = self.bn3(out)
81
+
82
+ residual = self.shortcut(x)
83
+ out += residual
84
+ out = self.relu(out)
85
+
86
+ return out
87
+
88
+
89
+ class BasicBlockERes2NetV2AFF(nn.Module):
90
+ def __init__(self, in_planes, planes, stride=1, baseWidth=26, scale=2, expansion=2):
91
+ super(BasicBlockERes2NetV2AFF, self).__init__()
92
+ width = int(math.floor(planes * (baseWidth / 64.0)))
93
+ self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
94
+ self.bn1 = nn.BatchNorm2d(width * scale)
95
+ self.nums = scale
96
+ self.expansion = expansion
97
+
98
+ convs = []
99
+ fuse_models = []
100
+ bns = []
101
+ for i in range(self.nums):
102
+ convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
103
+ bns.append(nn.BatchNorm2d(width))
104
+ for j in range(self.nums - 1):
105
+ fuse_models.append(AFF(channels=width, r=4))
106
+
107
+ self.convs = nn.ModuleList(convs)
108
+ self.bns = nn.ModuleList(bns)
109
+ self.fuse_models = nn.ModuleList(fuse_models)
110
+ self.relu = ReLU(inplace=True)
111
+
112
+ self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
113
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion)
114
+ self.shortcut = nn.Sequential()
115
+ if stride != 1 or in_planes != self.expansion * planes:
116
+ self.shortcut = nn.Sequential(
117
+ nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
118
+ nn.BatchNorm2d(self.expansion * planes),
119
+ )
120
+ self.stride = stride
121
+ self.width = width
122
+ self.scale = scale
123
+
124
+ def forward(self, x):
125
+ residual = x
126
+
127
+ out = self.conv1(x)
128
+ out = self.bn1(out)
129
+ out = self.relu(out)
130
+ spx = torch.split(out, self.width, 1)
131
+ for i in range(self.nums):
132
+ if i == 0:
133
+ sp = spx[i]
134
+ else:
135
+ sp = self.fuse_models[i - 1](sp, spx[i])
136
+
137
+ sp = self.convs[i](sp)
138
+ sp = self.relu(self.bns[i](sp))
139
+ if i == 0:
140
+ out = sp
141
+ else:
142
+ out = torch.cat((out, sp), 1)
143
+
144
+ out = self.conv3(out)
145
+ out = self.bn3(out)
146
+
147
+ residual = self.shortcut(x)
148
+ out += residual
149
+ out = self.relu(out)
150
+
151
+ return out
152
+
153
+
154
+ class ERes2NetV2(nn.Module):
155
+ def __init__(
156
+ self,
157
+ block=BasicBlockERes2NetV2,
158
+ block_fuse=BasicBlockERes2NetV2AFF,
159
+ num_blocks=[3, 4, 6, 3],
160
+ m_channels=64,
161
+ feat_dim=80,
162
+ embedding_size=192,
163
+ baseWidth=26,
164
+ scale=2,
165
+ expansion=2,
166
+ pooling_func="TSTP",
167
+ two_emb_layer=False,
168
+ ):
169
+ super(ERes2NetV2, self).__init__()
170
+ self.in_planes = m_channels
171
+ self.feat_dim = feat_dim
172
+ self.embedding_size = embedding_size
173
+ self.stats_dim = int(feat_dim / 8) * m_channels * 8
174
+ self.two_emb_layer = two_emb_layer
175
+ self.baseWidth = baseWidth
176
+ self.scale = scale
177
+ self.expansion = expansion
178
+
179
+ self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
180
+ self.bn1 = nn.BatchNorm2d(m_channels)
181
+ self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1)
182
+ self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2)
183
+ self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2)
184
+ self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2)
185
+
186
+ # Downsampling module
187
+ self.layer3_ds = nn.Conv2d(
188
+ m_channels * 4 * self.expansion,
189
+ m_channels * 8 * self.expansion,
190
+ kernel_size=3,
191
+ padding=1,
192
+ stride=2,
193
+ bias=False,
194
+ )
195
+
196
+ # Bottom-up fusion module
197
+ self.fuse34 = AFF(channels=m_channels * 8 * self.expansion, r=4)
198
+
199
+ self.n_stats = 1 if pooling_func == "TAP" or pooling_func == "TSDP" else 2
200
+ self.pool = getattr(pooling_layers, pooling_func)(in_dim=self.stats_dim * self.expansion)
201
+ self.seg_1 = nn.Linear(self.stats_dim * self.expansion * self.n_stats, embedding_size)
202
+ if self.two_emb_layer:
203
+ self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
204
+ self.seg_2 = nn.Linear(embedding_size, embedding_size)
205
+ else:
206
+ self.seg_bn_1 = nn.Identity()
207
+ self.seg_2 = nn.Identity()
208
+
209
+ def _make_layer(self, block, planes, num_blocks, stride):
210
+ strides = [stride] + [1] * (num_blocks - 1)
211
+ layers = []
212
+ for stride in strides:
213
+ layers.append(
214
+ block(
215
+ self.in_planes, planes, stride, baseWidth=self.baseWidth, scale=self.scale, expansion=self.expansion
216
+ )
217
+ )
218
+ self.in_planes = planes * self.expansion
219
+ return nn.Sequential(*layers)
220
+
221
+ def forward(self, x):
222
+ x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
223
+ x = x.unsqueeze_(1)
224
+ out = F.relu(self.bn1(self.conv1(x)))
225
+ out1 = self.layer1(out)
226
+ out2 = self.layer2(out1)
227
+ out3 = self.layer3(out2)
228
+ out4 = self.layer4(out3)
229
+ out3_ds = self.layer3_ds(out3)
230
+ fuse_out34 = self.fuse34(out4, out3_ds)
231
+ stats = self.pool(fuse_out34)
232
+
233
+ embed_a = self.seg_1(stats)
234
+ if self.two_emb_layer:
235
+ out = F.relu(embed_a)
236
+ out = self.seg_bn_1(out)
237
+ embed_b = self.seg_2(out)
238
+ return embed_b
239
+ else:
240
+ return embed_a
241
+
242
+ def forward3(self, x):
243
+ x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
244
+ x = x.unsqueeze_(1)
245
+ out = F.relu(self.bn1(self.conv1(x)))
246
+ out1 = self.layer1(out)
247
+ out2 = self.layer2(out1)
248
+ out3 = self.layer3(out2)
249
+ out4 = self.layer4(out3)
250
+ out3_ds = self.layer3_ds(out3)
251
+ fuse_out34 = self.fuse34(out4, out3_ds)
252
+ return fuse_out34.flatten(start_dim=1, end_dim=2).mean(-1)
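A hypothetical smoke test of the speaker-embedding network above (assuming the eres2net package imports resolve): 80-dim fbank frames in, one fixed-size embedding per utterance out; forward3 instead returns a pooled feature-map variant.

    import torch
    from GPT_SoVITS.eres2net.ERes2NetV2 import ERes2NetV2

    model = ERes2NetV2(feat_dim=80, embedding_size=192).eval()
    fbank = torch.randn(2, 300, 80)      # (batch, frames, mel bins)
    with torch.no_grad():
        emb = model(fbank)               # (2, 192)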
GPT_SoVITS/eres2net/fusion.py ADDED
@@ -0,0 +1,27 @@
1
+ # Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
2
+ # Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+
8
+ class AFF(nn.Module):
9
+ def __init__(self, channels=64, r=4):
10
+ super(AFF, self).__init__()
11
+ inter_channels = int(channels // r)
12
+
13
+ self.local_att = nn.Sequential(
14
+ nn.Conv2d(channels * 2, inter_channels, kernel_size=1, stride=1, padding=0),
15
+ nn.BatchNorm2d(inter_channels),
16
+ nn.SiLU(inplace=True),
17
+ nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
18
+ nn.BatchNorm2d(channels),
19
+ )
20
+
21
+ def forward(self, x, ds_y):
22
+ xa = torch.cat((x, ds_y), dim=1)
23
+ x_att = self.local_att(xa)
24
+ x_att = 1.0 + torch.tanh(x_att)
25
+ xo = torch.mul(x, x_att) + torch.mul(ds_y, 2.0 - x_att)
26
+
27
+ return xo
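A minimal shape sketch for the attentional feature fusion block above: both inputs must share the same (B, C, H, W) shape, the gate 1 + tanh(...) lies in (0, 2), and the two branches are blended with complementary weights x_att and 2 - x_att.

    import torch
    from GPT_SoVITS.eres2net.fusion import AFF

    aff = AFF(channels=64, r=4)
    x = torch.randn(2, 64, 10, 25)
    ds_y = torch.randn(2, 64, 10, 25)
    out = aff(x, ds_y)                   # same shape as the inputs: (2, 64, 10, 25)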
GPT_SoVITS/eres2net/kaldi.py ADDED
@@ -0,0 +1,844 @@
1
+ import math
2
+ from typing import Tuple
3
+
4
+ import torch
5
+ import torchaudio
6
+ from torch import Tensor
7
+
8
+ __all__ = [
9
+ "get_mel_banks",
10
+ "inverse_mel_scale",
11
+ "inverse_mel_scale_scalar",
12
+ "mel_scale",
13
+ "mel_scale_scalar",
14
+ "spectrogram",
15
+ "fbank",
16
+ "mfcc",
17
+ "vtln_warp_freq",
18
+ "vtln_warp_mel_freq",
19
+ ]
20
+
21
+ # numeric_limits<float>::epsilon() 1.1920928955078125e-07
22
+ EPSILON = torch.tensor(torch.finfo(torch.float).eps)
23
+ # 1 milliseconds = 0.001 seconds
24
+ MILLISECONDS_TO_SECONDS = 0.001
25
+
26
+ # window types
27
+ HAMMING = "hamming"
28
+ HANNING = "hanning"
29
+ POVEY = "povey"
30
+ RECTANGULAR = "rectangular"
31
+ BLACKMAN = "blackman"
32
+ WINDOWS = [HAMMING, HANNING, POVEY, RECTANGULAR, BLACKMAN]
33
+
34
+
35
+ def _get_epsilon(device, dtype):
36
+ return EPSILON.to(device=device, dtype=dtype)
37
+
38
+
39
+ def _next_power_of_2(x: int) -> int:
40
+ r"""Returns the smallest power of 2 that is greater than x"""
41
+ return 1 if x == 0 else 2 ** (x - 1).bit_length()
42
+
43
+
44
+ def _get_strided(waveform: Tensor, window_size: int, window_shift: int, snip_edges: bool) -> Tensor:
45
+ r"""Given a waveform (1D tensor of size ``num_samples``), it returns a 2D tensor (m, ``window_size``)
46
+ representing how the window is shifted along the waveform. Each row is a frame.
47
+
48
+ Args:
49
+ waveform (Tensor): Tensor of size ``num_samples``
50
+ window_size (int): Frame length
51
+ window_shift (int): Frame shift
52
+ snip_edges (bool): If True, end effects will be handled by outputting only frames that completely fit
53
+ in the file, and the number of frames depends on the frame_length. If False, the number of frames
54
+ depends only on the frame_shift, and we reflect the data at the ends.
55
+
56
+ Returns:
57
+ Tensor: 2D tensor of size (m, ``window_size``) where each row is a frame
58
+ """
59
+ assert waveform.dim() == 1
60
+ num_samples = waveform.size(0)
61
+ strides = (window_shift * waveform.stride(0), waveform.stride(0))
62
+
63
+ if snip_edges:
64
+ if num_samples < window_size:
65
+ return torch.empty((0, 0), dtype=waveform.dtype, device=waveform.device)
66
+ else:
67
+ m = 1 + (num_samples - window_size) // window_shift
68
+ else:
69
+ reversed_waveform = torch.flip(waveform, [0])
70
+ m = (num_samples + (window_shift // 2)) // window_shift
71
+ pad = window_size // 2 - window_shift // 2
72
+ pad_right = reversed_waveform
73
+ if pad > 0:
74
+ # torch.nn.functional.pad returns [2,1,0,1,2] for 'reflect'
75
+ # but we want [2, 1, 0, 0, 1, 2]
76
+ pad_left = reversed_waveform[-pad:]
77
+ waveform = torch.cat((pad_left, waveform, pad_right), dim=0)
78
+ else:
79
+ # pad is negative so we want to trim the waveform at the front
80
+ waveform = torch.cat((waveform[-pad:], pad_right), dim=0)
81
+
82
+ sizes = (m, window_size)
83
+ return waveform.as_strided(sizes, strides)
84
+
85
+
86
+ def _feature_window_function(
87
+ window_type: str,
88
+ window_size: int,
89
+ blackman_coeff: float,
90
+ device: torch.device,
91
+ dtype: int,
92
+ ) -> Tensor:
93
+ r"""Returns a window function with the given type and size"""
94
+ if window_type == HANNING:
95
+ return torch.hann_window(window_size, periodic=False, device=device, dtype=dtype)
96
+ elif window_type == HAMMING:
97
+ return torch.hamming_window(window_size, periodic=False, alpha=0.54, beta=0.46, device=device, dtype=dtype)
98
+ elif window_type == POVEY:
99
+ # like hanning but goes to zero at edges
100
+ return torch.hann_window(window_size, periodic=False, device=device, dtype=dtype).pow(0.85)
101
+ elif window_type == RECTANGULAR:
102
+ return torch.ones(window_size, device=device, dtype=dtype)
103
+ elif window_type == BLACKMAN:
104
+ a = 2 * math.pi / (window_size - 1)
105
+ window_function = torch.arange(window_size, device=device, dtype=dtype)
106
+ # can't use torch.blackman_window as they use different coefficients
107
+ return (
108
+ blackman_coeff
109
+ - 0.5 * torch.cos(a * window_function)
110
+ + (0.5 - blackman_coeff) * torch.cos(2 * a * window_function)
111
+ ).to(device=device, dtype=dtype)
112
+ else:
113
+ raise Exception("Invalid window type " + window_type)
114
+
115
+
116
+ def _get_log_energy(strided_input: Tensor, epsilon: Tensor, energy_floor: float) -> Tensor:
117
+ r"""Returns the log energy of size (m) for a strided_input (m,*)"""
118
+ device, dtype = strided_input.device, strided_input.dtype
119
+ log_energy = torch.max(strided_input.pow(2).sum(1), epsilon).log() # size (m)
120
+ if energy_floor == 0.0:
121
+ return log_energy
122
+ return torch.max(log_energy, torch.tensor(math.log(energy_floor), device=device, dtype=dtype))
123
+
124
+
125
+ def _get_waveform_and_window_properties(
126
+ waveform: Tensor,
127
+ channel: int,
128
+ sample_frequency: float,
129
+ frame_shift: float,
130
+ frame_length: float,
131
+ round_to_power_of_two: bool,
132
+ preemphasis_coefficient: float,
133
+ ) -> Tuple[Tensor, int, int, int]:
134
+ r"""Gets the waveform and window properties"""
135
+ channel = max(channel, 0)
136
+ assert channel < waveform.size(0), "Invalid channel {} for size {}".format(channel, waveform.size(0))
137
+ waveform = waveform[channel, :] # size (n)
138
+ window_shift = int(sample_frequency * frame_shift * MILLISECONDS_TO_SECONDS)
139
+ window_size = int(sample_frequency * frame_length * MILLISECONDS_TO_SECONDS)
140
+ padded_window_size = _next_power_of_2(window_size) if round_to_power_of_two else window_size
141
+
142
+ assert 2 <= window_size <= len(waveform), "choose a window size {} that is [2, {}]".format(
143
+ window_size, len(waveform)
144
+ )
145
+ assert 0 < window_shift, "`window_shift` must be greater than 0"
146
+ assert padded_window_size % 2 == 0, (
147
+ "the padded `window_size` must be divisible by two. use `round_to_power_of_two` or change `frame_length`"
148
+ )
149
+ assert 0.0 <= preemphasis_coefficient <= 1.0, "`preemphasis_coefficient` must be between [0,1]"
150
+ assert sample_frequency > 0, "`sample_frequency` must be greater than zero"
151
+ return waveform, window_shift, window_size, padded_window_size
152
+
153
+
154
+ def _get_window(
155
+ waveform: Tensor,
156
+ padded_window_size: int,
157
+ window_size: int,
158
+ window_shift: int,
159
+ window_type: str,
160
+ blackman_coeff: float,
161
+ snip_edges: bool,
162
+ raw_energy: bool,
163
+ energy_floor: float,
164
+ dither: float,
165
+ remove_dc_offset: bool,
166
+ preemphasis_coefficient: float,
167
+ ) -> Tuple[Tensor, Tensor]:
168
+ r"""Gets a window and its log energy
169
+
170
+ Returns:
171
+ (Tensor, Tensor): strided_input of size (m, ``padded_window_size``) and signal_log_energy of size (m)
172
+ """
173
+ device, dtype = waveform.device, waveform.dtype
174
+ epsilon = _get_epsilon(device, dtype)
175
+
176
+ # size (m, window_size)
177
+ strided_input = _get_strided(waveform, window_size, window_shift, snip_edges)
178
+
179
+ if dither != 0.0:
180
+ rand_gauss = torch.randn(strided_input.shape, device=device, dtype=dtype)
181
+ strided_input = strided_input + rand_gauss * dither
182
+
183
+ if remove_dc_offset:
184
+ # Subtract each row/frame by its mean
185
+ row_means = torch.mean(strided_input, dim=1).unsqueeze(1) # size (m, 1)
186
+ strided_input = strided_input - row_means
187
+
188
+ if raw_energy:
189
+ # Compute the log energy of each row/frame before applying preemphasis and
190
+ # window function
191
+ signal_log_energy = _get_log_energy(strided_input, epsilon, energy_floor) # size (m)
192
+
193
+ if preemphasis_coefficient != 0.0:
194
+ # strided_input[i,j] -= preemphasis_coefficient * strided_input[i, max(0, j-1)] for all i,j
195
+ offset_strided_input = torch.nn.functional.pad(strided_input.unsqueeze(0), (1, 0), mode="replicate").squeeze(
196
+ 0
197
+ ) # size (m, window_size + 1)
198
+ strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :-1]
199
+
200
+ # Apply window_function to each row/frame
201
+ window_function = _feature_window_function(window_type, window_size, blackman_coeff, device, dtype).unsqueeze(
202
+ 0
203
+ ) # size (1, window_size)
204
+ strided_input = strided_input * window_function # size (m, window_size)
205
+
206
+ # Pad columns with zero until we reach size (m, padded_window_size)
207
+ if padded_window_size != window_size:
208
+ padding_right = padded_window_size - window_size
209
+ strided_input = torch.nn.functional.pad(
210
+ strided_input.unsqueeze(0), (0, padding_right), mode="constant", value=0
211
+ ).squeeze(0)
212
+
213
+ # Compute energy after window function (not the raw one)
214
+ if not raw_energy:
215
+ signal_log_energy = _get_log_energy(strided_input, epsilon, energy_floor) # size (m)
216
+
217
+ return strided_input, signal_log_energy
218
+
219
+
220
+ def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor:
221
+ # subtracts the column mean of the tensor size (m, n) if subtract_mean=True
222
+ # it returns size (m, n)
223
+ if subtract_mean:
224
+ col_means = torch.mean(tensor, dim=0).unsqueeze(0)
225
+ tensor = tensor - col_means
226
+ return tensor
227
+
228
+
229
+ def spectrogram(
230
+ waveform: Tensor,
231
+ blackman_coeff: float = 0.42,
232
+ channel: int = -1,
233
+ dither: float = 0.0,
234
+ energy_floor: float = 1.0,
235
+ frame_length: float = 25.0,
236
+ frame_shift: float = 10.0,
237
+ min_duration: float = 0.0,
238
+ preemphasis_coefficient: float = 0.97,
239
+ raw_energy: bool = True,
240
+ remove_dc_offset: bool = True,
241
+ round_to_power_of_two: bool = True,
242
+ sample_frequency: float = 16000.0,
243
+ snip_edges: bool = True,
244
+ subtract_mean: bool = False,
245
+ window_type: str = POVEY,
246
+ ) -> Tensor:
247
+ r"""Create a spectrogram from a raw audio signal. This matches the input/output of Kaldi's
248
+ compute-spectrogram-feats.
249
+
250
+ Args:
251
+ waveform (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
252
+ blackman_coeff (float, optional): Constant coefficient for generalized Blackman window. (Default: ``0.42``)
253
+ channel (int, optional): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``)
254
+ dither (float, optional): Dithering constant (0.0 means no dither). If you turn this off, you should set
255
+ the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``0.0``)
256
+ energy_floor (float, optional): Floor on energy (absolute, not relative) in Spectrogram computation. Caution:
257
+ this floor is applied to the zeroth component, representing the total signal energy. The floor on the
258
+ individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (Default: ``1.0``)
259
+ frame_length (float, optional): Frame length in milliseconds (Default: ``25.0``)
260
+ frame_shift (float, optional): Frame shift in milliseconds (Default: ``10.0``)
261
+ min_duration (float, optional): Minimum duration of segments to process (in seconds). (Default: ``0.0``)
262
+ preemphasis_coefficient (float, optional): Coefficient for use in signal preemphasis (Default: ``0.97``)
263
+ raw_energy (bool, optional): If True, compute energy before preemphasis and windowing (Default: ``True``)
264
+ remove_dc_offset (bool, optional): Subtract mean from waveform on each frame (Default: ``True``)
265
+ round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
266
+ to FFT. (Default: ``True``)
267
+ sample_frequency (float, optional): Waveform data sample frequency (must match the waveform file, if
268
+ specified there) (Default: ``16000.0``)
269
+ snip_edges (bool, optional): If True, end effects will be handled by outputting only frames that completely fit
270
+ in the file, and the number of frames depends on the frame_length. If False, the number of frames
271
+ depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``)
272
+ subtract_mean (bool, optional): Subtract mean of each feature file [CMS]; not recommended to do
273
+ it this way. (Default: ``False``)
274
+ window_type (str, optional): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman')
275
+ (Default: ``'povey'``)
276
+
277
+ Returns:
278
+ Tensor: A spectrogram identical to what Kaldi would output. The shape is
279
+ (m, ``padded_window_size // 2 + 1``) where m is calculated in _get_strided
280
+ """
281
+ device, dtype = waveform.device, waveform.dtype
282
+ epsilon = _get_epsilon(device, dtype)
283
+
284
+ waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
285
+ waveform, channel, sample_frequency, frame_shift, frame_length, round_to_power_of_two, preemphasis_coefficient
286
+ )
287
+
288
+ if len(waveform) < min_duration * sample_frequency:
289
+ # signal is too short
290
+ return torch.empty(0)
291
+
292
+ strided_input, signal_log_energy = _get_window(
293
+ waveform,
294
+ padded_window_size,
295
+ window_size,
296
+ window_shift,
297
+ window_type,
298
+ blackman_coeff,
299
+ snip_edges,
300
+ raw_energy,
301
+ energy_floor,
302
+ dither,
303
+ remove_dc_offset,
304
+ preemphasis_coefficient,
305
+ )
306
+
307
+ # size (m, padded_window_size // 2 + 1, 2)
308
+ fft = torch.fft.rfft(strided_input)
309
+
310
+ # Convert the FFT into a power spectrum
311
+ power_spectrum = torch.max(fft.abs().pow(2.0), epsilon).log() # size (m, padded_window_size // 2 + 1)
312
+ power_spectrum[:, 0] = signal_log_energy
313
+
314
+ power_spectrum = _subtract_column_mean(power_spectrum, subtract_mean)
315
+ return power_spectrum
316
+
317
+
318
+ def inverse_mel_scale_scalar(mel_freq: float) -> float:
319
+ return 700.0 * (math.exp(mel_freq / 1127.0) - 1.0)
320
+
321
+
322
+ def inverse_mel_scale(mel_freq: Tensor) -> Tensor:
323
+ return 700.0 * ((mel_freq / 1127.0).exp() - 1.0)
324
+
325
+
326
+ def mel_scale_scalar(freq: float) -> float:
327
+ return 1127.0 * math.log(1.0 + freq / 700.0)
328
+
329
+
330
+ def mel_scale(freq: Tensor) -> Tensor:
331
+ return 1127.0 * (1.0 + freq / 700.0).log()
332
+
333
+
334
+ def vtln_warp_freq(
335
+ vtln_low_cutoff: float,
336
+ vtln_high_cutoff: float,
337
+ low_freq: float,
338
+ high_freq: float,
339
+ vtln_warp_factor: float,
340
+ freq: Tensor,
341
+ ) -> Tensor:
342
+ r"""This computes a VTLN warping function that is not the same as HTK's one,
343
+ but has similar inputs (this function has the advantage of never producing
344
+ empty bins).
345
+
346
+ This function computes a warp function F(freq), defined between low_freq
347
+ and high_freq inclusive, with the following properties:
348
+ F(low_freq) == low_freq
349
+ F(high_freq) == high_freq
350
+ The function is continuous and piecewise linear with two inflection
351
+ points.
352
+ The lower inflection point (measured in terms of the unwarped
353
+ frequency) is at frequency l, determined as described below.
354
+ The higher inflection point is at a frequency h, determined as
355
+ described below.
356
+ If l <= f <= h, then F(f) = f/vtln_warp_factor.
357
+ If the higher inflection point (measured in terms of the unwarped
358
+ frequency) is at h, then max(h, F(h)) == vtln_high_cutoff.
359
+ Since (by the last point) F(h) == h/vtln_warp_factor, then
360
+ max(h, h/vtln_warp_factor) == vtln_high_cutoff, so
361
+ h = vtln_high_cutoff / max(1, 1/vtln_warp_factor).
362
+ = vtln_high_cutoff * min(1, vtln_warp_factor).
363
+ If the lower inflection point (measured in terms of the unwarped
364
+ frequency) is at l, then min(l, F(l)) == vtln_low_cutoff
365
+ This implies that l = vtln_low_cutoff / min(1, 1/vtln_warp_factor)
366
+ = vtln_low_cutoff * max(1, vtln_warp_factor)
367
+ Args:
368
+ vtln_low_cutoff (float): Lower frequency cutoffs for VTLN
369
+ vtln_high_cutoff (float): Upper frequency cutoffs for VTLN
370
+ low_freq (float): Lower frequency cutoffs in mel computation
371
+ high_freq (float): Upper frequency cutoffs in mel computation
372
+ vtln_warp_factor (float): Vtln warp factor
373
+ freq (Tensor): given frequency in Hz
374
+
375
+ Returns:
376
+ Tensor: Freq after vtln warp
377
+ """
378
+ assert vtln_low_cutoff > low_freq, "be sure to set the vtln_low option higher than low_freq"
379
+ assert vtln_high_cutoff < high_freq, "be sure to set the vtln_high option lower than high_freq [or negative]"
380
+ l = vtln_low_cutoff * max(1.0, vtln_warp_factor)
381
+ h = vtln_high_cutoff * min(1.0, vtln_warp_factor)
382
+ scale = 1.0 / vtln_warp_factor
383
+ Fl = scale * l # F(l)
384
+ Fh = scale * h # F(h)
385
+ assert l > low_freq and h < high_freq
386
+ # slope of left part of the 3-piece linear function
387
+ scale_left = (Fl - low_freq) / (l - low_freq)
388
+ # [slope of center part is just "scale"]
389
+
390
+ # slope of right part of the 3-piece linear function
391
+ scale_right = (high_freq - Fh) / (high_freq - h)
392
+
393
+ res = torch.empty_like(freq)
394
+
395
+ outside_low_high_freq = torch.lt(freq, low_freq) | torch.gt(freq, high_freq) # freq < low_freq || freq > high_freq
396
+ before_l = torch.lt(freq, l) # freq < l
397
+ before_h = torch.lt(freq, h) # freq < h
398
+ after_h = torch.ge(freq, h) # freq >= h
399
+
400
+ # order of operations matters here (since the frequency regions overlap)
401
+ res[after_h] = high_freq + scale_right * (freq[after_h] - high_freq)
402
+ res[before_h] = scale * freq[before_h]
403
+ res[before_l] = low_freq + scale_left * (freq[before_l] - low_freq)
404
+ res[outside_low_high_freq] = freq[outside_low_high_freq]
405
+
406
+ return res
407
+
408
+
409
+ def vtln_warp_mel_freq(
410
+ vtln_low_cutoff: float,
411
+ vtln_high_cutoff: float,
412
+ low_freq,
413
+ high_freq: float,
414
+ vtln_warp_factor: float,
415
+ mel_freq: Tensor,
416
+ ) -> Tensor:
417
+ r"""
418
+ Args:
419
+ vtln_low_cutoff (float): Lower frequency cutoffs for VTLN
420
+ vtln_high_cutoff (float): Upper frequency cutoffs for VTLN
421
+ low_freq (float): Lower frequency cutoffs in mel computation
422
+ high_freq (float): Upper frequency cutoffs in mel computation
423
+ vtln_warp_factor (float): Vtln warp factor
424
+ mel_freq (Tensor): Given frequency in Mel
425
+
426
+ Returns:
427
+ Tensor: ``mel_freq`` after vtln warp
428
+ """
429
+ return mel_scale(
430
+ vtln_warp_freq(
431
+ vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq, vtln_warp_factor, inverse_mel_scale(mel_freq)
432
+ )
433
+ )
434
+
435
+
436
+ def get_mel_banks(
437
+ num_bins: int,
438
+ window_length_padded: int,
439
+ sample_freq: float,
440
+ low_freq: float,
441
+ high_freq: float,
442
+ vtln_low: float,
443
+ vtln_high: float,
444
+ vtln_warp_factor: float,
445
+ device=None,
446
+ dtype=None,
447
+ ) -> Tensor:
448
+ """
449
+ Returns:
450
+ Tensor: The mel filterbank ``bins`` of size (``num_bins``, ``num_fft_bins``).
451
+ The bin ``center_freqs`` of size (``num_bins``) are not returned by this
452
+ implementation (see the commented-out line near the end of the function).
453
+ """
454
+ assert num_bins > 3, "Must have more than 3 mel bins"
455
+ assert window_length_padded % 2 == 0
456
+ num_fft_bins = window_length_padded // 2
457
+ nyquist = 0.5 * sample_freq
458
+
459
+ if high_freq <= 0.0:
460
+ high_freq += nyquist
461
+
462
+ assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), (
463
+ "Bad values in options: low-freq {} and high-freq {} vs. nyquist {}".format(low_freq, high_freq, nyquist)
464
+ )
465
+
466
+ # fft-bin width [think of it as Nyquist-freq / half-window-length]
467
+ fft_bin_width = sample_freq / window_length_padded
468
+ mel_low_freq = mel_scale_scalar(low_freq)
469
+ mel_high_freq = mel_scale_scalar(high_freq)
470
+
471
+ # divide by num_bins+1 in next line because of end-effects where the bins
472
+ # spread out to the sides.
473
+ mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1)
474
+
475
+ if vtln_high < 0.0:
476
+ vtln_high += nyquist
477
+
478
+ assert vtln_warp_factor == 1.0 or (
479
+ (low_freq < vtln_low < high_freq) and (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)
480
+ ), "Bad values in options: vtln-low {} and vtln-high {}, versus low-freq {} and high-freq {}".format(
481
+ vtln_low, vtln_high, low_freq, high_freq
482
+ )
483
+
484
+ bin = torch.arange(num_bins).unsqueeze(1)
485
+ left_mel = mel_low_freq + bin * mel_freq_delta # size(num_bins, 1)
486
+ center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta # size(num_bins, 1)
487
+ right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta # size(num_bins, 1)
488
+
489
+ if vtln_warp_factor != 1.0:
490
+ left_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, left_mel)
491
+ center_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, center_mel)
492
+ right_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, right_mel)
493
+
494
+ # center_freqs = inverse_mel_scale(center_mel) # size (num_bins)
495
+ # size(1, num_fft_bins)
496
+ mel = mel_scale(fft_bin_width * torch.arange(num_fft_bins)).unsqueeze(0)
497
+
498
+ # size (num_bins, num_fft_bins)
499
+ up_slope = (mel - left_mel) / (center_mel - left_mel)
500
+ down_slope = (right_mel - mel) / (right_mel - center_mel)
501
+
502
+ if vtln_warp_factor == 1.0:
503
+ # left_mel < center_mel < right_mel so we can min the two slopes and clamp negative values
504
+ bins = torch.max(torch.zeros(1), torch.min(up_slope, down_slope))
505
+ else:
506
+ # warping can move the order of left_mel, center_mel, right_mel anywhere
507
+ bins = torch.zeros_like(up_slope)
508
+ up_idx = torch.gt(mel, left_mel) & torch.le(mel, center_mel) # left_mel < mel <= center_mel
509
+ down_idx = torch.gt(mel, center_mel) & torch.lt(mel, right_mel) # center_mel < mel < right_mel
510
+ bins[up_idx] = up_slope[up_idx]
511
+ bins[down_idx] = down_slope[down_idx]
512
+
513
+ return bins.to(device=device, dtype=dtype) # , center_freqs
514
+
515
+
516
+ cache = {}
517
+
518
+
519
+ def fbank(
520
+ waveform: Tensor,
521
+ blackman_coeff: float = 0.42,
522
+ channel: int = -1,
523
+ dither: float = 0.0,
524
+ energy_floor: float = 1.0,
525
+ frame_length: float = 25.0,
526
+ frame_shift: float = 10.0,
527
+ high_freq: float = 0.0,
528
+ htk_compat: bool = False,
529
+ low_freq: float = 20.0,
530
+ min_duration: float = 0.0,
531
+ num_mel_bins: int = 23,
532
+ preemphasis_coefficient: float = 0.97,
533
+ raw_energy: bool = True,
534
+ remove_dc_offset: bool = True,
535
+ round_to_power_of_two: bool = True,
536
+ sample_frequency: float = 16000.0,
537
+ snip_edges: bool = True,
538
+ subtract_mean: bool = False,
539
+ use_energy: bool = False,
540
+ use_log_fbank: bool = True,
541
+ use_power: bool = True,
542
+ vtln_high: float = -500.0,
543
+ vtln_low: float = 100.0,
544
+ vtln_warp: float = 1.0,
545
+ window_type: str = POVEY,
546
+ ) -> Tensor:
547
+ r"""Create a fbank from a raw audio signal. This matches the input/output of Kaldi's
548
+ compute-fbank-feats.
549
+
550
+ Args:
551
+ waveform (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
552
+ blackman_coeff (float, optional): Constant coefficient for generalized Blackman window. (Default: ``0.42``)
553
+ channel (int, optional): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``)
554
+ dither (float, optional): Dithering constant (0.0 means no dither). If you turn this off, you should set
555
+ the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``0.0``)
556
+ energy_floor (float, optional): Floor on energy (absolute, not relative) in Spectrogram computation. Caution:
557
+ this floor is applied to the zeroth component, representing the total signal energy. The floor on the
558
+ individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (Default: ``1.0``)
559
+ frame_length (float, optional): Frame length in milliseconds (Default: ``25.0``)
560
+ frame_shift (float, optional): Frame shift in milliseconds (Default: ``10.0``)
561
+ high_freq (float, optional): High cutoff frequency for mel bins (if <= 0, offset from Nyquist)
562
+ (Default: ``0.0``)
563
+ htk_compat (bool, optional): If true, put energy last. Warning: not sufficient to get HTK compatible features
564
+ (need to change other parameters). (Default: ``False``)
565
+ low_freq (float, optional): Low cutoff frequency for mel bins (Default: ``20.0``)
566
+ min_duration (float, optional): Minimum duration of segments to process (in seconds). (Default: ``0.0``)
567
+ num_mel_bins (int, optional): Number of triangular mel-frequency bins (Default: ``23``)
568
+ preemphasis_coefficient (float, optional): Coefficient for use in signal preemphasis (Default: ``0.97``)
569
+ raw_energy (bool, optional): If True, compute energy before preemphasis and windowing (Default: ``True``)
570
+ remove_dc_offset (bool, optional): Subtract mean from waveform on each frame (Default: ``True``)
571
+ round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
572
+ to FFT. (Default: ``True``)
573
+ sample_frequency (float, optional): Waveform data sample frequency (must match the waveform file, if
574
+ specified there) (Default: ``16000.0``)
575
+ snip_edges (bool, optional): If True, end effects will be handled by outputting only frames that completely fit
576
+ in the file, and the number of frames depends on the frame_length. If False, the number of frames
577
+ depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``)
578
+ subtract_mean (bool, optional): Subtract mean of each feature file [CMS]; not recommended to do
579
+ it this way. (Default: ``False``)
580
+ use_energy (bool, optional): Add an extra dimension with energy to the FBANK output. (Default: ``False``)
581
+ use_log_fbank (bool, optional):If true, produce log-filterbank, else produce linear. (Default: ``True``)
582
+ use_power (bool, optional): If true, use power, else use magnitude. (Default: ``True``)
583
+ vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function (if
584
+ negative, offset from high-mel-freq (Default: ``-500.0``)
585
+ vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function (Default: ``100.0``)
586
+ vtln_warp (float, optional): Vtln warp factor (only applicable if vtln_map not specified) (Default: ``1.0``)
587
+ window_type (str, optional): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman')
588
+ (Default: ``'povey'``)
589
+
590
+ Returns:
591
+ Tensor: A fbank identical to what Kaldi would output. The shape is (m, ``num_mel_bins + use_energy``)
592
+ where m is calculated in _get_strided
593
+ """
594
+ device, dtype = waveform.device, waveform.dtype
595
+
596
+ waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
597
+ waveform, channel, sample_frequency, frame_shift, frame_length, round_to_power_of_two, preemphasis_coefficient
598
+ )
599
+
600
+ if len(waveform) < min_duration * sample_frequency:
601
+ # signal is too short
602
+ return torch.empty(0, device=device, dtype=dtype)
603
+
604
+ # strided_input, size (m, padded_window_size) and signal_log_energy, size (m)
605
+ strided_input, signal_log_energy = _get_window(
606
+ waveform,
607
+ padded_window_size,
608
+ window_size,
609
+ window_shift,
610
+ window_type,
611
+ blackman_coeff,
612
+ snip_edges,
613
+ raw_energy,
614
+ energy_floor,
615
+ dither,
616
+ remove_dc_offset,
617
+ preemphasis_coefficient,
618
+ )
619
+
620
+ # size (m, padded_window_size // 2 + 1)
621
+ spectrum = torch.fft.rfft(strided_input).abs()
622
+ if use_power:
623
+ spectrum = spectrum.pow(2.0)
624
+
625
+ # size (num_mel_bins, padded_window_size // 2)
626
+ # print(num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp)
627
+
628
+ cache_key = "%s-%s-%s-%s-%s-%s-%s-%s-%s-%s" % (
629
+ num_mel_bins,
630
+ padded_window_size,
631
+ sample_frequency,
632
+ low_freq,
633
+ high_freq,
634
+ vtln_low,
635
+ vtln_high,
636
+ vtln_warp,
637
+ device,
638
+ dtype,
639
+ )
640
+ if cache_key not in cache:
641
+ mel_energies = get_mel_banks(
642
+ num_mel_bins,
643
+ padded_window_size,
644
+ sample_frequency,
645
+ low_freq,
646
+ high_freq,
647
+ vtln_low,
648
+ vtln_high,
649
+ vtln_warp,
650
+ device,
651
+ dtype,
652
+ )
653
+ cache[cache_key] = mel_energies
654
+ else:
655
+ mel_energies = cache[cache_key]
656
+
657
+ # pad right column with zeros and add dimension, size (num_mel_bins, padded_window_size // 2 + 1)
658
+ mel_energies = torch.nn.functional.pad(mel_energies, (0, 1), mode="constant", value=0)
659
+
660
+ # sum with mel filterbanks over the power spectrum, size (m, num_mel_bins)
661
+ mel_energies = torch.mm(spectrum, mel_energies.T)
662
+ if use_log_fbank:
663
+ # avoid log of zero (which should be prevented anyway by dithering)
664
+ mel_energies = torch.max(mel_energies, _get_epsilon(device, dtype)).log()
665
+
666
+ # if use_energy then add it as the last column for htk_compat == true else first column
667
+ if use_energy:
668
+ signal_log_energy = signal_log_energy.unsqueeze(1) # size (m, 1)
669
+ # returns size (m, num_mel_bins + 1)
670
+ if htk_compat:
671
+ mel_energies = torch.cat((mel_energies, signal_log_energy), dim=1)
672
+ else:
673
+ mel_energies = torch.cat((signal_log_energy, mel_energies), dim=1)
674
+
675
+ mel_energies = _subtract_column_mean(mel_energies, subtract_mean)
676
+ return mel_energies
677
+
678
+
679
+ def _get_dct_matrix(num_ceps: int, num_mel_bins: int) -> Tensor:
680
+ # returns a dct matrix of size (num_mel_bins, num_ceps)
681
+ # size (num_mel_bins, num_mel_bins)
682
+ dct_matrix = torchaudio.functional.create_dct(num_mel_bins, num_mel_bins, "ortho")
683
+ # kaldi expects the first cepstral to be weighted sum of factor sqrt(1/num_mel_bins)
684
+ # this would be the first column in the dct_matrix for torchaudio as it expects a
685
+ # right multiply (which would be the first column of the kaldi's dct_matrix as kaldi
686
+ # expects a left multiply e.g. dct_matrix * vector).
687
+ dct_matrix[:, 0] = math.sqrt(1 / float(num_mel_bins))
688
+ dct_matrix = dct_matrix[:, :num_ceps]
689
+ return dct_matrix
690
+
691
+
692
+ def _get_lifter_coeffs(num_ceps: int, cepstral_lifter: float) -> Tensor:
693
+ # returns size (num_ceps)
694
+ # Compute liftering coefficients (scaling on cepstral coeffs)
695
+ # coeffs are numbered slightly differently from HTK: the zeroth index is C0, which is not affected.
696
+ i = torch.arange(num_ceps)
697
+ return 1.0 + 0.5 * cepstral_lifter * torch.sin(math.pi * i / cepstral_lifter)
698
+
699
+
700
+ def mfcc(
701
+ waveform: Tensor,
702
+ blackman_coeff: float = 0.42,
703
+ cepstral_lifter: float = 22.0,
704
+ channel: int = -1,
705
+ dither: float = 0.0,
706
+ energy_floor: float = 1.0,
707
+ frame_length: float = 25.0,
708
+ frame_shift: float = 10.0,
709
+ high_freq: float = 0.0,
710
+ htk_compat: bool = False,
711
+ low_freq: float = 20.0,
712
+ num_ceps: int = 13,
713
+ min_duration: float = 0.0,
714
+ num_mel_bins: int = 23,
715
+ preemphasis_coefficient: float = 0.97,
716
+ raw_energy: bool = True,
717
+ remove_dc_offset: bool = True,
718
+ round_to_power_of_two: bool = True,
719
+ sample_frequency: float = 16000.0,
720
+ snip_edges: bool = True,
721
+ subtract_mean: bool = False,
722
+ use_energy: bool = False,
723
+ vtln_high: float = -500.0,
724
+ vtln_low: float = 100.0,
725
+ vtln_warp: float = 1.0,
726
+ window_type: str = POVEY,
727
+ ) -> Tensor:
728
+ r"""Create a mfcc from a raw audio signal. This matches the input/output of Kaldi's
729
+ compute-mfcc-feats.
730
+
731
+ Args:
732
+ waveform (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
733
+ blackman_coeff (float, optional): Constant coefficient for generalized Blackman window. (Default: ``0.42``)
734
+ cepstral_lifter (float, optional): Constant that controls scaling of MFCCs (Default: ``22.0``)
735
+ channel (int, optional): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``)
736
+ dither (float, optional): Dithering constant (0.0 means no dither). If you turn this off, you should set
737
+ the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``0.0``)
738
+ energy_floor (float, optional): Floor on energy (absolute, not relative) in Spectrogram computation. Caution:
739
+ this floor is applied to the zeroth component, representing the total signal energy. The floor on the
740
+ individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (Default: ``1.0``)
741
+ frame_length (float, optional): Frame length in milliseconds (Default: ``25.0``)
742
+ frame_shift (float, optional): Frame shift in milliseconds (Default: ``10.0``)
743
+ high_freq (float, optional): High cutoff frequency for mel bins (if <= 0, offset from Nyquist)
744
+ (Default: ``0.0``)
745
+ htk_compat (bool, optional): If true, put energy last. Warning: not sufficient to get HTK compatible
746
+ features (need to change other parameters). (Default: ``False``)
747
+ low_freq (float, optional): Low cutoff frequency for mel bins (Default: ``20.0``)
748
+ num_ceps (int, optional): Number of cepstra in MFCC computation (including C0) (Default: ``13``)
749
+ min_duration (float, optional): Minimum duration of segments to process (in seconds). (Default: ``0.0``)
750
+ num_mel_bins (int, optional): Number of triangular mel-frequency bins (Default: ``23``)
751
+ preemphasis_coefficient (float, optional): Coefficient for use in signal preemphasis (Default: ``0.97``)
752
+ raw_energy (bool, optional): If True, compute energy before preemphasis and windowing (Default: ``True``)
753
+ remove_dc_offset (bool, optional): Subtract mean from waveform on each frame (Default: ``True``)
754
+ round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
755
+ to FFT. (Default: ``True``)
756
+ sample_frequency (float, optional): Waveform data sample frequency (must match the waveform file, if
757
+ specified there) (Default: ``16000.0``)
758
+ snip_edges (bool, optional): If True, end effects will be handled by outputting only frames that completely fit
759
+ in the file, and the number of frames depends on the frame_length. If False, the number of frames
760
+ depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``)
761
+ subtract_mean (bool, optional): Subtract mean of each feature file [CMS]; not recommended to do
762
+ it this way. (Default: ``False``)
763
+ use_energy (bool, optional): Add an extra dimension with energy to the FBANK output. (Default: ``False``)
764
+ vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function (if
765
+ negative, offset from high-mel-freq (Default: ``-500.0``)
766
+ vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function (Default: ``100.0``)
767
+ vtln_warp (float, optional): Vtln warp factor (only applicable if vtln_map not specified) (Default: ``1.0``)
768
+ window_type (str, optional): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman')
769
+ (Default: ``"povey"``)
770
+
771
+ Returns:
772
+ Tensor: A mfcc identical to what Kaldi would output. The shape is (m, ``num_ceps``)
773
+ where m is calculated in _get_strided
774
+ """
775
+ assert num_ceps <= num_mel_bins, "num_ceps cannot be larger than num_mel_bins: %d vs %d" % (num_ceps, num_mel_bins)
776
+
777
+ device, dtype = waveform.device, waveform.dtype
778
+
779
+ # The mel_energies should not be squared (use_power=True), not have mean subtracted
780
+ # (subtract_mean=False), and use log (use_log_fbank=True).
781
+ # size (m, num_mel_bins + use_energy)
782
+ feature = fbank(
783
+ waveform=waveform,
784
+ blackman_coeff=blackman_coeff,
785
+ channel=channel,
786
+ dither=dither,
787
+ energy_floor=energy_floor,
788
+ frame_length=frame_length,
789
+ frame_shift=frame_shift,
790
+ high_freq=high_freq,
791
+ htk_compat=htk_compat,
792
+ low_freq=low_freq,
793
+ min_duration=min_duration,
794
+ num_mel_bins=num_mel_bins,
795
+ preemphasis_coefficient=preemphasis_coefficient,
796
+ raw_energy=raw_energy,
797
+ remove_dc_offset=remove_dc_offset,
798
+ round_to_power_of_two=round_to_power_of_two,
799
+ sample_frequency=sample_frequency,
800
+ snip_edges=snip_edges,
801
+ subtract_mean=False,
802
+ use_energy=use_energy,
803
+ use_log_fbank=True,
804
+ use_power=True,
805
+ vtln_high=vtln_high,
806
+ vtln_low=vtln_low,
807
+ vtln_warp=vtln_warp,
808
+ window_type=window_type,
809
+ )
810
+
811
+ if use_energy:
812
+ # size (m)
813
+ signal_log_energy = feature[:, num_mel_bins if htk_compat else 0]
814
+ # offset is 0 if htk_compat==True else 1
815
+ mel_offset = int(not htk_compat)
816
+ feature = feature[:, mel_offset : (num_mel_bins + mel_offset)]
817
+
818
+ # size (num_mel_bins, num_ceps)
819
+ dct_matrix = _get_dct_matrix(num_ceps, num_mel_bins).to(dtype=dtype, device=device)
820
+
821
+ # size (m, num_ceps)
822
+ feature = feature.matmul(dct_matrix)
823
+
824
+ if cepstral_lifter != 0.0:
825
+ # size (1, num_ceps)
826
+ lifter_coeffs = _get_lifter_coeffs(num_ceps, cepstral_lifter).unsqueeze(0)
827
+ feature *= lifter_coeffs.to(device=device, dtype=dtype)
828
+
829
+ # if use_energy then replace the last column for htk_compat == true else first column
830
+ if use_energy:
831
+ feature[:, 0] = signal_log_energy
832
+
833
+ if htk_compat:
834
+ energy = feature[:, 0].unsqueeze(1) # size (m, 1)
835
+ feature = feature[:, 1:] # size (m, num_ceps - 1)
836
+ if not use_energy:
837
+ # scale on C0 (actually removing a scale we previously added that's
838
+ # part of one common definition of the cosine transform.)
839
+ energy *= math.sqrt(2)
840
+
841
+ feature = torch.cat((feature, energy), dim=1)
842
+
843
+ feature = _subtract_column_mean(feature, subtract_mean)
844
+ return feature
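Taken together, the functions above reproduce Kaldi-style feature extraction in pure PyTorch. A minimal usage sketch follows; the `GPT_SoVITS.eres2net.kaldi` import path is an assumption based on where this file sits in the commit, and the waveform is random data standing in for real audio:

```python
import torch

from GPT_SoVITS.eres2net import kaldi  # import path assumed from this commit's layout

# one second of fake mono audio at 16 kHz, shaped (channels, samples) as fbank/mfcc expect
waveform = torch.randn(1, 16000)

# Kaldi-style 80-dim log-mel filterbank features (25 ms window, 10 ms shift by default)
feats = kaldi.fbank(waveform, num_mel_bins=80, sample_frequency=16000.0, dither=0.0)
print(feats.shape)  # (98, 80) with snip_edges=True

# 13 MFCCs from the same clip
ceps = kaldi.mfcc(waveform, num_ceps=13, num_mel_bins=23, sample_frequency=16000.0)
print(ceps.shape)  # (98, 13)
```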
GPT_SoVITS/eres2net/pooling_layers.py ADDED
@@ -0,0 +1,101 @@
1
+ # Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
2
+ # Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
3
+
4
+ """This implementation is adapted from https://github.com/wenet-e2e/wespeaker."""
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+
10
+ class TAP(nn.Module):
11
+ """
12
+ Temporal average pooling, only first-order mean is considered
13
+ """
14
+
15
+ def __init__(self, **kwargs):
16
+ super(TAP, self).__init__()
17
+
18
+ def forward(self, x):
19
+ pooling_mean = x.mean(dim=-1)
20
+ # To be compatible with 2D input
21
+ pooling_mean = pooling_mean.flatten(start_dim=1)
22
+ return pooling_mean
23
+
24
+
25
+ class TSDP(nn.Module):
26
+ """
27
+ Temporal standard deviation pooling, only second-order std is considered
28
+ """
29
+
30
+ def __init__(self, **kwargs):
31
+ super(TSDP, self).__init__()
32
+
33
+ def forward(self, x):
34
+ # The last dimension is the temporal axis
35
+ pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-8)
36
+ pooling_std = pooling_std.flatten(start_dim=1)
37
+ return pooling_std
38
+
39
+
40
+ class TSTP(nn.Module):
41
+ """
42
+ Temporal statistics pooling, concatenate mean and std, which is used in
43
+ x-vector
44
+ Comment: simple concatenation can not make full use of both statistics
45
+ """
46
+
47
+ def __init__(self, **kwargs):
48
+ super(TSTP, self).__init__()
49
+
50
+ def forward(self, x):
51
+ # The last dimension is the temporal axis
52
+ pooling_mean = x.mean(dim=-1)
53
+ pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-8)
54
+ pooling_mean = pooling_mean.flatten(start_dim=1)
55
+ pooling_std = pooling_std.flatten(start_dim=1)
56
+
57
+ stats = torch.cat((pooling_mean, pooling_std), 1)
58
+ return stats
59
+
60
+
61
+ class ASTP(nn.Module):
62
+ """Attentive statistics pooling: Channel- and context-dependent
63
+ statistics pooling, first used in ECAPA_TDNN.
64
+ """
65
+
66
+ def __init__(self, in_dim, bottleneck_dim=128, global_context_att=False):
67
+ super(ASTP, self).__init__()
68
+ self.global_context_att = global_context_att
69
+
70
+ # Use Conv1d with stride == 1 rather than Linear, then we don't
71
+ # need to transpose inputs.
72
+ if global_context_att:
73
+ self.linear1 = nn.Conv1d(in_dim * 3, bottleneck_dim, kernel_size=1) # equals W and b in the paper
74
+ else:
75
+ self.linear1 = nn.Conv1d(in_dim, bottleneck_dim, kernel_size=1) # equals W and b in the paper
76
+ self.linear2 = nn.Conv1d(bottleneck_dim, in_dim, kernel_size=1) # equals V and k in the paper
77
+
78
+ def forward(self, x):
79
+ """
80
+ x: a 3-dimensional tensor in tdnn-based architecture (B,F,T)
81
+ or a 4-dimensional tensor in resnet architecture (B,C,F,T)
82
+ 0-dim: batch-dimension, last-dim: time-dimension (frame-dimension)
83
+ """
84
+ if len(x.shape) == 4:
85
+ x = x.reshape(x.shape[0], x.shape[1] * x.shape[2], x.shape[3])
86
+ assert len(x.shape) == 3
87
+
88
+ if self.global_context_att:
89
+ context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
90
+ context_std = torch.sqrt(torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
91
+ x_in = torch.cat((x, context_mean, context_std), dim=1)
92
+ else:
93
+ x_in = x
94
+
95
+ # DON'T use ReLU here! Training may struggle to converge with ReLU.
96
+ alpha = torch.tanh(self.linear1(x_in)) # alpha = F.relu(self.linear1(x_in))
97
+ alpha = torch.softmax(self.linear2(alpha), dim=2)
98
+ mean = torch.sum(alpha * x, dim=2)
99
+ var = torch.sum(alpha * (x**2), dim=2) - mean**2
100
+ std = torch.sqrt(var.clamp(min=1e-10))
101
+ return torch.cat([mean, std], dim=1)
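These pooling layers collapse the time axis of frame-level features into one utterance-level embedding. A small shape check, assuming a TDNN-style (batch, feature_dim, frames) input and the module path shown in the file header:

```python
import torch

from GPT_SoVITS.eres2net.pooling_layers import ASTP, TSTP  # path taken from the file header above

x = torch.randn(4, 256, 200)  # (batch, feature_dim, frames)

tstp = TSTP()
print(tstp(x).shape)  # torch.Size([4, 512]): mean and std concatenated

astp = ASTP(in_dim=256, bottleneck_dim=128)
print(astp(x).shape)  # torch.Size([4, 512]): attention-weighted mean and std
```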
GPT_SoVITS/f5_tts/model/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .backbones.dit import DiT
2
+
3
+ __all__ = ["DiT"]
GPT_SoVITS/f5_tts/model/backbones/README.md ADDED
@@ -0,0 +1,20 @@
1
+ ## Backbones quick introduction
2
+
3
+
4
+ ### unett.py
5
+ - flat unet transformer
6
+ - structure same as in e2-tts & voicebox paper except using rotary pos emb
7
+ - update: allow possible abs pos emb & convnextv2 blocks for embedded text before concat
8
+
9
+ ### dit.py
10
+ - adaln-zero dit
11
+ - embedded timestep as condition
12
+ - concatted noised_input + masked_cond + embedded_text, linear proj in (see the shape sketch below this README)
13
+ - possible abs pos emb & convnextv2 blocks for embedded text before concat
14
+ - possible long skip connection (first layer to last layer)
15
+
16
+ ### mmdit.py
17
+ - sd3 structure
18
+ - timestep as condition
19
+ - left stream: text embedded with an abs pos emb applied
20
+ - right stream: masked_cond & noised_input concatted and with same conv pos emb as unett
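A shape-level sketch of the input mixing described in the dit.py notes above (concatenate noised input, masked condition, and embedded text, then one linear projection). The dimensions are illustrative; the real logic lives in `InputEmbedding` in dit.py below:

```python
import torch

b, n, mel_dim, text_dim, model_dim = 2, 128, 100, 100, 512  # illustrative sizes

noised_input = torch.randn(b, n, mel_dim)
masked_cond = torch.randn(b, n, mel_dim)
embedded_text = torch.randn(b, n, text_dim)

# channel-wise concat followed by a single linear projection into the model width
x = torch.cat((noised_input, masked_cond, embedded_text), dim=-1)  # (b, n, 2*mel_dim + text_dim)
proj_in = torch.nn.Linear(mel_dim * 2 + text_dim, model_dim)
x = proj_in(x)  # (b, n, model_dim)
```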
GPT_SoVITS/f5_tts/model/backbones/dit.py ADDED
@@ -0,0 +1,193 @@
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import torch
13
+ from torch import nn
14
+ from torch.utils.checkpoint import checkpoint
15
+ from x_transformers.x_transformers import RotaryEmbedding
16
+
17
+ from GPT_SoVITS.module.commons import sequence_mask
18
+
19
+ from ..modules import (
20
+ AdaLayerNormZero_Final,
21
+ ConvNeXtV2Block,
22
+ ConvPositionEmbedding,
23
+ DiTBlock,
24
+ TimestepEmbedding,
25
+ get_pos_embed_indices,
26
+ precompute_freqs_cis,
27
+ )
28
+
29
+
30
+ class TextEmbedding(nn.Module):
31
+ def __init__(self, text_dim, conv_layers=0, conv_mult=2):
32
+ super().__init__()
33
+ if conv_layers > 0:
34
+ self.extra_modeling = True
35
+ self.precompute_max_pos = 4096 # ~44s of 24khz audio
36
+ self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
37
+ self.text_blocks = nn.Sequential(
38
+ *[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
39
+ )
40
+ else:
41
+ self.extra_modeling = False
42
+
43
+ def forward(self, text: int["b nt"], seq_len, drop_text=False): # noqa: F722
44
+ batch, text_len = text.shape[0], text.shape[1]
45
+
46
+ if drop_text: # cfg for text
47
+ text = torch.zeros_like(text)
48
+
49
+ # possible extra modeling
50
+ if self.extra_modeling:
51
+ # sinus pos emb
52
+ batch_start = torch.zeros((batch,), dtype=torch.long)
53
+ pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
54
+ text_pos_embed = self.freqs_cis[pos_idx]
55
+
56
+ # print(23333333,text.shape,text_pos_embed.shape)#torch.Size([7, 465, 256]) torch.Size([7, 465, 256])
57
+
58
+ text = text + text_pos_embed
59
+
60
+ # convnextv2 blocks
61
+ text = self.text_blocks(text)
62
+
63
+ return text
64
+
65
+
66
+ # noised input audio and context mixing embedding
67
+
68
+
69
+ class InputEmbedding(nn.Module):
70
+ def __init__(self, mel_dim, text_dim, out_dim):
71
+ super().__init__()
72
+ self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
73
+ self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)
74
+
75
+ def forward(self, x: float["b n d"], cond: float["b n d"], text_embed: float["b n d"], drop_audio_cond=False): # noqa: F722
76
+ if drop_audio_cond: # cfg for cond audio
77
+ cond = torch.zeros_like(cond)
78
+
79
+ x = self.proj(torch.cat((x, cond, text_embed), dim=-1))
80
+ x = self.conv_pos_embed(x) + x
81
+ return x
82
+
83
+
84
+ # Transformer backbone using DiT blocks
85
+
86
+
87
+ class DiT(nn.Module):
88
+ def __init__(
89
+ self,
90
+ *,
91
+ dim,
92
+ depth=8,
93
+ heads=8,
94
+ dim_head=64,
95
+ dropout=0.1,
96
+ ff_mult=4,
97
+ mel_dim=100,
98
+ text_dim=None,
99
+ conv_layers=0,
100
+ long_skip_connection=False,
101
+ ):
102
+ super().__init__()
103
+
104
+ self.time_embed = TimestepEmbedding(dim)
105
+ self.d_embed = TimestepEmbedding(dim)
106
+ if text_dim is None:
107
+ text_dim = mel_dim
108
+ self.text_embed = TextEmbedding(text_dim, conv_layers=conv_layers)
109
+ self.input_embed = InputEmbedding(mel_dim, text_dim, dim)
110
+
111
+ self.rotary_embed = RotaryEmbedding(dim_head)
112
+
113
+ self.dim = dim
114
+ self.depth = depth
115
+
116
+ self.transformer_blocks = nn.ModuleList(
117
+ [DiTBlock(dim=dim, heads=heads, dim_head=dim_head, ff_mult=ff_mult, dropout=dropout) for _ in range(depth)]
118
+ )
119
+ self.long_skip_connection = nn.Linear(dim * 2, dim, bias=False) if long_skip_connection else None
120
+
121
+ self.norm_out = AdaLayerNormZero_Final(dim) # final modulation
122
+ self.proj_out = nn.Linear(dim, mel_dim)
123
+
124
+ def ckpt_wrapper(self, module):
125
+ # https://github.com/chuanyangjin/fast-DiT/blob/main/models.py
126
+ def ckpt_forward(*inputs):
127
+ outputs = module(*inputs)
128
+ return outputs
129
+
130
+ return ckpt_forward
131
+
132
+ def forward( # x, prompt_x, x_lens, t, style, cond
133
+ self, # d is channel, n is T
134
+ x0: float["b n d"], # noised input audio # noqa: F722
135
+ cond0: float["b n d"], # masked cond audio # noqa: F722
136
+ x_lens,
137
+ time: float["b"] | float[""], # time step # noqa: F821 F722
138
+ dt_base_bootstrap,
139
+ text0, # : int["b nt"] # noqa: F722 # condition feature
140
+ use_grad_ckpt=False, # bool
141
+ ###no-use
142
+ drop_audio_cond=False, # cfg for cond audio
143
+ drop_text=False, # cfg for text
144
+ # mask: bool["b n"] | None = None, # noqa: F722
145
+ infer=False, # bool
146
+ text_cache=None, # torch tensor as text_embed
147
+ dt_cache=None, # torch tensor as dt
148
+ ):
149
+ x = x0.transpose(2, 1)
150
+ cond = cond0.transpose(2, 1)
151
+ text = text0.transpose(2, 1)
152
+ mask = sequence_mask(x_lens, max_length=x.size(1)).to(x.device)
153
+
154
+ batch, seq_len = x.shape[0], x.shape[1]
155
+ if time.ndim == 0:
156
+ time = time.repeat(batch)
157
+
158
+ # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
159
+ t = self.time_embed(time)
160
+ if infer and dt_cache is not None:
161
+ dt = dt_cache
162
+ else:
163
+ dt = self.d_embed(dt_base_bootstrap)
164
+ t += dt
165
+
166
+ if infer and text_cache is not None:
167
+ text_embed = text_cache
168
+ else:
169
+ text_embed = self.text_embed(text, seq_len, drop_text=drop_text) ###need to change
170
+
171
+ x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)
172
+
173
+ rope = self.rotary_embed.forward_from_seq_len(seq_len)
174
+
175
+ if self.long_skip_connection is not None:
176
+ residual = x
177
+
178
+ for block in self.transformer_blocks:
179
+ if use_grad_ckpt:
180
+ x = checkpoint(self.ckpt_wrapper(block), x, t, mask, rope, use_reentrant=False)
181
+ else:
182
+ x = block(x, t, mask=mask, rope=rope)
183
+
184
+ if self.long_skip_connection is not None:
185
+ x = self.long_skip_connection(torch.cat((x, residual), dim=-1))
186
+
187
+ x = self.norm_out(x, t)
188
+ output = self.proj_out(x)
189
+
190
+ if infer:
191
+ return output, text_embed, dt
192
+ else:
193
+ return output
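A minimal forward-pass sketch for the DiT backbone above. The channel-first input layout follows the transposes at the top of `forward`; the hyperparameters and shapes are illustrative, not the values this repo trains with:

```python
import torch

from GPT_SoVITS.f5_tts.model import DiT  # exported by the package __init__ above

b, n, mel_dim, text_dim = 2, 256, 100, 100  # illustrative sizes
model = DiT(dim=512, depth=4, heads=8, dim_head=64, mel_dim=mel_dim, text_dim=text_dim, conv_layers=4)

x0 = torch.randn(b, mel_dim, n)      # noised mel, channel-first as expected by forward
cond0 = torch.randn(b, mel_dim, n)   # masked conditioning mel
text0 = torch.randn(b, text_dim, n)  # conditioning features, same length as the mel
x_lens = torch.tensor([n, n // 2])   # per-item valid lengths
t = torch.rand(b)                    # flow-matching time step
dt = torch.rand(b)                   # dt_base_bootstrap

out = model(x0, cond0, x_lens, t, dt, text0)
print(out.shape)  # (b, n, mel_dim)
```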
GPT_SoVITS/f5_tts/model/backbones/mmdit.py ADDED
@@ -0,0 +1,144 @@
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import torch
13
+ from torch import nn
14
+ from x_transformers.x_transformers import RotaryEmbedding
15
+
16
+ from ..modules import (
17
+ AdaLayerNormZero_Final,
18
+ ConvPositionEmbedding,
19
+ MMDiTBlock,
20
+ TimestepEmbedding,
21
+ get_pos_embed_indices,
22
+ precompute_freqs_cis,
23
+ )
24
+
25
+ # text embedding
26
+
27
+
28
+ class TextEmbedding(nn.Module):
29
+ def __init__(self, out_dim, text_num_embeds):
30
+ super().__init__()
31
+ self.text_embed = nn.Embedding(text_num_embeds + 1, out_dim) # will use 0 as filler token
32
+
33
+ self.precompute_max_pos = 1024
34
+ self.register_buffer("freqs_cis", precompute_freqs_cis(out_dim, self.precompute_max_pos), persistent=False)
35
+
36
+ def forward(self, text: int["b nt"], drop_text=False) -> int["b nt d"]: # noqa: F722
37
+ text = text + 1
38
+ if drop_text:
39
+ text = torch.zeros_like(text)
40
+ text = self.text_embed(text)
41
+
42
+ # sinus pos emb
43
+ batch_start = torch.zeros((text.shape[0],), dtype=torch.long)
44
+ batch_text_len = text.shape[1]
45
+ pos_idx = get_pos_embed_indices(batch_start, batch_text_len, max_pos=self.precompute_max_pos)
46
+ text_pos_embed = self.freqs_cis[pos_idx]
47
+
48
+ text = text + text_pos_embed
49
+
50
+ return text
51
+
52
+
53
+ # noised input & masked cond audio embedding
54
+
55
+
56
+ class AudioEmbedding(nn.Module):
57
+ def __init__(self, in_dim, out_dim):
58
+ super().__init__()
59
+ self.linear = nn.Linear(2 * in_dim, out_dim)
60
+ self.conv_pos_embed = ConvPositionEmbedding(out_dim)
61
+
62
+ def forward(self, x: float["b n d"], cond: float["b n d"], drop_audio_cond=False): # noqa: F722
63
+ if drop_audio_cond:
64
+ cond = torch.zeros_like(cond)
65
+ x = torch.cat((x, cond), dim=-1)
66
+ x = self.linear(x)
67
+ x = self.conv_pos_embed(x) + x
68
+ return x
69
+
70
+
71
+ # Transformer backbone using MM-DiT blocks
72
+
73
+
74
+ class MMDiT(nn.Module):
75
+ def __init__(
76
+ self,
77
+ *,
78
+ dim,
79
+ depth=8,
80
+ heads=8,
81
+ dim_head=64,
82
+ dropout=0.1,
83
+ ff_mult=4,
84
+ text_num_embeds=256,
85
+ mel_dim=100,
86
+ ):
87
+ super().__init__()
88
+
89
+ self.time_embed = TimestepEmbedding(dim)
90
+ self.text_embed = TextEmbedding(dim, text_num_embeds)
91
+ self.audio_embed = AudioEmbedding(mel_dim, dim)
92
+
93
+ self.rotary_embed = RotaryEmbedding(dim_head)
94
+
95
+ self.dim = dim
96
+ self.depth = depth
97
+
98
+ self.transformer_blocks = nn.ModuleList(
99
+ [
100
+ MMDiTBlock(
101
+ dim=dim,
102
+ heads=heads,
103
+ dim_head=dim_head,
104
+ dropout=dropout,
105
+ ff_mult=ff_mult,
106
+ context_pre_only=i == depth - 1,
107
+ )
108
+ for i in range(depth)
109
+ ]
110
+ )
111
+ self.norm_out = AdaLayerNormZero_Final(dim) # final modulation
112
+ self.proj_out = nn.Linear(dim, mel_dim)
113
+
114
+ def forward(
115
+ self,
116
+ x: float["b n d"], # noised input audio # noqa: F722
117
+ cond: float["b n d"], # masked cond audio # noqa: F722
118
+ text: int["b nt"], # text # noqa: F722
119
+ time: float["b"] | float[""], # time step # noqa: F821 F722
120
+ drop_audio_cond, # cfg for cond audio
121
+ drop_text, # cfg for text
122
+ mask: bool["b n"] | None = None, # noqa: F722
123
+ ):
124
+ batch = x.shape[0]
125
+ if time.ndim == 0:
126
+ time = time.repeat(batch)
127
+
128
+ # t: conditioning (time), c: context (text + masked cond audio), x: noised input audio
129
+ t = self.time_embed(time)
130
+ c = self.text_embed(text, drop_text=drop_text)
131
+ x = self.audio_embed(x, cond, drop_audio_cond=drop_audio_cond)
132
+
133
+ seq_len = x.shape[1]
134
+ text_len = text.shape[1]
135
+ rope_audio = self.rotary_embed.forward_from_seq_len(seq_len)
136
+ rope_text = self.rotary_embed.forward_from_seq_len(text_len)
137
+
138
+ for block in self.transformer_blocks:
139
+ c, x = block(x, c, t, mask=mask, rope=rope_audio, c_rope=rope_text)
140
+
141
+ x = self.norm_out(x, t)
142
+ output = self.proj_out(x)
143
+
144
+ return output
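By contrast with DiT above, MMDiT keeps the text as integer tokens in its own stream. A call sketch with illustrative sizes; the import path is assumed from the file header:

```python
import torch

from GPT_SoVITS.f5_tts.model.backbones.mmdit import MMDiT  # path assumed from the file header above

model = MMDiT(dim=512, depth=4, heads=8, dim_head=64, text_num_embeds=256, mel_dim=100)

b, n, nt = 2, 256, 64
x = torch.randn(b, n, 100)             # noised mel, (b, n, d)
cond = torch.randn(b, n, 100)          # masked conditioning mel
text = torch.randint(0, 256, (b, nt))  # token ids in [0, text_num_embeds)
time = torch.rand(b)

out = model(x, cond, text, time, drop_audio_cond=False, drop_text=False)
print(out.shape)  # (b, n, 100)
```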
GPT_SoVITS/f5_tts/model/backbones/unett.py ADDED
@@ -0,0 +1,218 @@
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import Literal
13
+
14
+ import torch
15
+ import torch.nn.functional as F
16
+ from torch import nn
17
+ from x_transformers import RMSNorm
18
+ from x_transformers.x_transformers import RotaryEmbedding
19
+
20
+ from ..modules import (
21
+ Attention,
22
+ AttnProcessor,
23
+ ConvNeXtV2Block,
24
+ ConvPositionEmbedding,
25
+ FeedForward,
26
+ TimestepEmbedding,
27
+ get_pos_embed_indices,
28
+ precompute_freqs_cis,
29
+ )
30
+
31
+ # Text embedding
32
+
33
+
34
+ class TextEmbedding(nn.Module):
35
+ def __init__(self, text_num_embeds, text_dim, conv_layers=0, conv_mult=2):
36
+ super().__init__()
37
+ self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim) # use 0 as filler token
38
+
39
+ if conv_layers > 0:
40
+ self.extra_modeling = True
41
+ self.precompute_max_pos = 4096 # ~44s of 24khz audio
42
+ self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
43
+ self.text_blocks = nn.Sequential(
44
+ *[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
45
+ )
46
+ else:
47
+ self.extra_modeling = False
48
+
49
+ def forward(self, text: int["b nt"], seq_len, drop_text=False): # noqa: F722
50
+ text = text + 1 # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
51
+ text = text[:, :seq_len] # curtail if character tokens are more than the mel spec tokens
52
+ batch, text_len = text.shape[0], text.shape[1]
53
+ text = F.pad(text, (0, seq_len - text_len), value=0)
54
+
55
+ if drop_text: # cfg for text
56
+ text = torch.zeros_like(text)
57
+
58
+ text = self.text_embed(text) # b n -> b n d
59
+
60
+ # possible extra modeling
61
+ if self.extra_modeling:
62
+ # sinus pos emb
63
+ batch_start = torch.zeros((batch,), dtype=torch.long)
64
+ pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
65
+ text_pos_embed = self.freqs_cis[pos_idx]
66
+ text = text + text_pos_embed
67
+
68
+ # convnextv2 blocks
69
+ text = self.text_blocks(text)
70
+
71
+ return text
72
+
73
+
74
+ # noised input audio and context mixing embedding
75
+
76
+
77
+ class InputEmbedding(nn.Module):
78
+ def __init__(self, mel_dim, text_dim, out_dim):
79
+ super().__init__()
80
+ self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
81
+ self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)
82
+
83
+ def forward(self, x: float["b n d"], cond: float["b n d"], text_embed: float["b n d"], drop_audio_cond=False): # noqa: F722
84
+ if drop_audio_cond: # cfg for cond audio
85
+ cond = torch.zeros_like(cond)
86
+
87
+ x = self.proj(torch.cat((x, cond, text_embed), dim=-1))
88
+ x = self.conv_pos_embed(x) + x
89
+ return x
90
+
91
+
92
+ # Flat UNet Transformer backbone
93
+
94
+
95
+ class UNetT(nn.Module):
96
+ def __init__(
97
+ self,
98
+ *,
99
+ dim,
100
+ depth=8,
101
+ heads=8,
102
+ dim_head=64,
103
+ dropout=0.1,
104
+ ff_mult=4,
105
+ mel_dim=100,
106
+ text_num_embeds=256,
107
+ text_dim=None,
108
+ conv_layers=0,
109
+ skip_connect_type: Literal["add", "concat", "none"] = "concat",
110
+ ):
111
+ super().__init__()
112
+ assert depth % 2 == 0, "UNet-Transformer's depth should be even."
113
+
114
+ self.time_embed = TimestepEmbedding(dim)
115
+ if text_dim is None:
116
+ text_dim = mel_dim
117
+ self.text_embed = TextEmbedding(text_num_embeds, text_dim, conv_layers=conv_layers)
118
+ self.input_embed = InputEmbedding(mel_dim, text_dim, dim)
119
+
120
+ self.rotary_embed = RotaryEmbedding(dim_head)
121
+
122
+ # transformer layers & skip connections
123
+
124
+ self.dim = dim
125
+ self.skip_connect_type = skip_connect_type
126
+ needs_skip_proj = skip_connect_type == "concat"
127
+
128
+ self.depth = depth
129
+ self.layers = nn.ModuleList([])
130
+
131
+ for idx in range(depth):
132
+ is_later_half = idx >= (depth // 2)
133
+
134
+ attn_norm = RMSNorm(dim)
135
+ attn = Attention(
136
+ processor=AttnProcessor(),
137
+ dim=dim,
138
+ heads=heads,
139
+ dim_head=dim_head,
140
+ dropout=dropout,
141
+ )
142
+
143
+ ff_norm = RMSNorm(dim)
144
+ ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
145
+
146
+ skip_proj = nn.Linear(dim * 2, dim, bias=False) if needs_skip_proj and is_later_half else None
147
+
148
+ self.layers.append(
149
+ nn.ModuleList(
150
+ [
151
+ skip_proj,
152
+ attn_norm,
153
+ attn,
154
+ ff_norm,
155
+ ff,
156
+ ]
157
+ )
158
+ )
159
+
160
+ self.norm_out = RMSNorm(dim)
161
+ self.proj_out = nn.Linear(dim, mel_dim)
162
+
163
+ def forward(
164
+ self,
165
+ x: float["b n d"], # noised input audio # noqa: F722
166
+ cond: float["b n d"], # masked cond audio # noqa: F722
167
+ text: int["b nt"], # text # noqa: F722
168
+ time: float["b"] | float[""], # time step # noqa: F821 F722
169
+ drop_audio_cond, # cfg for cond audio
170
+ drop_text, # cfg for text
171
+ mask: bool["b n"] | None = None, # noqa: F722
172
+ ):
173
+ batch, seq_len = x.shape[0], x.shape[1]
174
+ if time.ndim == 0:
175
+ time = time.repeat(batch)
176
+
177
+ # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
178
+ t = self.time_embed(time)
179
+ text_embed = self.text_embed(text, seq_len, drop_text=drop_text)
180
+ x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)
181
+
182
+ # prepend time t to input x as an extra token, [b n d] -> [b n+1 d]
183
+ x = torch.cat([t.unsqueeze(1), x], dim=1) # pack t to x
184
+ if mask is not None:
185
+ mask = F.pad(mask, (1, 0), value=1)
186
+
187
+ rope = self.rotary_embed.forward_from_seq_len(seq_len + 1)
188
+
189
+ # flat unet transformer
190
+ skip_connect_type = self.skip_connect_type
191
+ skips = []
192
+ for idx, (maybe_skip_proj, attn_norm, attn, ff_norm, ff) in enumerate(self.layers):
193
+ layer = idx + 1
194
+
195
+ # skip connection logic
196
+ is_first_half = layer <= (self.depth // 2)
197
+ is_later_half = not is_first_half
198
+
199
+ if is_first_half:
200
+ skips.append(x)
201
+
202
+ if is_later_half:
203
+ skip = skips.pop()
204
+ if skip_connect_type == "concat":
205
+ x = torch.cat((x, skip), dim=-1)
206
+ x = maybe_skip_proj(x)
207
+ elif skip_connect_type == "add":
208
+ x = x + skip
209
+
210
+ # attention and feedforward blocks
211
+ x = attn(attn_norm(x), rope=rope, mask=mask) + x
212
+ x = ff(ff_norm(x)) + x
213
+
214
+ assert len(skips) == 0
215
+
216
+ x = self.norm_out(x)[:, 1:, :] # unpack t from x
217
+
218
+ return self.proj_out(x)
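
A minimal shape-check sketch for the UNetT backbone above; the sizes below are illustrative (not the training configuration) and assume UNetT and its submodules are importable from this file:

import torch

model = UNetT(dim=256, depth=8, heads=8, dim_head=64, mel_dim=100, text_num_embeds=256, conv_layers=2)
x = torch.randn(2, 128, 100)            # noised mel frames, "b n d"
cond = torch.randn(2, 128, 100)         # masked conditioning audio, "b n d"
text = torch.randint(0, 256, (2, 32))   # text token ids, "b nt"
time = torch.rand(2)                    # one diffusion time step per batch item, "b"
out = model(x, cond, text, time, drop_audio_cond=False, drop_text=False)
print(out.shape)                        # expected: torch.Size([2, 128, 100])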
GPT_SoVITS/f5_tts/model/modules.py ADDED
@@ -0,0 +1,665 @@
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import math
13
+ from typing import Optional
14
+
15
+ import torch
16
+ import torch.nn.functional as F
17
+ import torchaudio
18
+ from librosa.filters import mel as librosa_mel_fn
19
+ from torch import nn
20
+ from x_transformers.x_transformers import apply_rotary_pos_emb
21
+
22
+ # raw wav to mel spec
23
+
24
+
25
+ mel_basis_cache = {}
26
+ hann_window_cache = {}
27
+
28
+
29
+ def get_bigvgan_mel_spectrogram(
30
+ waveform,
31
+ n_fft=1024,
32
+ n_mel_channels=100,
33
+ target_sample_rate=24000,
34
+ hop_length=256,
35
+ win_length=1024,
36
+ fmin=0,
37
+ fmax=None,
38
+ center=False,
39
+ ): # Copy from https://github.com/NVIDIA/BigVGAN/tree/main
40
+ device = waveform.device
41
+ key = f"{n_fft}_{n_mel_channels}_{target_sample_rate}_{hop_length}_{win_length}_{fmin}_{fmax}_{device}"
42
+
43
+ if key not in mel_basis_cache:
44
+ mel = librosa_mel_fn(sr=target_sample_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=fmin, fmax=fmax)
45
+ mel_basis_cache[key] = torch.from_numpy(mel).float().to(device) # TODO: why is .float() needed here?
46
+ hann_window_cache[key] = torch.hann_window(win_length).to(device)
47
+
48
+ mel_basis = mel_basis_cache[key]
49
+ hann_window = hann_window_cache[key]
50
+
51
+ padding = (n_fft - hop_length) // 2
52
+ waveform = torch.nn.functional.pad(waveform.unsqueeze(1), (padding, padding), mode="reflect").squeeze(1)
53
+
54
+ spec = torch.stft(
55
+ waveform,
56
+ n_fft,
57
+ hop_length=hop_length,
58
+ win_length=win_length,
59
+ window=hann_window,
60
+ center=center,
61
+ pad_mode="reflect",
62
+ normalized=False,
63
+ onesided=True,
64
+ return_complex=True,
65
+ )
66
+ spec = torch.sqrt(torch.view_as_real(spec).pow(2).sum(-1) + 1e-9)
67
+
68
+ mel_spec = torch.matmul(mel_basis, spec)
69
+ mel_spec = torch.log(torch.clamp(mel_spec, min=1e-5))
70
+
71
+ return mel_spec
72
+
73
+
74
+ def get_vocos_mel_spectrogram(
75
+ waveform,
76
+ n_fft=1024,
77
+ n_mel_channels=100,
78
+ target_sample_rate=24000,
79
+ hop_length=256,
80
+ win_length=1024,
81
+ ):
82
+ mel_stft = torchaudio.transforms.MelSpectrogram(
83
+ sample_rate=target_sample_rate,
84
+ n_fft=n_fft,
85
+ win_length=win_length,
86
+ hop_length=hop_length,
87
+ n_mels=n_mel_channels,
88
+ power=1,
89
+ center=True,
90
+ normalized=False,
91
+ norm=None,
92
+ ).to(waveform.device)
93
+ if len(waveform.shape) == 3:
94
+ waveform = waveform.squeeze(1) # 'b 1 nw -> b nw'
95
+
96
+ assert len(waveform.shape) == 2
97
+
98
+ mel = mel_stft(waveform)
99
+ mel = mel.clamp(min=1e-5).log()
100
+ return mel
101
+
102
+
103
+ class MelSpec(nn.Module):
104
+ def __init__(
105
+ self,
106
+ n_fft=1024,
107
+ hop_length=256,
108
+ win_length=1024,
109
+ n_mel_channels=100,
110
+ target_sample_rate=24_000,
111
+ mel_spec_type="vocos",
112
+ ):
113
+ super().__init__()
114
+ assert mel_spec_type in ["vocos", "bigvgan"], print("We only support two extract mel backend: vocos or bigvgan")
115
+
116
+ self.n_fft = n_fft
117
+ self.hop_length = hop_length
118
+ self.win_length = win_length
119
+ self.n_mel_channels = n_mel_channels
120
+ self.target_sample_rate = target_sample_rate
121
+
122
+ if mel_spec_type == "vocos":
123
+ self.extractor = get_vocos_mel_spectrogram
124
+ elif mel_spec_type == "bigvgan":
125
+ self.extractor = get_bigvgan_mel_spectrogram
126
+
127
+ self.register_buffer("dummy", torch.tensor(0), persistent=False)
128
+
129
+ def forward(self, wav):
130
+ if self.dummy.device != wav.device:
131
+ self.to(wav.device)
132
+
133
+ mel = self.extractor(
134
+ waveform=wav,
135
+ n_fft=self.n_fft,
136
+ n_mel_channels=self.n_mel_channels,
137
+ target_sample_rate=self.target_sample_rate,
138
+ hop_length=self.hop_length,
139
+ win_length=self.win_length,
140
+ )
141
+
142
+ return mel
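
A hedged usage sketch of MelSpec with the defaults above (24 kHz, 1024-point FFT, hop 256, 100 mel bins); the waveform is random noise purely to exercise the shapes:

import torch

mel_spec = MelSpec(mel_spec_type="vocos")
wav = torch.randn(1, 24000)    # one second of audio, "b nw"
mel = mel_spec(wav)
print(mel.shape)               # torch.Size([1, 100, 94]), roughly nw / hop_length frames (center=True adds one)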
143
+
144
+
145
+ # sinusoidal position embedding
146
+
147
+
148
+ class SinusPositionEmbedding(nn.Module):
149
+ def __init__(self, dim):
150
+ super().__init__()
151
+ self.dim = dim
152
+
153
+ def forward(self, x, scale=1000):
154
+ device = x.device
155
+ half_dim = self.dim // 2
156
+ emb = math.log(10000) / (half_dim - 1)
157
+ emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
158
+ emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
159
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
160
+ return emb
161
+
162
+
163
+ # convolutional position embedding
164
+
165
+
166
+ class ConvPositionEmbedding(nn.Module):
167
+ def __init__(self, dim, kernel_size=31, groups=16):
168
+ super().__init__()
169
+ assert kernel_size % 2 != 0
170
+ self.conv1d = nn.Sequential(
171
+ nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=kernel_size // 2),
172
+ nn.Mish(),
173
+ nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=kernel_size // 2),
174
+ nn.Mish(),
175
+ )
176
+
177
+ def forward(self, x: float["b n d"], mask: bool["b n"] | None = None): # noqa: F722
178
+ if mask is not None:
179
+ mask = mask[..., None]
180
+ x = x.masked_fill(~mask, 0.0)
181
+
182
+ x = x.permute(0, 2, 1)
183
+ x = self.conv1d(x)
184
+ out = x.permute(0, 2, 1)
185
+
186
+ if mask is not None:
187
+ out = out.masked_fill(~mask, 0.0)
188
+
189
+ return out
190
+
191
+
192
+ # rotary positional embedding related
193
+
194
+
195
+ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, theta_rescale_factor=1.0):
196
+ # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
197
+ # has some connection to NTK literature
198
+ # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
199
+ # https://github.com/lucidrains/rotary-embedding-torch/blob/main/rotary_embedding_torch/rotary_embedding_torch.py
200
+ theta *= theta_rescale_factor ** (dim / (dim - 2))
201
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
202
+ t = torch.arange(end, device=freqs.device) # type: ignore
203
+ freqs = torch.outer(t, freqs).float() # type: ignore
204
+ freqs_cos = torch.cos(freqs) # real part
205
+ freqs_sin = torch.sin(freqs) # imaginary part
206
+ return torch.cat([freqs_cos, freqs_sin], dim=-1)
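
A small behaviour check for precompute_freqs_cis: the returned table stacks the cosine and sine halves along the last dimension, and a theta_rescale_factor above 1 enlarges the effective theta so the rotations advance more slowly per position (the NTK-aware rescaling described in the comments above). The values here are only illustrative:

freqs = precompute_freqs_cis(dim=64, end=1024)
freqs_long = precompute_freqs_cis(dim=64, end=4096, theta_rescale_factor=4.0)
print(freqs.shape, freqs_long.shape)   # torch.Size([1024, 64]) torch.Size([4096, 64]); columns are [cos | sin]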
207
+
208
+
209
+ def get_pos_embed_indices(start, length, max_pos, scale=1.0):
210
+ # length = length if isinstance(length, int) else length.max()
211
+ scale = scale * torch.ones_like(start, dtype=torch.float32) # in case scale is a scalar
212
+ pos = (
213
+ start.unsqueeze(1)
214
+ + (torch.arange(length, device=start.device, dtype=torch.float32).unsqueeze(0) * scale.unsqueeze(1)).long()
215
+ )
216
+ # avoid extra long error.
217
+ pos = torch.where(pos < max_pos, pos, max_pos - 1)
218
+ return pos
219
+
220
+
221
+ # Global Response Normalization layer (Instance Normalization ?)
222
+
223
+
224
+ class GRN(nn.Module):
225
+ def __init__(self, dim):
226
+ super().__init__()
227
+ self.gamma = nn.Parameter(torch.zeros(1, 1, dim))
228
+ self.beta = nn.Parameter(torch.zeros(1, 1, dim))
229
+
230
+ def forward(self, x):
231
+ Gx = torch.norm(x, p=2, dim=1, keepdim=True)
232
+ Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6)
233
+ return self.gamma * (x * Nx) + self.beta + x
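
One property of GRN as written that is worth noting: gamma and beta are initialised to zero, so at initialisation the layer is exactly the identity and the normalisation path only takes effect as gamma is learned. A quick, illustrative check:

import torch

grn = GRN(dim=16)
x = torch.randn(2, 10, 16)          # "b n d"
assert torch.allclose(grn(x), x)    # gamma = beta = 0 at init, so the output equals the input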
234
+
235
+
236
+ # ConvNeXt-V2 Block https://github.com/facebookresearch/ConvNeXt-V2/blob/main/models/convnextv2.py
237
+ # ref: https://github.com/bfs18/e2_tts/blob/main/rfwave/modules.py#L108
238
+
239
+
240
+ class ConvNeXtV2Block(nn.Module):
241
+ def __init__(
242
+ self,
243
+ dim: int,
244
+ intermediate_dim: int,
245
+ dilation: int = 1,
246
+ ):
247
+ super().__init__()
248
+ padding = (dilation * (7 - 1)) // 2
249
+ self.dwconv = nn.Conv1d(
250
+ dim, dim, kernel_size=7, padding=padding, groups=dim, dilation=dilation
251
+ ) # depthwise conv
252
+ self.norm = nn.LayerNorm(dim, eps=1e-6)
253
+ self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers
254
+ self.act = nn.GELU()
255
+ self.grn = GRN(intermediate_dim)
256
+ self.pwconv2 = nn.Linear(intermediate_dim, dim)
257
+
258
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
259
+ residual = x
260
+ x = x.transpose(1, 2) # b n d -> b d n
261
+ x = self.dwconv(x)
262
+ x = x.transpose(1, 2) # b d n -> b n d
263
+ x = self.norm(x)
264
+ x = self.pwconv1(x)
265
+ x = self.act(x)
266
+ x = self.grn(x)
267
+ x = self.pwconv2(x)
268
+ return residual + x
269
+
270
+
271
+ # AdaLayerNormZero
272
+ # return with modulated x for attn input, and params for later mlp modulation
273
+
274
+
275
+ class AdaLayerNormZero(nn.Module):
276
+ def __init__(self, dim):
277
+ super().__init__()
278
+
279
+ self.silu = nn.SiLU()
280
+ self.linear = nn.Linear(dim, dim * 6)
281
+
282
+ self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
283
+
284
+ def forward(self, x, emb=None):
285
+ emb = self.linear(self.silu(emb))
286
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = torch.chunk(emb, 6, dim=1)
287
+
288
+ x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
289
+ return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
290
+
291
+
292
+ # AdaLayerNormZero for final layer
293
+ # return only the modulated x for attn input, since there is no further mlp modulation
294
+
295
+
296
+ class AdaLayerNormZero_Final(nn.Module):
297
+ def __init__(self, dim):
298
+ super().__init__()
299
+
300
+ self.silu = nn.SiLU()
301
+ self.linear = nn.Linear(dim, dim * 2)
302
+
303
+ self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
304
+
305
+ def forward(self, x, emb):
306
+ emb = self.linear(self.silu(emb))
307
+ scale, shift = torch.chunk(emb, 2, dim=1)
308
+
309
+ x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
310
+ return x
311
+
312
+
313
+ # FeedForward
314
+
315
+
316
+ class FeedForward(nn.Module):
317
+ def __init__(self, dim, dim_out=None, mult=4, dropout=0.0, approximate: str = "none"):
318
+ super().__init__()
319
+ inner_dim = int(dim * mult)
320
+ dim_out = dim_out if dim_out is not None else dim
321
+
322
+ activation = nn.GELU(approximate=approximate)
323
+ project_in = nn.Sequential(nn.Linear(dim, inner_dim), activation)
324
+ self.ff = nn.Sequential(project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out))
325
+
326
+ def forward(self, x):
327
+ return self.ff(x)
328
+
329
+
330
+ # Attention with possible joint part
331
+ # modified from diffusers/src/diffusers/models/attention_processor.py
332
+
333
+
334
+ class Attention(nn.Module):
335
+ def __init__(
336
+ self,
337
+ processor: JointAttnProcessor | AttnProcessor,
338
+ dim: int,
339
+ heads: int = 8,
340
+ dim_head: int = 64,
341
+ dropout: float = 0.0,
342
+ context_dim: Optional[int] = None, # if not None -> joint attention
343
+ context_pre_only=None,
344
+ ):
345
+ super().__init__()
346
+
347
+ if not hasattr(F, "scaled_dot_product_attention"):
348
+ raise ImportError("Attention equires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
349
+
350
+ self.processor = processor
351
+
352
+ self.dim = dim
353
+ self.heads = heads
354
+ self.inner_dim = dim_head * heads
355
+ self.dropout = dropout
356
+
357
+ self.context_dim = context_dim
358
+ self.context_pre_only = context_pre_only
359
+
360
+ self.to_q = nn.Linear(dim, self.inner_dim)
361
+ self.to_k = nn.Linear(dim, self.inner_dim)
362
+ self.to_v = nn.Linear(dim, self.inner_dim)
363
+
364
+ if self.context_dim is not None:
365
+ self.to_k_c = nn.Linear(context_dim, self.inner_dim)
366
+ self.to_v_c = nn.Linear(context_dim, self.inner_dim)
367
+ if self.context_pre_only is not None:
368
+ self.to_q_c = nn.Linear(context_dim, self.inner_dim)
369
+
370
+ self.to_out = nn.ModuleList([])
371
+ self.to_out.append(nn.Linear(self.inner_dim, dim))
372
+ self.to_out.append(nn.Dropout(dropout))
373
+
374
+ if self.context_pre_only is not None and not self.context_pre_only:
375
+ self.to_out_c = nn.Linear(self.inner_dim, dim)
376
+
377
+ def forward(
378
+ self,
379
+ x: float["b n d"], # noised input x # noqa: F722
380
+ c: float["b n d"] = None, # context c # noqa: F722
381
+ mask: bool["b n"] | None = None, # noqa: F722
382
+ rope=None, # rotary position embedding for x
383
+ c_rope=None, # rotary position embedding for c
384
+ ) -> torch.Tensor:
385
+ if c is not None:
386
+ return self.processor(self, x, c=c, mask=mask, rope=rope, c_rope=c_rope)
387
+ else:
388
+ return self.processor(self, x, mask=mask, rope=rope)
389
+
390
+
391
+ # Attention processor
392
+
393
+
394
+ # from torch.nn.attention import SDPBackend
395
+ # torch.backends.cuda.enable_flash_sdp(True)
396
+ class AttnProcessor:
397
+ def __init__(self):
398
+ pass
399
+
400
+ def __call__(
401
+ self,
402
+ attn: Attention,
403
+ x: float["b n d"], # noised input x # noqa: F722
404
+ mask: bool["b n"] | None = None, # noqa: F722
405
+ rope=None, # rotary position embedding
406
+ ) -> torch.FloatTensor:
407
+ batch_size = x.shape[0]
408
+
409
+ # `sample` projections.
410
+ query = attn.to_q(x)
411
+ key = attn.to_k(x)
412
+ value = attn.to_v(x)
413
+
414
+ # apply rotary position embedding
415
+ if rope is not None:
416
+ freqs, xpos_scale = rope
417
+ q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
418
+
419
+ query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
420
+ key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
421
+
422
+ # attention
423
+ inner_dim = key.shape[-1]
424
+ head_dim = inner_dim // attn.heads
425
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
426
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
427
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
428
+
429
+ # mask. e.g. inference got a batch with different target durations, mask out the padding
430
+ if mask is not None:
431
+ attn_mask = mask
432
+ attn_mask = attn_mask.unsqueeze(1).unsqueeze(1) # 'b n -> b 1 1 n'
433
+ # print(3433333333,attn_mask.shape)
434
+ attn_mask = attn_mask.expand(batch_size, attn.heads, query.shape[-2], key.shape[-2])
435
+ else:
436
+ attn_mask = None
437
+ # with torch.nn.attention.sdpa_kernel(backends=[SDPBackend.EFFICIENT_ATTENTION]):
438
+ # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=True):
439
+ # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=False):
440
+ # print(torch.backends.cuda.flash_sdp_enabled())
441
+ # print(torch.backends.cuda.mem_efficient_sdp_enabled())
442
+ # print(torch.backends.cuda.math_sdp_enabled())
443
+ x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
444
+ x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
445
+ x = x.to(query.dtype)
446
+
447
+ # linear proj
448
+ x = attn.to_out[0](x)
449
+ # dropout
450
+ x = attn.to_out[1](x)
451
+
452
+ if mask is not None:
453
+ mask = mask.unsqueeze(-1)
454
+ x = x.masked_fill(~mask, 0.0)
455
+
456
+ return x
457
+
458
+
459
+ # Joint Attention processor for MM-DiT
460
+ # modified from diffusers/src/diffusers/models/attention_processor.py
461
+
462
+
463
+ class JointAttnProcessor:
464
+ def __init__(self):
465
+ pass
466
+
467
+ def __call__(
468
+ self,
469
+ attn: Attention,
470
+ x: float["b n d"], # noised input x # noqa: F722
471
+ c: float["b nt d"] = None, # context c, here text # noqa: F722
472
+ mask: bool["b n"] | None = None, # noqa: F722
473
+ rope=None, # rotary position embedding for x
474
+ c_rope=None, # rotary position embedding for c
475
+ ) -> torch.FloatTensor:
476
+ residual = x
477
+
478
+ batch_size = c.shape[0]
479
+
480
+ # `sample` projections.
481
+ query = attn.to_q(x)
482
+ key = attn.to_k(x)
483
+ value = attn.to_v(x)
484
+
485
+ # `context` projections.
486
+ c_query = attn.to_q_c(c)
487
+ c_key = attn.to_k_c(c)
488
+ c_value = attn.to_v_c(c)
489
+
490
+ # apply rope for context and noised input independently
491
+ if rope is not None:
492
+ freqs, xpos_scale = rope
493
+ q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
494
+ query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
495
+ key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
496
+ if c_rope is not None:
497
+ freqs, xpos_scale = c_rope
498
+ q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
499
+ c_query = apply_rotary_pos_emb(c_query, freqs, q_xpos_scale)
500
+ c_key = apply_rotary_pos_emb(c_key, freqs, k_xpos_scale)
501
+
502
+ # attention
503
+ query = torch.cat([query, c_query], dim=1)
504
+ key = torch.cat([key, c_key], dim=1)
505
+ value = torch.cat([value, c_value], dim=1)
506
+
507
+ inner_dim = key.shape[-1]
508
+ head_dim = inner_dim // attn.heads
509
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
510
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
511
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
512
+
513
+ # mask. e.g. inference got a batch with different target durations, mask out the padding
514
+ if mask is not None:
515
+ attn_mask = F.pad(mask, (0, c.shape[1]), value=True) # no mask for c (text)
516
+ attn_mask = attn_mask.unsqueeze(1).unsqueeze(1) # 'b n -> b 1 1 n'
517
+ attn_mask = attn_mask.expand(batch_size, attn.heads, query.shape[-2], key.shape[-2])
518
+ else:
519
+ attn_mask = None
520
+
521
+ x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
522
+ x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
523
+ x = x.to(query.dtype)
524
+
525
+ # Split the attention outputs.
526
+ x, c = (
527
+ x[:, : residual.shape[1]],
528
+ x[:, residual.shape[1] :],
529
+ )
530
+
531
+ # linear proj
532
+ x = attn.to_out[0](x)
533
+ # dropout
534
+ x = attn.to_out[1](x)
535
+ if not attn.context_pre_only:
536
+ c = attn.to_out_c(c)
537
+
538
+ if mask is not None:
539
+ mask = mask.unsqueeze(-1)
540
+ x = x.masked_fill(~mask, 0.0)
541
+ # c = c.masked_fill(~mask, 0.) # no mask for c (text)
542
+
543
+ return x, c
544
+
545
+
546
+ # DiT Block
547
+
548
+
549
+ class DiTBlock(nn.Module):
550
+ def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1):
551
+ super().__init__()
552
+
553
+ self.attn_norm = AdaLayerNormZero(dim)
554
+ self.attn = Attention(
555
+ processor=AttnProcessor(),
556
+ dim=dim,
557
+ heads=heads,
558
+ dim_head=dim_head,
559
+ dropout=dropout,
560
+ )
561
+
562
+ self.ff_norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
563
+ self.ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
564
+
565
+ def forward(self, x, t, mask=None, rope=None): # x: noised input, t: time embedding
566
+ # pre-norm & modulation for attention input
567
+ norm, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.attn_norm(x, emb=t)
568
+
569
+ # attention
570
+ attn_output = self.attn(x=norm, mask=mask, rope=rope)
571
+
572
+ # process attention output for input x
573
+ x = x + gate_msa.unsqueeze(1) * attn_output
574
+
575
+ norm = self.ff_norm(x) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
576
+ ff_output = self.ff(norm)
577
+ x = x + gate_mlp.unsqueeze(1) * ff_output
578
+
579
+ return x
580
+
581
+
582
+ # MMDiT Block https://arxiv.org/abs/2403.03206
583
+
584
+
585
+ class MMDiTBlock(nn.Module):
586
+ r"""
587
+ modified from diffusers/src/diffusers/models/attention.py
588
+
589
+ notes.
590
+ _c: context related. text, cond, etc. (left part in sd3 fig2.b)
591
+ _x: noised input related. (right part)
592
+ context_pre_only: the last layer only does prenorm + modulation, since there is no ffn afterwards
593
+ """
594
+
595
+ def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1, context_pre_only=False):
596
+ super().__init__()
597
+
598
+ self.context_pre_only = context_pre_only
599
+
600
+ self.attn_norm_c = AdaLayerNormZero_Final(dim) if context_pre_only else AdaLayerNormZero(dim)
601
+ self.attn_norm_x = AdaLayerNormZero(dim)
602
+ self.attn = Attention(
603
+ processor=JointAttnProcessor(),
604
+ dim=dim,
605
+ heads=heads,
606
+ dim_head=dim_head,
607
+ dropout=dropout,
608
+ context_dim=dim,
609
+ context_pre_only=context_pre_only,
610
+ )
611
+
612
+ if not context_pre_only:
613
+ self.ff_norm_c = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
614
+ self.ff_c = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
615
+ else:
616
+ self.ff_norm_c = None
617
+ self.ff_c = None
618
+ self.ff_norm_x = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
619
+ self.ff_x = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
620
+
621
+ def forward(self, x, c, t, mask=None, rope=None, c_rope=None): # x: noised input, c: context, t: time embedding
622
+ # pre-norm & modulation for attention input
623
+ if self.context_pre_only:
624
+ norm_c = self.attn_norm_c(c, t)
625
+ else:
626
+ norm_c, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.attn_norm_c(c, emb=t)
627
+ norm_x, x_gate_msa, x_shift_mlp, x_scale_mlp, x_gate_mlp = self.attn_norm_x(x, emb=t)
628
+
629
+ # attention
630
+ x_attn_output, c_attn_output = self.attn(x=norm_x, c=norm_c, mask=mask, rope=rope, c_rope=c_rope)
631
+
632
+ # process attention output for context c
633
+ if self.context_pre_only:
634
+ c = None
635
+ else: # if not last layer
636
+ c = c + c_gate_msa.unsqueeze(1) * c_attn_output
637
+
638
+ norm_c = self.ff_norm_c(c) * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
639
+ c_ff_output = self.ff_c(norm_c)
640
+ c = c + c_gate_mlp.unsqueeze(1) * c_ff_output
641
+
642
+ # process attention output for input x
643
+ x = x + x_gate_msa.unsqueeze(1) * x_attn_output
644
+
645
+ norm_x = self.ff_norm_x(x) * (1 + x_scale_mlp[:, None]) + x_shift_mlp[:, None]
646
+ x_ff_output = self.ff_x(norm_x)
647
+ x = x + x_gate_mlp.unsqueeze(1) * x_ff_output
648
+
649
+ return c, x
650
+
651
+
652
+ # time step conditioning embedding
653
+
654
+
655
+ class TimestepEmbedding(nn.Module):
656
+ def __init__(self, dim, freq_embed_dim=256):
657
+ super().__init__()
658
+ self.time_embed = SinusPositionEmbedding(freq_embed_dim)
659
+ self.time_mlp = nn.Sequential(nn.Linear(freq_embed_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
660
+
661
+ def forward(self, timestep: float["b"]): # noqa: F821
662
+ time_hidden = self.time_embed(timestep)
663
+ time_hidden = time_hidden.to(timestep.dtype)
664
+ time = self.time_mlp(time_hidden) # b d
665
+ return time
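
A short sketch of the timestep conditioning path defined above (sinusoidal features followed by the MLP); the dimension is illustrative:

import torch

time_embed = TimestepEmbedding(dim=512)   # internally: 256-dim sinusoidal features -> Linear -> SiLU -> Linear
t = torch.rand(4)                         # one diffusion time per batch item, "b"
cond = time_embed(t)
print(cond.shape)                         # torch.Size([4, 512])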
GPT_SoVITS/feature_extractor/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from . import cnhubert
2
+
3
+ content_module_map = {"cnhubert": cnhubert}
GPT_SoVITS/feature_extractor/cnhubert.py ADDED
@@ -0,0 +1,46 @@
1
+ import logging
2
+ import os
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from transformers import (
7
+ HubertModel,
8
+ Wav2Vec2FeatureExtractor,
9
+ )
10
+ from transformers import logging as tf_logging
11
+
12
+ tf_logging.set_verbosity_error()
13
+
14
+ logging.getLogger("numba").setLevel(logging.WARNING)
15
+
16
+ cnhubert_base_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
17
+
18
+
19
+ class CNHubert(nn.Module):
20
+ def __init__(self, base_path: str = ""):
21
+ super().__init__()
22
+ if not base_path:
23
+ base_path = cnhubert_base_path
24
+ if os.path.exists(base_path):
25
+ ...
26
+ else:
27
+ raise FileNotFoundError(base_path)
28
+ self.model = HubertModel.from_pretrained(base_path, local_files_only=True)
29
+ self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(base_path, local_files_only=True)
30
+
31
+ def forward(self, x):
32
+ input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
33
+ feats = self.model(input_values)["last_hidden_state"]
34
+ return feats
35
+
36
+
37
+ def get_model():
38
+ model = CNHubert()
39
+ model.eval()
40
+ return model
41
+
42
+
43
+ def get_content(hmodel, wav_16k_tensor):
44
+ with torch.no_grad():
45
+ feats = hmodel(wav_16k_tensor)
46
+ return feats.transpose(1, 2)
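
A hedged end-to-end sketch of the helpers above; it assumes the chinese-hubert-base weights are already present at cnhubert_base_path, and uses random 16 kHz audio only to show the expected shapes:

import torch

model = get_model()                  # CNHubert in eval mode, loaded from cnhubert_base_path
wav_16k = torch.randn(16000)         # one second at 16 kHz
feats = get_content(model, wav_16k)  # (batch, hidden_size, frames) after the transpose in get_content
print(feats.shape)                   # e.g. torch.Size([1, 768, 49])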
GPT_SoVITS/inference_webui.py ADDED
@@ -0,0 +1,1104 @@
1
+ import argparse
2
+ import contextlib
3
+ import logging
4
+ import os
5
+ import re
6
+ import shutil
7
+ import traceback
8
+ import warnings
9
+ import zipfile
10
+ from functools import partial
11
+ from pathlib import Path
12
+ from time import time as ttime
13
+ from typing import Any
14
+
15
+ import gradio as gr
16
+ import librosa
17
+ import numpy as np
18
+ import spaces
19
+ import torch
20
+ import torchaudio
21
+ from huggingface_hub import hf_hub_download
22
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
23
+
24
+ from config import (
25
+ change_choices,
26
+ get_dtype,
27
+ get_weights_names,
28
+ )
29
+ from config import (
30
+ infer_device as default_device,
31
+ )
32
+ from GPT_SoVITS.Accelerate import PyTorch, T2SEngineProtocol, T2SRequest, backends
33
+ from GPT_SoVITS.Accelerate.logger import console
34
+ from GPT_SoVITS.feature_extractor import cnhubert
35
+ from GPT_SoVITS.module.mel_processing import mel_spectrogram_torch, spectrogram_torch
36
+ from GPT_SoVITS.module.models import SynthesizerTrn
37
+ from GPT_SoVITS.process_ckpt import inspect_version
38
+ from GPT_SoVITS.sv import SV
39
+ from GPT_SoVITS.text import cleaned_text_to_sequence
40
+ from GPT_SoVITS.text.cleaner import clean_text
41
+ from GPT_SoVITS.text.LangSegmenter import LangSegmenter
42
+ from tools.assets import css, js, top_html
43
+ from tools.i18n.i18n import I18nAuto, scan_language_list
44
+ from tools.my_utils import DictToAttrRecursive
45
+
46
+ warnings.filterwarnings(
47
+ "ignore", message="MPS: The constant padding of more than 3 dimensions is not currently supported natively."
48
+ )
49
+ warnings.filterwarnings("ignore", message=".*ComplexHalf support is experimental.*")
50
+
51
+ logging.getLogger("markdown_it").setLevel(logging.ERROR)
52
+ logging.getLogger("urllib3").setLevel(logging.ERROR)
53
+ logging.getLogger("httpcore").setLevel(logging.ERROR)
54
+ logging.getLogger("httpx").setLevel(logging.ERROR)
55
+ logging.getLogger("asyncio").setLevel(logging.ERROR)
56
+ logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
57
+ logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
58
+ logging.getLogger("multipart.multipart").setLevel(logging.ERROR)
59
+
60
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
61
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
62
+
63
+
64
+ def install():
65
+ base = Path("GPT_SoVITS")
66
+ zip_path = hf_hub_download("XXXXRT/GPT-SoVITS-Pretrained", "pretrained_models.zip", repo_type="model")
67
+ tmp = base / "tmp_unzip"
68
+ if tmp.exists():
69
+ shutil.rmtree(tmp)
70
+ with zipfile.ZipFile(zip_path) as zf:
71
+ zf.extractall(tmp)
72
+ folder = next(tmp.iterdir())
73
+ shutil.move(str(folder), base / folder.name)
74
+ shutil.rmtree(tmp)
75
+
76
+
77
+ install()
78
+
79
+
80
+ _LANG_RE = re.compile(r"^[a-z]{2}[_-][A-Z]{2}$")
81
+
82
+
83
+ def lang_type(text: str) -> str:
84
+ if text == "Auto":
85
+ return text
86
+ if not _LANG_RE.match(text):
87
+ raise argparse.ArgumentTypeError(f"Unspported Format: {text}, Expected ll_CC/ll-CC")
88
+ ll, cc = re.split(r"[_-]", text)
89
+ language = f"{ll}_{cc}"
90
+ if language in scan_language_list():
91
+ return language
92
+ else:
93
+ return "Auto"
94
+
95
+
96
+ def build_parser() -> argparse.ArgumentParser:
97
+ p = argparse.ArgumentParser(
98
+ prog="inference_webui",
99
+ description=f"python -s -m GPT_SoVITS.inference_webui zh_CN -b {backends[-1]}",
100
+ )
101
+ p.add_argument(
102
+ "language",
103
+ nargs="?",
104
+ default="Auto",
105
+ type=lang_type,
106
+ help="Language Code, Such as zh_CN, en-US",
107
+ )
108
+ p.add_argument(
109
+ "--backends",
110
+ "-b",
111
+ choices=backends,
112
+ default=backends[-1],
113
+ help="AR Inference Backend",
114
+ required=False,
115
+ )
116
+ p.add_argument(
117
+ "--device",
118
+ "-d",
119
+ default=str(default_device),
120
+ help="Inference Device",
121
+ required=False,
122
+ )
123
+ p.add_argument(
124
+ "--port",
125
+ "-p",
126
+ default=9872,
127
+ type=int,
128
+ help="WebUI Binding Port",
129
+ required=False,
130
+ )
131
+ p.add_argument(
132
+ "--share",
133
+ "-s",
134
+ default=False,
135
+ action="store_true",
136
+ help="Gradio Share Link",
137
+ required=False,
138
+ )
139
+ p.add_argument(
140
+ "--cnhubert",
141
+ default="GPT_SoVITS/pretrained_models/chinese-hubert-base",
142
+ help="CNHuBERT Pretrain",
143
+ required=False,
144
+ )
145
+ p.add_argument(
146
+ "--bert",
147
+ default="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
148
+ help="BERT Pretrain",
149
+ required=False,
150
+ )
151
+ p.add_argument(
152
+ "--gpt",
153
+ default="",
154
+ help="GPT Model",
155
+ required=False,
156
+ )
157
+ p.add_argument(
158
+ "--sovits",
159
+ default="",
160
+ help="SoVITS Model",
161
+ required=False,
162
+ )
163
+
164
+ return p
165
+
166
+
167
+ args = build_parser().parse_args()
168
+
169
+ hps: Any = None
170
+ vq_model: SynthesizerTrn | None = None
171
+ t2s_engine: T2SEngineProtocol | None = None
172
+
173
+ version = model_version = "v2"
174
+ cnhubert_base_path = str(args.cnhubert)
175
+ bert_path = str(args.bert)
176
+ infer_ttswebui = int(args.port)
177
+ is_share = bool(args.share)
178
+
179
+
180
+ i18n = I18nAuto(language=args.language)
181
+ ar_backend: str = args.backends
182
+ change_choices_i18n = partial(change_choices, i18n=i18n)
183
+
184
+ SoVITS_names, GPT_names = get_weights_names(i18n)
185
+
186
+
187
+ dict_language_v1 = {
188
+ i18n("中文"): "all_zh", # 全部按中文识别
189
+ i18n("英文"): "en", # 全部按英文识别
190
+ i18n("日文"): "all_ja", # 全部按日文识别
191
+ i18n("中英混合"): "zh", # 按中英混合识别
192
+ i18n("日英混合"): "ja", # 按日英混合识别
193
+ i18n("多语种混合"): "auto", # 多语种启动切分识别语种
194
+ }
195
+ dict_language_v2 = {
196
+ i18n("中文"): "all_zh", # 全部按中文识别
197
+ i18n("英文"): "en", # 全部按英文识别
198
+ i18n("日文"): "all_ja", # 全部按日文识别
199
+ i18n("粤语"): "all_yue", # 全部按粤语识别
200
+ i18n("韩文"): "all_ko", # 全部按韩文识别
201
+ i18n("中英混合"): "zh",
202
+ i18n("日英混合"): "ja",
203
+ i18n("粤英混合"): "yue",
204
+ i18n("韩英混合"): "ko",
205
+ i18n("多语种混合"): "auto", # 多语种启动切分识别语种
206
+ i18n("多语种混合(粤语)"): "auto_yue", # 多语种启动切分识别语种
207
+ }
208
+ dict_language = dict_language_v1 if version == "v1" else dict_language_v2
209
+
210
+ punctuation = set(["!", "?", "…", ",", ".", "-", " "])
211
+ splits = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…"}
212
+ v3v4set = {"v3", "v4"}
213
+
214
+ infer_device = torch.device(args.device)
215
+ device = infer_device if infer_device.type == "cuda" else torch.device("cpu")
216
+
217
+ dtype = get_dtype(device.index)
218
+ is_half = dtype == torch.float16
219
+
220
+ tokenizer = AutoTokenizer.from_pretrained(bert_path)
221
+ bert_model = AutoModelForMaskedLM.from_pretrained(bert_path).to(infer_device, dtype)
222
+
223
+ cnhubert.cnhubert_base_path = cnhubert_base_path
224
+ ssl_model = cnhubert.get_model().to(infer_device, dtype)
225
+
226
+ spec_min = -12
227
+ spec_max = 2
228
+
229
+
230
+ def norm_spec(x):
231
+ return (x - spec_min) / (spec_max - spec_min) * 2 - 1
232
+
233
+
234
+ def denorm_spec(x):
235
+ return (x + 1) / 2 * (spec_max - spec_min) + spec_min
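
norm_spec and denorm_spec are exact inverses over the fixed [spec_min, spec_max] range; a tiny check:

import torch

x = torch.linspace(spec_min, spec_max, steps=5)
assert torch.allclose(denorm_spec(norm_spec(x)), x)   # maps [-12, 2] <-> [-1, 1] and back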
236
+
237
+
238
+ def mel_fn(x):
239
+ return mel_spectrogram_torch(
240
+ y=x,
241
+ n_fft=1024,
242
+ num_mels=100,
243
+ sampling_rate=24000,
244
+ hop_size=256,
245
+ win_size=1024,
246
+ fmin=0,
247
+ fmax=None,
248
+ center=False,
249
+ )
250
+
251
+
252
+ def mel_fn_v4(x):
253
+ return mel_spectrogram_torch(
254
+ y=x,
255
+ n_fft=1280,
256
+ num_mels=100,
257
+ sampling_rate=32000,
258
+ hop_size=320,
259
+ win_size=1280,
260
+ fmin=0,
261
+ fmax=None,
262
+ center=False,
263
+ )
264
+
265
+
266
+ gpt_path = str(args.gpt) or GPT_names[0][-1]
267
+ sovits_path = str(args.sovits) or SoVITS_names[0][-1]
268
+
269
+
270
+ def get_bert_feature(text, word2ph):
271
+ inputs = tokenizer(text, return_tensors="pt")
272
+ for i in inputs:
273
+ inputs[i] = inputs[i].to(infer_device)
274
+ res = bert_model(**inputs, output_hidden_states=True)
275
+ res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
276
+
277
+ assert len(word2ph) == len(text)
278
+ phone_level_feature = []
279
+ for i in range(len(word2ph)):
280
+ repeat_feature = res[i].repeat(word2ph[i], 1)
281
+ phone_level_feature.append(repeat_feature)
282
+ phone_level_feature_t = torch.cat(phone_level_feature, dim=0)
283
+ return phone_level_feature_t.T
284
+
285
+
286
+ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None):
287
+ global vq_model, hps, version, model_version, dict_language
288
+ model_version, version, is_lora, hps, dict_s2 = inspect_version(sovits_path)
289
+ print(sovits_path, version, model_version, is_lora)
290
+ dict_language = dict_language_v1 if version == "v1" else dict_language_v2
291
+ visible_sample_steps = visible_inp_refs = None
292
+ if prompt_language is not None and text_language is not None:
293
+ if prompt_language in list(dict_language.keys()):
294
+ prompt_text_update, prompt_language_update = gr.skip(), gr.update(choices=list(dict_language.keys()))
295
+ else:
296
+ prompt_text_update = gr.update(value="")
297
+ prompt_language_update = gr.update(value=i18n("中文"), choices=list(dict_language.keys()))
298
+ if text_language in list(dict_language.keys()):
299
+ text_update, text_language_update = gr.skip(), gr.skip()
300
+ else:
301
+ text_update = gr.update(value="")
302
+ text_language_update = gr.update(value=i18n("中文"), choices=list(dict_language.keys()))
303
+
304
+ if model_version in v3v4set:
305
+ visible_sample_steps = True
306
+ visible_inp_refs = False
307
+ else:
308
+ visible_sample_steps = False
309
+ visible_inp_refs = True
310
+ yield (
311
+ prompt_text_update,
312
+ prompt_language_update,
313
+ text_update,
314
+ text_language_update,
315
+ gr.update(
316
+ visible=visible_sample_steps,
317
+ value=32 if model_version == "v3" else 8,
318
+ choices=[4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32],
319
+ ),
320
+ gr.update(visible=visible_inp_refs),
321
+ gr.update(value=False, interactive=True if model_version not in v3v4set else False),
322
+ gr.update(visible=True if model_version == "v3" else False),
323
+ gr.update(value=i18n("模型加载中,请等待"), interactive=False),
324
+ )
325
+
326
+ hps = DictToAttrRecursive(hps)
327
+ hps.model.semantic_frame_rate = "25hz"
328
+ hps.model.version = model_version
329
+ if model_version not in v3v4set:
330
+ vq_model = SynthesizerTrn(
331
+ hps.data.filter_length // 2 + 1,
332
+ hps.train.segment_size // hps.data.hop_length,
333
+ n_speakers=hps.data.n_speakers,
334
+ **hps.model,
335
+ )
336
+ else:
337
+ raise RuntimeError("Unsupported model version")
338
+
339
+ if "pretrained" not in sovits_path:
340
+ if hasattr(vq_model, "enc_q"):
341
+ del vq_model.enc_q
342
+
343
+ if is_lora is False:
344
+ console.print(f">> loading sovits_{model_version}", vq_model.load_state_dict(dict_s2["weight"], strict=False))
345
+ else:
346
+ RuntimeError("Unsupported model version")
347
+
348
+ vq_model = vq_model.to(infer_device, dtype)
349
+
350
+ yield (
351
+ gr.skip(),
352
+ gr.skip(),
353
+ gr.skip(),
354
+ gr.skip(),
355
+ gr.skip(),
356
+ gr.skip(),
357
+ gr.skip(),
358
+ gr.skip(),
359
+ gr.update(value=i18n("合成语音"), interactive=True),
360
+ )
361
+
362
+
363
+ with contextlib.suppress(UnboundLocalError):
364
+ next(change_sovits_weights(sovits_path))
365
+
366
+
367
+ def change_gpt_weights(gpt_path):
368
+ global t2s_engine, config
369
+
370
+ t2s_engine = PyTorch.T2SEngineTorch(
371
+ PyTorch.T2SEngineTorch.load_decoder(Path(gpt_path), backend=ar_backend),
372
+ device,
373
+ dtype=dtype,
374
+ )
375
+ # t2s_engine.decoder_model.compile()
376
+ total = sum(p.numel() for p in t2s_engine.decoder_model.parameters())
377
+ console.print(">> Number of parameter: %.2fM" % (total / 1e6))
378
+
379
+
380
+ change_gpt_weights(gpt_path)
381
+
382
+
383
+ sv_cn_model = SV(infer_device, is_half)
384
+
385
+
386
+ resample_transform_dict = {}
387
+
388
+
389
+ def resample(audio_tensor, sr0, sr1, device):
390
+ global resample_transform_dict
391
+ key = f"{sr0}-{sr1}-{device}"
392
+ if key not in resample_transform_dict:
393
+ resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
394
+ return resample_transform_dict[key](audio_tensor)
395
+
396
+
397
+ def get_spepc(hps, filename, dtype, device, is_v2pro=False):
398
+ sr1 = int(hps.data.sampling_rate)
399
+ audio, sr0 = torchaudio.load_with_torchcodec(filename)
400
+ audio = audio.to(device)
401
+
402
+ if sr0 != sr1:
403
+ audio = resample(audio, sr0, sr1, device)
404
+ if audio.shape[0] > 1:
405
+ audio = audio.mean(0).unsqueeze(0)
406
+
407
+ maxx = float(audio.abs().max())
408
+ if maxx > 1:
409
+ audio /= min(2, maxx)
410
+ spec = spectrogram_torch(
411
+ audio,
412
+ hps.data.filter_length,
413
+ hps.data.sampling_rate,
414
+ hps.data.hop_length,
415
+ hps.data.win_length,
416
+ center=False,
417
+ )
418
+ spec = spec.to(dtype)
419
+ if is_v2pro is True:
420
+ audio = resample(audio, sr1, 16000, device).to(dtype)
421
+ return spec, audio
422
+
423
+
424
+ def clean_text_inf(text, language, version):
425
+ language = language.replace("all_", "")
426
+ phones, word2ph, norm_text = clean_text(text, language, version)
427
+ phones = cleaned_text_to_sequence(phones, version)
428
+ return phones, word2ph, norm_text
429
+
430
+
431
+ def get_bert_inf(phones, word2ph, norm_text, language):
432
+ language = language.replace("all_", "")
433
+ if language == "zh":
434
+ bert = get_bert_feature(norm_text, word2ph).to(device) # .to(dtype)
435
+ else:
436
+ bert = torch.zeros(
437
+ (1024, len(phones)),
438
+ dtype=torch.float16 if is_half is True else torch.float32,
439
+ ).to(device)
440
+
441
+ return bert
442
+
443
+
444
+ def get_first(text):
445
+ pattern = "[" + "".join(re.escape(sep) for sep in splits) + "]"
446
+ text = re.split(pattern, text)[0].strip()
447
+ return text
448
+
449
+
450
+ def get_phones_and_bert(text, language, version, final=False):
451
+ text = re.sub(r" {2,}", " ", text)
452
+ textlist = []
453
+ langlist = []
454
+ if language == "all_zh":
455
+ for tmp in LangSegmenter.getTexts(text, "zh"):
456
+ langlist.append(tmp["lang"])
457
+ textlist.append(tmp["text"])
458
+ elif language == "all_yue":
459
+ for tmp in LangSegmenter.getTexts(text, "zh"):
460
+ if tmp["lang"] == "zh":
461
+ tmp["lang"] = "yue"
462
+ langlist.append(tmp["lang"])
463
+ textlist.append(tmp["text"])
464
+ elif language == "all_ja":
465
+ for tmp in LangSegmenter.getTexts(text, "ja"):
466
+ langlist.append(tmp["lang"])
467
+ textlist.append(tmp["text"])
468
+ elif language == "all_ko":
469
+ for tmp in LangSegmenter.getTexts(text, "ko"):
470
+ langlist.append(tmp["lang"])
471
+ textlist.append(tmp["text"])
472
+ elif language == "en":
473
+ langlist.append("en")
474
+ textlist.append(text)
475
+ elif language == "auto":
476
+ for tmp in LangSegmenter.getTexts(text):
477
+ langlist.append(tmp["lang"])
478
+ textlist.append(tmp["text"])
479
+ elif language == "auto_yue":
480
+ for tmp in LangSegmenter.getTexts(text):
481
+ if tmp["lang"] == "zh":
482
+ tmp["lang"] = "yue"
483
+ langlist.append(tmp["lang"])
484
+ textlist.append(tmp["text"])
485
+ else:
486
+ for tmp in LangSegmenter.getTexts(text):
487
+ if langlist:
488
+ if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
489
+ textlist[-1] += tmp["text"]
490
+ continue
491
+ if tmp["lang"] == "en":
492
+ langlist.append(tmp["lang"])
493
+ else:
494
+ # CJK han characters cannot be told apart reliably, so defer to the user-selected language
495
+ langlist.append(language)
496
+ textlist.append(tmp["text"])
497
+ print(textlist)
498
+ print(langlist)
499
+ phones_list = []
500
+ bert_list = []
501
+ norm_text_list = []
502
+ for i in range(len(textlist)):
503
+ lang = langlist[i]
504
+ phones, word2ph, norm_text = clean_text_inf(textlist[i], lang, version)
505
+ bert = get_bert_inf(phones, word2ph, norm_text, lang)
506
+ phones_list.append(phones)
507
+ norm_text_list.append(norm_text)
508
+ bert_list.append(bert)
509
+ bert = torch.cat(bert_list, dim=1)
510
+ phones = sum(phones_list, [])
511
+ norm_text = "".join(norm_text_list)
512
+
513
+ if not final and len(phones) < 6:
514
+ return get_phones_and_bert("." + text, language, version, final=True)
515
+
516
+ return phones, bert.to(dtype), norm_text
517
+
518
+
519
+ def merge_short_text_in_array(texts, threshold):
520
+ if (len(texts)) < 2:
521
+ return texts
522
+ result = []
523
+ text = ""
524
+ for ele in texts:
525
+ text += ele
526
+ if len(text) >= threshold:
527
+ result.append(text)
528
+ text = ""
529
+ if len(text) > 0:
530
+ if len(result) == 0:
531
+ result.append(text)
532
+ else:
533
+ result[len(result) - 1] += text
534
+ return result
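
A toy illustration of the merging rule above, using threshold 5 (the value passed later in get_tts_wav); a short trailing piece is folded into the last merged chunk rather than emitted on its own:

pieces = ["ab", "cde", "fghij", "k"]
print(merge_short_text_in_array(pieces, 5))   # ['abcde', 'fghijk']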
535
+
536
+
537
+ sr_model = None
538
+
539
+
540
+ cache: dict[int, Any] = {}
541
+
542
+
543
+ @spaces.GPU
544
+ def get_tts_wav(
545
+ ref_wav_path,
546
+ prompt_text,
547
+ prompt_language,
548
+ text,
549
+ text_language,
550
+ how_to_cut=i18n("不切"),
551
+ top_k=20,
552
+ top_p=0.6,
553
+ temperature=0.6,
554
+ ref_free=False,
555
+ speed=1,
556
+ if_freeze=False,
557
+ inp_refs=None,
558
+ sample_steps=8,
559
+ if_sr=False,
560
+ pause_second=0.3,
561
+ ):
562
+ torch.set_grad_enabled(False)
563
+ ttfb_time = ttime()
564
+
565
+ if ref_wav_path:
566
+ pass
567
+ else:
568
+ gr.Warning(i18n("请上传参考音频"))
569
+ if text:
570
+ pass
571
+ else:
572
+ gr.Warning(i18n("请填入推理文本"))
573
+ t = []
574
+ if prompt_text is None or len(prompt_text) == 0:
575
+ ref_free = True
576
+ if model_version in v3v4set:
577
+ ref_free = False # s2v3 does not support ref_free yet
578
+ t0 = ttime()
579
+ prompt_language = dict_language[prompt_language]
580
+ text_language = dict_language[text_language]
581
+
582
+ if not ref_free:
583
+ prompt_text = prompt_text.strip("\n")
584
+ if prompt_text[-1] not in splits:
585
+ prompt_text += "。" if prompt_language != "en" else "."
586
+ print(">>", i18n("实际输入的参考文本:"), prompt_text)
587
+ text = text.strip("\n")
588
+ # if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." + text
589
+
590
+ print(">>", i18n("实际输入的目标文本:"), text)
591
+ zero_wav = np.zeros(
592
+ int(hps.data.sampling_rate * pause_second),
593
+ dtype=np.float16 if is_half is True else np.float32,
594
+ )
595
+ zero_wav_torch = torch.from_numpy(zero_wav)
596
+ if is_half is True:
597
+ zero_wav_torch = zero_wav_torch.half().to(infer_device)
598
+ else:
599
+ zero_wav_torch = zero_wav_torch.to(infer_device)
600
+ if not ref_free:
601
+ assert vq_model
602
+ wav16k, sr = librosa.load(ref_wav_path, sr=16000)
603
+ if wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000:
604
+ gr.Warning(i18n("参考音频在3~10秒范围外,请更换!"))
605
+ raise OSError(i18n("参考音频在3~10秒范围外,请更换!"))
606
+ wav16k_t = torch.from_numpy(wav16k)
607
+ if is_half is True:
608
+ wav16k_t = wav16k_t.half().to(infer_device)
609
+ else:
610
+ wav16k_t = wav16k_t.to(infer_device)
611
+ wav16k_t = torch.cat([wav16k_t, zero_wav_torch])
612
+ ssl_content = ssl_model.model(wav16k_t.unsqueeze(0))["last_hidden_state"].transpose(1, 2) # .float()
613
+ codes = vq_model.extract_latent(ssl_content)
614
+ prompt_semantic = codes[0, 0]
615
+ prompt = prompt_semantic.unsqueeze(0).to(device)
616
+ else:
617
+ prompt = torch.zeros((1, 0)).to(device, torch.int32)
618
+
619
+ t1 = ttime()
620
+ t.append(t1 - t0)
621
+
622
+ if how_to_cut == i18n("凑四句一切"):
623
+ text = cut1(text)
624
+ elif how_to_cut == i18n("凑50字一切"):
625
+ text = cut2(text)
626
+ elif how_to_cut == i18n("按中文句号。切"):
627
+ text = cut3(text)
628
+ elif how_to_cut == i18n("按英文句号.切"):
629
+ text = cut4(text)
630
+ elif how_to_cut == i18n("按标点符号切"):
631
+ text = cut5(text)
632
+ while "\n\n" in text:
633
+ text = text.replace("\n\n", "\n")
634
+ texts = text.split("\n")
635
+ texts = process_text(texts)
636
+ texts = merge_short_text_in_array(texts, 5)
637
+ audio_opt = []
638
+ # s2v3 does not support ref_free yet
639
+ if not ref_free:
640
+ phones1, bert1, _ = get_phones_and_bert(prompt_text, prompt_language, version)
641
+ else:
642
+ phones1, bert1 = [], torch.zeros(1024, 0).to(device, dtype)
643
+
644
+ infer_len: list[int] = []
645
+ infer_time: list[float] = []
646
+ assert vq_model
647
+
648
+ for i_text, text in enumerate(texts):
649
+ # skip blank lines in the target text to avoid errors
650
+ if len(text.strip()) == 0:
651
+ continue
652
+ if text[-1] not in splits:
653
+ text += "。" if text_language != "en" else "."
654
+ print(">>", i18n("实际输入的目标文本(每句):"), text)
655
+ phones2, bert2, norm_text2 = get_phones_and_bert(text, text_language, version)
656
+ print(">>", i18n("前端处理后的文本(每句):"), norm_text2)
657
+
658
+ bert = torch.cat([bert1, bert2], 1)
659
+ all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
660
+
661
+ bert = bert.to(device).unsqueeze(0)
662
+ all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
663
+
664
+ t2 = ttime()
665
+ if i_text in cache and if_freeze is True:
666
+ pred_semantic = cache[i_text]
667
+ else:
668
+ t2s_request = T2SRequest(
669
+ [all_phoneme_ids.squeeze(0)],
670
+ all_phoneme_len,
671
+ prompt,
672
+ [bert.squeeze(0)],
673
+ valid_length=1,
674
+ top_k=top_k,
675
+ top_p=top_p,
676
+ temperature=temperature,
677
+ early_stop_num=1500,
678
+ use_cuda_graph=torch.cuda.is_available(),
679
+ # debug=True,
680
+ )
681
+ assert t2s_engine
682
+ t2s_result = t2s_engine.generate(t2s_request)
683
+ if t2s_result.exception is not None:
684
+ console.print(t2s_result.traceback)
685
+ raise RuntimeError()
686
+ pred_semantic_list = t2s_result.result
687
+ assert pred_semantic_list, t2s_result.traceback
688
+ pred_semantic = pred_semantic_list[0].unsqueeze(0).to(infer_device)
689
+ infer_len.append(pred_semantic.shape[-1])
690
+ infer_time.append(t2s_result.infer_speed[-1])
691
+
692
+ cache[i_text] = pred_semantic
693
+ t3 = ttime()
694
+ is_v2pro = model_version in {"v2Pro", "v2ProPlus"}
695
+
696
+ sv_emb: list[torch.Tensor] = []
697
+ if model_version not in v3v4set:
698
+ refers = []
699
+ if inp_refs:
700
+ for path in inp_refs:
701
+ try: # also extract the sv embedding here: either many sv embeddings with many refers, or a single sv embedding with a single refer
702
+ refer, audio_tensor = get_spepc(hps, path.name, dtype, infer_device, is_v2pro)
703
+ refers.append(refer)
704
+ if is_v2pro:
705
+ assert sv_cn_model
706
+ sv_emb.append(sv_cn_model.compute_embedding(audio_tensor))
707
+ except Exception as e:
708
+ print(e)
709
+ traceback.print_exc()
710
+ if len(refers) == 0:
711
+ refers, audio_tensor = get_spepc(hps, ref_wav_path, dtype, infer_device, is_v2pro)
712
+ refers = [refers]
713
+ if is_v2pro:
714
+ assert sv_cn_model
715
+ sv_emb = [sv_cn_model.compute_embedding(audio_tensor)]
716
+ if is_v2pro:
717
+ audio = vq_model.decode(
718
+ pred_semantic,
719
+ torch.LongTensor(phones2).to(infer_device).unsqueeze(0),
720
+ refers,
721
+ speed=speed,
722
+ sv_emb=sv_emb,
723
+ )[0][0] # type: ignore
724
+ else:
725
+ audio = vq_model.decode(
726
+ pred_semantic,
727
+ torch.LongTensor(phones2).to(infer_device).unsqueeze(0),
728
+ refers,
729
+ speed=speed,
730
+ )[0][0] # type: ignore
731
+ else:
732
+ raise RuntimeError("Unsupported model version")
733
+ if i_text == 0:
734
+ ttfb_time = ttime() - ttfb_time
735
+ max_audio = torch.abs(audio).max() # simple guard against 16-bit clipping
736
+ if max_audio > 1:
737
+ audio = audio / max_audio
738
+ audio_opt.append(audio)
739
+ audio_opt.append(zero_wav_torch) # zero_wav
740
+ t4 = ttime()
741
+ t.extend([t2 - t1, t3 - t2, t4 - t3])
742
+ t1 = ttime()
743
+
744
+ audio_opt_t = torch.cat(audio_opt, 0) # np.concatenate
745
+ opt_sr = 32000
746
+ audio_opt_n = audio_opt_t.cpu().numpy()
747
+
748
+ t0 = t[0]
749
+ t1 = sum(t[1::3])
750
+ t2 = sum(t[2::3])
751
+ t3 = sum(t[3::3])
752
+
753
+ infer_speed_avg = sum(infer_len) / sum(infer_time)
754
+ rtf_value = sum(t) / (len(audio_opt_n) / opt_sr)
755
+
756
+ console.print(f">> Time Stamps: {t0:.3f}\t{t1:.3f}\t{t2:.3f}\t{t3:.3f}")
757
+ console.print(f">> Infer Speed: {infer_speed_avg:.2f} Token/s")
758
+ console.print(f">> RTF: {rtf_value:.2f}")
759
+ if ttfb_time > 2:
760
+ console.print(f">> TTFB: {ttfb_time:.3f} s")
761
+ else:
762
+ console.print(f">> TTFB: {ttfb_time * 1000:.3f} ms")
763
+
764
+ gr.Info(f"{infer_speed_avg:.2f} Token/s", title="Infer Speed")
765
+ gr.Info(f"{rtf_value:.2f}", title="RTF")
766
+
767
+ if ttfb_time > 2:
768
+ gr.Info(f">> TTFB: {ttfb_time:.3f} s")
769
+ else:
770
+ gr.Info(f">> TTFB: {ttfb_time * 1000:.3f} ms")
771
+
772
+ if torch.cuda.is_available():
773
+ torch.cuda.empty_cache()
774
+
775
+ yield opt_sr, (audio_opt_n * 32767).astype(np.int16)
776
+
777
+
778
+ def split(todo_text):
779
+ todo_text = todo_text.replace("……", "。").replace("——", ",")
780
+ if todo_text[-1] not in splits:
781
+ todo_text += "。"
782
+ i_split_head = i_split_tail = 0
783
+ len_text = len(todo_text)
784
+ todo_texts = []
785
+ while 1:
786
+ if i_split_head >= len_text:
787
+ break # the text always ends with punctuation, so just break; the last segment was already appended
788
+ if todo_text[i_split_head] in splits:
789
+ i_split_head += 1
790
+ todo_texts.append(todo_text[i_split_tail:i_split_head])
791
+ i_split_tail = i_split_head
792
+ else:
793
+ i_split_head += 1
794
+ return todo_texts
795
+
796
+
797
+ def cut1(inp):
798
+ inp = inp.strip("\n")
799
+ inps = split(inp)
800
+ split_idx: list[int | None] = list(range(0, len(inps) + 1, 4))
801
+ split_idx[-1] = None
802
+ if len(split_idx) > 1:
803
+ opts = []
804
+ for idx in range(len(split_idx) - 1):
805
+ opts.append("".join(inps[split_idx[idx] : split_idx[idx + 1]]))
806
+ else:
807
+ opts = [inp]
808
+ opts = [item for item in opts if not set(item).issubset(punctuation)]
809
+ return "\n".join(opts)
810
+
811
+
812
+ def cut2(inp):
813
+ inp = inp.strip("\n")
814
+ inps = split(inp)
815
+ if len(inps) < 2:
816
+ return inp
817
+ opts = []
818
+ summ = 0
819
+ tmp_str = ""
820
+ for i in range(len(inps)):
821
+ summ += len(inps[i])
822
+ tmp_str += inps[i]
823
+ if summ > 50:
824
+ summ = 0
825
+ opts.append(tmp_str)
826
+ tmp_str = ""
827
+ if tmp_str != "":
828
+ opts.append(tmp_str)
829
+ if len(opts) > 1 and len(opts[-1]) < 50: # if the last chunk is too short, merge it into the previous one
830
+ opts[-2] = opts[-2] + opts[-1]
831
+ opts = opts[:-1]
832
+ opts = [item for item in opts if not set(item).issubset(punctuation)]
833
+ return "\n".join(opts)
834
+
835
+
836
+ def cut3(inp):
837
+ inp = inp.strip("\n")
838
+ opts = inp.strip("。").split("。")
839
+ opts = [item for item in opts if not set(item).issubset(punctuation)]
840
+ return "\n".join(opts)
841
+
842
+
843
+ def cut4(inp):
844
+ inp = inp.strip("\n")
845
+ opts = re.split(r"(?<!\d)\.(?!\d)", inp.strip("."))
846
+ opts = [item for item in opts if not set(item).issubset(punctuation)]
847
+ return "\n".join(opts)
848
+
849
+
850
+ # contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py
851
+ def cut5(inp):
852
+ inp = inp.strip("\n")
853
+ punds = {",", ".", ";", "?", "!", "、", ",", "。", "?", "!", ";", ":", "…"}
854
+ mergeitems = []
855
+ items = []
856
+
857
+ for i, char in enumerate(inp):
858
+ if char in punds:
859
+ if char == "." and i > 0 and i < len(inp) - 1 and inp[i - 1].isdigit() and inp[i + 1].isdigit():
860
+ items.append(char)
861
+ else:
862
+ items.append(char)
863
+ mergeitems.append("".join(items))
864
+ items = []
865
+ else:
866
+ items.append(char)
867
+
868
+ if items:
869
+ mergeitems.append("".join(items))
870
+
871
+ opt = [item for item in mergeitems if not set(item).issubset(punds)]
872
+ return "\n".join(opt)
873
+
874
+
875
+ def process_text(texts):
876
+ _text = []
877
+ if all(text in [None, " ", "\n", ""] for text in texts):
878
+ raise ValueError(i18n("请输入有效文本"))
879
+ for text in texts:
880
+ if text in [None, " ", ""]:
881
+ pass
882
+ else:
883
+ _text.append(text)
884
+ return _text
885
+
886
+
887
+ def html_center(text, label="p"):
888
+ return f"""<div style="text-align: center; margin: 100; padding: 50;">
889
+ <{label} style="margin: 0; padding: 0;">{text}</{label}>
890
+ </div>"""
891
+
892
+
893
+ def html_left(text, label="p"):
894
+ return f"""<div style="text-align: left; margin: 0; padding: 0;">
895
+ <{label} style="margin: 0; padding: 0;">{text}</{label}>
896
+ </div>"""
897
+
898
+
899
+ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css) as app:
900
+ gr.HTML(
901
+ top_html.format(
902
+ i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.")
903
+ + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.")
904
+ ),
905
+ elem_classes="markdown",
906
+ )
907
+ gr.Markdown(html_center(i18n("模型切换"), "h3"))
908
+ with gr.Row(equal_height=True):
909
+ with gr.Column(scale=2):
910
+ with gr.Row(equal_height=True):
911
+ GPT_dropdown = gr.Dropdown(
912
+ label=i18n("GPT模型列表"),
913
+ choices=GPT_names,
914
+ value=gpt_path,
915
+ interactive=True,
916
+ )
917
+ SoVITS_dropdown = gr.Dropdown(
918
+ label=i18n("SoVITS模型列表"),
919
+ choices=SoVITS_names,
920
+ value=sovits_path,
921
+ interactive=True,
922
+ )
923
+ with gr.Column(scale=1):
924
+ refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary", scale=14)
925
+ refresh_button.click(fn=change_choices_i18n, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
926
+ gr.Markdown(html_center(i18n("*请上传并填写参考信息"), "h3"))
927
+ with gr.Row(equal_height=True):
928
+ with gr.Column(scale=2):
929
+ with gr.Row(equal_height=True):
930
+ with gr.Column(scale=1):
931
+ inp_ref = gr.Audio(
932
+ label=i18n("请上传3~10秒内参考音频,超过会报错!"),
933
+ type="filepath",
934
+ sources="upload",
935
+ scale=13,
936
+ editable=False,
937
+ waveform_options={"show_recording_waveform": False},
938
+ )
939
+ with gr.Column(scale=1):
940
+ gr.Markdown(
941
+ html_center(
942
+ i18n("使用无参考文本模式时建议使用微调的GPT")
943
+ + "<br>"
944
+ + i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。")
945
+ )
946
+ )
947
+ ref_text_free = gr.Checkbox(
948
+ label=i18n("开启无参考文本模式"),
949
+ info=i18n("不填参考文本亦相当于开启") + ", " + i18n("v3暂不支持该模式,使用了会报错。"),
950
+ value=False,
951
+ interactive=True if model_version not in v3v4set else False,
952
+ show_label=True,
953
+ scale=1,
954
+ )
955
+ prompt_language = gr.Dropdown(
956
+ label="",
957
+ info=i18n("参考音频的语种"),
958
+ choices=list(dict_language.keys()),
959
+ value=i18n("中文"),
960
+ )
961
+ prompt_text = gr.Textbox(label="", info=i18n("参考音频的文本"), value="", lines=3, max_lines=3)
962
+
963
+ with gr.Column(scale=1):
964
+ inp_refs = (
965
+ gr.File(
966
+ label=i18n(
967
+ "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。"
968
+ ),
969
+ file_count="multiple",
970
+ )
971
+ if model_version not in v3v4set
972
+ else gr.File(
973
+ label=i18n(
974
+ "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。"
975
+ ),
976
+ file_count="multiple",
977
+ visible=False,
978
+ )
979
+ )
980
+ sample_steps = (
981
+ gr.Radio(
982
+ label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"),
983
+ value=32 if model_version == "v3" else 8,
984
+ choices=[4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32],
985
+ visible=True,
986
+ )
987
+ if model_version in v3v4set
988
+ else gr.Radio(
989
+ label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"),
990
+ choices=[4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32],
991
+ visible=False,
992
+ value=32 if model_version == "v3" else 8,
993
+ )
994
+ )
995
+ if_sr_Checkbox = gr.Checkbox(
996
+ label=i18n("v3输出如果觉得闷可以试试开超分"),
997
+ value=False,
998
+ interactive=True,
999
+ show_label=True,
1000
+ visible=False if model_version != "v3" else True,
1001
+ )
1002
+ gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"), "h3"))
1003
+ with gr.Row(equal_height=True):
1004
+ with gr.Column(scale=2):
1005
+ text = gr.Textbox(label=i18n("需要合成的文本"), value="", lines=30, max_lines=40)
1006
+ with gr.Column(scale=1):
1007
+ text_language = gr.Dropdown(
1008
+ label=i18n("需要合成的语种") + i18n(".限制范围越小判别效果越好。"),
1009
+ choices=list(dict_language.keys()),
1010
+ value=i18n("中文"),
1011
+ scale=1,
1012
+ )
1013
+ how_to_cut = gr.Dropdown(
1014
+ label=i18n("怎么切"),
1015
+ choices=[
1016
+ i18n("不切"),
1017
+ i18n("凑四句一切"),
1018
+ i18n("凑50字一切"),
1019
+ i18n("按中文句号。切"),
1020
+ i18n("按英文句号.切"),
1021
+ i18n("按标点符号切"),
1022
+ ],
1023
+ value=i18n("凑四句一切"),
1024
+ interactive=True,
1025
+ scale=1,
1026
+ )
1027
+ if_freeze = gr.Checkbox(
1028
+ label=i18n("是否直接对上次合成结果调整语速和音色"),
1029
+ value=False,
1030
+ interactive=True,
1031
+ show_label=True,
1032
+ scale=1,
1033
+ )
1034
+ with gr.Row(equal_height=True):
1035
+ speed = gr.Slider(
1036
+ minimum=0.6, maximum=1.65, step=0.05, label=i18n("语速"), value=1, interactive=True, scale=1
1037
+ )
1038
+ pause_second_slider = gr.Slider(
1039
+ minimum=0.1,
1040
+ maximum=0.5,
1041
+ step=0.01,
1042
+ label=i18n("句间停顿秒数"),
1043
+ value=0.3,
1044
+ interactive=True,
1045
+ scale=1,
1046
+ )
1047
+ gr.Markdown(html_center(i18n("GPT采样参数(不懂就用默认):")))
1048
+ top_k = gr.Slider(minimum=1, maximum=100, step=1, label=i18n("top_k"), value=15, interactive=True, scale=1)
1049
+ top_p = gr.Slider(minimum=0, maximum=1, step=0.05, label=i18n("top_p"), value=1, interactive=True, scale=1)
1050
+ temperature = gr.Slider(
1051
+ minimum=0, maximum=1, step=0.05, label=i18n("temperature"), value=1, interactive=True, scale=1
1052
+ )
1053
+ with gr.Row(equal_height=True):
1054
+ with gr.Column(scale=2):
1055
+ inference_button = gr.Button(value=i18n("合成语音"), variant="primary", size="lg")
1056
+ with gr.Column(scale=1):
1057
+ output = gr.Audio(
1058
+ label=i18n("输出的语音"),
1059
+ waveform_options={"show_recording_waveform": False},
1060
+ editable=False,
1061
+ )
1062
+
1063
+ inference_button.click(
1064
+ get_tts_wav,
1065
+ [
1066
+ inp_ref,
1067
+ prompt_text,
1068
+ prompt_language,
1069
+ text,
1070
+ text_language,
1071
+ how_to_cut,
1072
+ top_k,
1073
+ top_p,
1074
+ temperature,
1075
+ ref_text_free,
1076
+ speed,
1077
+ if_freeze,
1078
+ inp_refs,
1079
+ sample_steps,
1080
+ if_sr_Checkbox,
1081
+ pause_second_slider,
1082
+ ],
1083
+ [output],
1084
+ )
1085
+ SoVITS_dropdown.change(
1086
+ change_sovits_weights,
1087
+ [SoVITS_dropdown, prompt_language, text_language],
1088
+ [
1089
+ prompt_text,
1090
+ prompt_language,
1091
+ text,
1092
+ text_language,
1093
+ sample_steps,
1094
+ inp_refs,
1095
+ ref_text_free,
1096
+ if_sr_Checkbox,
1097
+ inference_button,
1098
+ ],
1099
+ )
1100
+ GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])
1101
+
1102
+
1103
+ if __name__ == "__main__":
1104
+ app.queue(api_open=False, default_concurrency_limit=1, max_size=1024).launch()
GPT_SoVITS/module/attentions.py ADDED
@@ -0,0 +1,658 @@
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch.nn import functional as F
6
+ from torch.nn.utils import remove_weight_norm
7
+ from torch.nn.utils.parametrizations import weight_norm
8
+
9
+ from . import commons
10
+ from .modules import LayerNorm
11
+
12
+
13
+ class Encoder(nn.Module):
14
+ def __init__(
15
+ self,
16
+ hidden_channels,
17
+ filter_channels,
18
+ n_heads,
19
+ n_layers,
20
+ kernel_size=1,
21
+ p_dropout=0.0,
22
+ window_size=4,
23
+ isflow=False,
24
+ **kwargs,
25
+ ):
26
+ super().__init__()
27
+ self.hidden_channels = hidden_channels
28
+ self.filter_channels = filter_channels
29
+ self.n_heads = n_heads
30
+ self.n_layers = n_layers
31
+ self.kernel_size = kernel_size
32
+ self.p_dropout = p_dropout
33
+ self.window_size = window_size
34
+
35
+ self.drop = nn.Dropout(p_dropout)
36
+ self.attn_layers = nn.ModuleList()
37
+ self.norm_layers_1 = nn.ModuleList()
38
+ self.ffn_layers = nn.ModuleList()
39
+ self.norm_layers_2 = nn.ModuleList()
40
+ for i in range(self.n_layers):
41
+ self.attn_layers.append(
42
+ MultiHeadAttention(
43
+ hidden_channels,
44
+ hidden_channels,
45
+ n_heads,
46
+ p_dropout=p_dropout,
47
+ window_size=window_size,
48
+ )
49
+ )
50
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
51
+ self.ffn_layers.append(
52
+ FFN(
53
+ hidden_channels,
54
+ hidden_channels,
55
+ filter_channels,
56
+ kernel_size,
57
+ p_dropout=p_dropout,
58
+ )
59
+ )
60
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
61
+ if isflow:
62
+ cond_layer = torch.nn.Conv1d(kwargs["gin_channels"], 2 * hidden_channels * n_layers, 1)
63
+ self.cond_pre = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, 1)
64
+ self.cond_layer = weight_norm_modules(cond_layer, name="weight")
65
+ self.gin_channels = kwargs["gin_channels"]
66
+
67
+ def forward(self, x, x_mask, g=None):
68
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
69
+ x = x * x_mask
70
+ if g is not None:
71
+ g = self.cond_layer(g)
72
+
73
+ for i in range(self.n_layers):
74
+ if g is not None:
75
+ x = self.cond_pre(x)
76
+ cond_offset = i * 2 * self.hidden_channels
77
+ g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
78
+ x = commons.fused_add_tanh_sigmoid_multiply(x, g_l, torch.IntTensor([self.hidden_channels]))
79
+ y = self.attn_layers[i](x, x, attn_mask)
80
+ y = self.drop(y)
81
+ x = self.norm_layers_1[i](x + y)
82
+
83
+ y = self.ffn_layers[i](x, x_mask)
84
+ y = self.drop(y)
85
+ x = self.norm_layers_2[i](x + y)
86
+ x = x * x_mask
87
+ return x
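A minimal forward-pass sketch for the Encoder above (illustrative only; it assumes the GPT_SoVITS/module package from this commit, including modules.LayerNorm, is importable and that torch is installed). Tensors follow the [batch, channels, time] layout used throughout this file.

    import torch
    from GPT_SoVITS.module.attentions import Encoder

    enc = Encoder(hidden_channels=192, filter_channels=768, n_heads=2, n_layers=3, kernel_size=5).eval()
    x = torch.randn(1, 192, 50)     # [batch, hidden_channels, time]
    x_mask = torch.ones(1, 1, 50)   # 1 = valid frame, 0 = padding
    with torch.no_grad():
        y = enc(x, x_mask)
    print(y.shape)                  # torch.Size([1, 192, 50])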
88
+
89
+
90
+ class Decoder(nn.Module):
91
+ def __init__(
92
+ self,
93
+ hidden_channels,
94
+ filter_channels,
95
+ n_heads,
96
+ n_layers,
97
+ kernel_size=1,
98
+ p_dropout=0.0,
99
+ proximal_bias=False,
100
+ proximal_init=True,
101
+ **kwargs,
102
+ ):
103
+ super().__init__()
104
+ self.hidden_channels = hidden_channels
105
+ self.filter_channels = filter_channels
106
+ self.n_heads = n_heads
107
+ self.n_layers = n_layers
108
+ self.kernel_size = kernel_size
109
+ self.p_dropout = p_dropout
110
+ self.proximal_bias = proximal_bias
111
+ self.proximal_init = proximal_init
112
+
113
+ self.drop = nn.Dropout(p_dropout)
114
+ self.self_attn_layers = nn.ModuleList()
115
+ self.norm_layers_0 = nn.ModuleList()
116
+ self.encdec_attn_layers = nn.ModuleList()
117
+ self.norm_layers_1 = nn.ModuleList()
118
+ self.ffn_layers = nn.ModuleList()
119
+ self.norm_layers_2 = nn.ModuleList()
120
+ for i in range(self.n_layers):
121
+ self.self_attn_layers.append(
122
+ MultiHeadAttention(
123
+ hidden_channels,
124
+ hidden_channels,
125
+ n_heads,
126
+ p_dropout=p_dropout,
127
+ proximal_bias=proximal_bias,
128
+ proximal_init=proximal_init,
129
+ )
130
+ )
131
+ self.norm_layers_0.append(LayerNorm(hidden_channels))
132
+ self.encdec_attn_layers.append(
133
+ MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)
134
+ )
135
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
136
+ self.ffn_layers.append(
137
+ FFN(
138
+ hidden_channels,
139
+ hidden_channels,
140
+ filter_channels,
141
+ kernel_size,
142
+ p_dropout=p_dropout,
143
+ causal=True,
144
+ )
145
+ )
146
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
147
+
148
+ def forward(self, x, x_mask, h, h_mask):
149
+ """
150
+ x: decoder input
151
+ h: encoder output
152
+ """
153
+ self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
154
+ encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
155
+ x = x * x_mask
156
+ for i in range(self.n_layers):
157
+ y = self.self_attn_layers[i](x, x, self_attn_mask)
158
+ y = self.drop(y)
159
+ x = self.norm_layers_0[i](x + y)
160
+
161
+ y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
162
+ y = self.drop(y)
163
+ x = self.norm_layers_1[i](x + y)
164
+
165
+ y = self.ffn_layers[i](x, x_mask)
166
+ y = self.drop(y)
167
+ x = self.norm_layers_2[i](x + y)
168
+ x = x * x_mask
169
+ return x
170
+
171
+
172
+ class MultiHeadAttention(nn.Module):
173
+ def __init__(
174
+ self,
175
+ channels,
176
+ out_channels,
177
+ n_heads,
178
+ p_dropout=0.0,
179
+ window_size=None,
180
+ heads_share=True,
181
+ block_length=None,
182
+ proximal_bias=False,
183
+ proximal_init=False,
184
+ ):
185
+ super().__init__()
186
+ assert channels % n_heads == 0
187
+
188
+ self.channels = channels
189
+ self.out_channels = out_channels
190
+ self.n_heads = n_heads
191
+ self.p_dropout = p_dropout
192
+ self.window_size = window_size
193
+ self.heads_share = heads_share
194
+ self.block_length = block_length
195
+ self.proximal_bias = proximal_bias
196
+ self.proximal_init = proximal_init
197
+ self.attn = None
198
+
199
+ self.k_channels = channels // n_heads
200
+ self.conv_q = nn.Conv1d(channels, channels, 1)
201
+ self.conv_k = nn.Conv1d(channels, channels, 1)
202
+ self.conv_v = nn.Conv1d(channels, channels, 1)
203
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
204
+ self.drop = nn.Dropout(p_dropout)
205
+
206
+ if window_size is not None:
207
+ n_heads_rel = 1 if heads_share else n_heads
208
+ rel_stddev = self.k_channels**-0.5
209
+ self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
210
+ self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
211
+
212
+ nn.init.xavier_uniform_(self.conv_q.weight)
213
+ nn.init.xavier_uniform_(self.conv_k.weight)
214
+ nn.init.xavier_uniform_(self.conv_v.weight)
215
+ if proximal_init:
216
+ with torch.no_grad():
217
+ self.conv_k.weight.copy_(self.conv_q.weight)
218
+ self.conv_k.bias.copy_(self.conv_q.bias)
219
+
220
+ def forward(self, x, c, attn_mask=None):
221
+ q = self.conv_q(x)
222
+ k = self.conv_k(c)
223
+ v = self.conv_v(c)
224
+
225
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
226
+
227
+ x = self.conv_o(x)
228
+ return x
229
+
230
+ def attention(self, query, key, value, mask=None):
231
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
232
+ b, d, t_s, t_t = (*key.size(), query.size(2))
233
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
234
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
235
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
236
+
237
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
238
+ if self.window_size is not None:
239
+ assert t_s == t_t, "Relative attention is only available for self-attention."
240
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
241
+ rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
242
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
243
+ scores = scores + scores_local
244
+ if self.proximal_bias:
245
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
246
+ scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
247
+ if mask is not None:
248
+ scores = scores.masked_fill(mask == 0, -1e4)
249
+ if self.block_length is not None:
250
+ assert t_s == t_t, "Local attention is only available for self-attention."
251
+ block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
252
+ scores = scores.masked_fill(block_mask == 0, -1e4)
253
+ p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
254
+ p_attn = self.drop(p_attn)
255
+ output = torch.matmul(p_attn, value)
256
+ if self.window_size is not None:
257
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
258
+ value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
259
+ output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
260
+ output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
261
+ return output, p_attn
262
+
263
+ def _matmul_with_relative_values(self, x, y):
264
+ """
265
+ x: [b, h, l, m]
266
+ y: [h or 1, m, d]
267
+ ret: [b, h, l, d]
268
+ """
269
+ ret = torch.matmul(x, y.unsqueeze(0))
270
+ return ret
271
+
272
+ def _matmul_with_relative_keys(self, x, y):
273
+ """
274
+ x: [b, h, l, d]
275
+ y: [h or 1, m, d]
276
+ ret: [b, h, l, m]
277
+ """
278
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
279
+ return ret
280
+
281
+ def _get_relative_embeddings(self, relative_embeddings, length):
282
+ max_relative_position = 2 * self.window_size + 1
283
+ # Pad first before slice to avoid using cond ops.
284
+ pad_length = max(length - (self.window_size + 1), 0)
285
+ slice_start_position = max((self.window_size + 1) - length, 0)
286
+ slice_end_position = slice_start_position + 2 * length - 1
287
+ if pad_length > 0:
288
+ padded_relative_embeddings = F.pad(
289
+ relative_embeddings,
290
+ commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
291
+ )
292
+ else:
293
+ padded_relative_embeddings = relative_embeddings
294
+ used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
295
+ return used_relative_embeddings
296
+
297
+ def _relative_position_to_absolute_position(self, x):
298
+ """
299
+ x: [b, h, l, 2*l-1]
300
+ ret: [b, h, l, l]
301
+ """
302
+ batch, heads, length, _ = x.size()
303
+ # Concat columns of pad to shift from relative to absolute indexing.
304
+ x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
305
+
306
+ # Concat extra elements so as to add up to shape (len+1, 2*len-1).
307
+ x_flat = x.view([batch, heads, length * 2 * length])
308
+ x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))
309
+
310
+ # Reshape and slice out the padded elements.
311
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1 :]
312
+ return x_final
313
+
314
+ def _absolute_position_to_relative_position(self, x):
315
+ """
316
+ x: [b, h, l, l]
317
+ ret: [b, h, l, 2*l-1]
318
+ """
319
+ batch, heads, length, _ = x.size()
320
+ # pad along column
321
+ x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
322
+ x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
323
+ # add 0's in the beginning that will skew the elements after reshape
324
+ x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
325
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
326
+ return x_final
327
+
328
+ def _attention_bias_proximal(self, length):
329
+ """Bias for self-attention to encourage attention to close positions.
330
+ Args:
331
+ length: an integer scalar.
332
+ Returns:
333
+ a Tensor with shape [1, 1, length, length]
334
+ """
335
+ r = torch.arange(length, dtype=torch.float32)
336
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
337
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
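A shape sanity check for the relative-position attention above (a sketch with arbitrary sizes, assuming the package from this commit is importable). forward(x, c, attn_mask) takes the query source and the key/value source separately, so self-attention simply passes the same tensor twice; window_size enables the learned relative-position embeddings.

    import torch
    from GPT_SoVITS.module.attentions import MultiHeadAttention

    attn = MultiHeadAttention(channels=192, out_channels=192, n_heads=2, window_size=4)
    x = torch.randn(1, 192, 10)            # [batch, channels, time]
    attn_mask = torch.ones(1, 1, 10, 10)   # 1 = attend, 0 = masked out
    y = attn(x, x, attn_mask=attn_mask)    # self-attention: same tensor as query and key/value source
    print(y.shape)                         # torch.Size([1, 192, 10])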
338
+
339
+
340
+ class FFN(nn.Module):
341
+ def __init__(
342
+ self,
343
+ in_channels,
344
+ out_channels,
345
+ filter_channels,
346
+ kernel_size,
347
+ p_dropout=0.0,
348
+ activation=None,
349
+ causal=False,
350
+ ):
351
+ super().__init__()
352
+ self.in_channels = in_channels
353
+ self.out_channels = out_channels
354
+ self.filter_channels = filter_channels
355
+ self.kernel_size = kernel_size
356
+ self.p_dropout = p_dropout
357
+ self.activation = activation
358
+ self.causal = causal
359
+
360
+ if causal:
361
+ self.padding = self._causal_padding
362
+ else:
363
+ self.padding = self._same_padding
364
+
365
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
366
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
367
+ self.drop = nn.Dropout(p_dropout)
368
+
369
+ def forward(self, x, x_mask):
370
+ x = self.conv_1(self.padding(x * x_mask))
371
+ if self.activation == "gelu":
372
+ x = x * torch.sigmoid(1.702 * x)
373
+ else:
374
+ x = torch.relu(x)
375
+ x = self.drop(x)
376
+ x = self.conv_2(self.padding(x * x_mask))
377
+ return x * x_mask
378
+
379
+ def _causal_padding(self, x):
380
+ if self.kernel_size == 1:
381
+ return x
382
+ pad_l = self.kernel_size - 1
383
+ pad_r = 0
384
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
385
+ x = F.pad(x, commons.convert_pad_shape(padding))
386
+ return x
387
+
388
+ def _same_padding(self, x):
389
+ if self.kernel_size == 1:
390
+ return x
391
+ pad_l = (self.kernel_size - 1) // 2
392
+ pad_r = self.kernel_size // 2
393
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
394
+ x = F.pad(x, commons.convert_pad_shape(padding))
395
+ return x
396
+
397
+
398
+ class Depthwise_Separable_Conv1D(nn.Module):
399
+ def __init__(
400
+ self,
401
+ in_channels,
402
+ out_channels,
403
+ kernel_size,
404
+ stride=1,
405
+ padding=0,
406
+ dilation=1,
407
+ bias=True,
408
+ padding_mode="zeros", # TODO: refine this type
409
+ device=None,
410
+ dtype=None,
411
+ ):
412
+ super().__init__()
413
+ self.depth_conv = nn.Conv1d(
414
+ in_channels=in_channels,
415
+ out_channels=in_channels,
416
+ kernel_size=kernel_size,
417
+ groups=in_channels,
418
+ stride=stride,
419
+ padding=padding,
420
+ dilation=dilation,
421
+ bias=bias,
422
+ padding_mode=padding_mode,
423
+ device=device,
424
+ dtype=dtype,
425
+ )
426
+ self.point_conv = nn.Conv1d(
427
+ in_channels=in_channels,
428
+ out_channels=out_channels,
429
+ kernel_size=1,
430
+ bias=bias,
431
+ device=device,
432
+ dtype=dtype,
433
+ )
434
+
435
+ def forward(self, input):
436
+ return self.point_conv(self.depth_conv(input))
437
+
438
+ def weight_norm(self):
439
+ self.depth_conv = weight_norm(self.depth_conv, name="weight")
440
+ self.point_conv = weight_norm(self.point_conv, name="weight")
441
+
442
+ def remove_weight_norm(self):
443
+ self.depth_conv = remove_weight_norm(self.depth_conv, name="weight")
444
+ self.point_conv = remove_weight_norm(self.point_conv, name="weight")
445
+
446
+
447
+ class Depthwise_Separable_TransposeConv1D(nn.Module):
448
+ def __init__(
449
+ self,
450
+ in_channels,
451
+ out_channels,
452
+ kernel_size,
453
+ stride=1,
454
+ padding=0,
455
+ output_padding=0,
456
+ bias=True,
457
+ dilation=1,
458
+ padding_mode="zeros", # TODO: refine this type
459
+ device=None,
460
+ dtype=None,
461
+ ):
462
+ super().__init__()
463
+ self.depth_conv = nn.ConvTranspose1d(
464
+ in_channels=in_channels,
465
+ out_channels=in_channels,
466
+ kernel_size=kernel_size,
467
+ groups=in_channels,
468
+ stride=stride,
469
+ output_padding=output_padding,
470
+ padding=padding,
471
+ dilation=dilation,
472
+ bias=bias,
473
+ padding_mode=padding_mode,
474
+ device=device,
475
+ dtype=dtype,
476
+ )
477
+ self.point_conv = nn.Conv1d(
478
+ in_channels=in_channels,
479
+ out_channels=out_channels,
480
+ kernel_size=1,
481
+ bias=bias,
482
+ device=device,
483
+ dtype=dtype,
484
+ )
485
+
486
+ def forward(self, input):
487
+ return self.point_conv(self.depth_conv(input))
488
+
489
+ def weight_norm(self):
490
+ self.depth_conv = weight_norm(self.depth_conv, name="weight")
491
+ self.point_conv = weight_norm(self.point_conv, name="weight")
492
+
493
+ def remove_weight_norm(self):
494
+ remove_weight_norm(self.depth_conv, name="weight")
495
+ remove_weight_norm(self.point_conv, name="weight")
496
+
497
+
498
+ def weight_norm_modules(module, name="weight", dim=0):
499
+ if isinstance(module, Depthwise_Separable_Conv1D) or isinstance(module, Depthwise_Separable_TransposeConv1D):
500
+ module.weight_norm()
501
+ return module
502
+ else:
503
+ return weight_norm(module, name, dim)
504
+
505
+
506
+ def remove_weight_norm_modules(module, name="weight"):
507
+ if isinstance(module, Depthwise_Separable_Conv1D) or isinstance(module, Depthwise_Separable_TransposeConv1D):
508
+ module.remove_weight_norm()
509
+ else:
510
+ remove_weight_norm(module, name)
511
+
512
+
513
+ class FFT(nn.Module):
514
+ def __init__(
515
+ self,
516
+ hidden_channels,
517
+ filter_channels,
518
+ n_heads,
519
+ n_layers=1,
520
+ kernel_size=1,
521
+ p_dropout=0.0,
522
+ proximal_bias=False,
523
+ proximal_init=True,
524
+ isflow=False,
525
+ **kwargs,
526
+ ):
527
+ super().__init__()
528
+ self.hidden_channels = hidden_channels
529
+ self.filter_channels = filter_channels
530
+ self.n_heads = n_heads
531
+ self.n_layers = n_layers
532
+ self.kernel_size = kernel_size
533
+ self.p_dropout = p_dropout
534
+ self.proximal_bias = proximal_bias
535
+ self.proximal_init = proximal_init
536
+ if isflow:
537
+ cond_layer = torch.nn.Conv1d(kwargs["gin_channels"], 2 * hidden_channels * n_layers, 1)
538
+ self.cond_pre = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, 1)
539
+ self.cond_layer = weight_norm_modules(cond_layer, name="weight")
540
+ self.gin_channels = kwargs["gin_channels"]
541
+ self.drop = nn.Dropout(p_dropout)
542
+ self.self_attn_layers = nn.ModuleList()
543
+ self.norm_layers_0 = nn.ModuleList()
544
+ self.ffn_layers = nn.ModuleList()
545
+ self.norm_layers_1 = nn.ModuleList()
546
+ for i in range(self.n_layers):
547
+ self.self_attn_layers.append(
548
+ MultiHeadAttention(
549
+ hidden_channels,
550
+ hidden_channels,
551
+ n_heads,
552
+ p_dropout=p_dropout,
553
+ proximal_bias=proximal_bias,
554
+ proximal_init=proximal_init,
555
+ )
556
+ )
557
+ self.norm_layers_0.append(LayerNorm(hidden_channels))
558
+ self.ffn_layers.append(
559
+ FFN(
560
+ hidden_channels,
561
+ hidden_channels,
562
+ filter_channels,
563
+ kernel_size,
564
+ p_dropout=p_dropout,
565
+ causal=True,
566
+ )
567
+ )
568
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
569
+
570
+ def forward(self, x, x_mask, g=None):
571
+ """
572
+ x: decoder input
573
+ g: optional conditioning tensor (projected from gin_channels)
574
+ """
575
+ if g is not None:
576
+ g = self.cond_layer(g)
577
+
578
+ self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
579
+ x = x * x_mask
580
+ for i in range(self.n_layers):
581
+ if g is not None:
582
+ x = self.cond_pre(x)
583
+ cond_offset = i * 2 * self.hidden_channels
584
+ g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
585
+ x = commons.fused_add_tanh_sigmoid_multiply(x, g_l, torch.IntTensor([self.hidden_channels]))
586
+ y = self.self_attn_layers[i](x, x, self_attn_mask)
587
+ y = self.drop(y)
588
+ x = self.norm_layers_0[i](x + y)
589
+
590
+ y = self.ffn_layers[i](x, x_mask)
591
+ y = self.drop(y)
592
+ x = self.norm_layers_1[i](x + y)
593
+ x = x * x_mask
594
+ return x
595
+
596
+
597
+ class TransformerCouplingLayer(nn.Module):
598
+ def __init__(
599
+ self,
600
+ channels,
601
+ hidden_channels,
602
+ kernel_size,
603
+ n_layers,
604
+ n_heads,
605
+ p_dropout=0,
606
+ filter_channels=0,
607
+ mean_only=False,
608
+ wn_sharing_parameter=None,
609
+ gin_channels=0,
610
+ ):
611
+ assert channels % 2 == 0, "channels should be divisible by 2"
612
+ super().__init__()
613
+ self.channels = channels
614
+ self.hidden_channels = hidden_channels
615
+ self.kernel_size = kernel_size
616
+ self.n_layers = n_layers
617
+ self.half_channels = channels // 2
618
+ self.mean_only = mean_only
619
+
620
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
621
+ self.enc = (
622
+ Encoder(
623
+ hidden_channels,
624
+ filter_channels,
625
+ n_heads,
626
+ n_layers,
627
+ kernel_size,
628
+ p_dropout,
629
+ isflow=True,
630
+ gin_channels=gin_channels,
631
+ )
632
+ if wn_sharing_parameter is None
633
+ else wn_sharing_parameter
634
+ )
635
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
636
+ self.post.weight.data.zero_()
637
+ self.post.bias.data.zero_()
638
+
639
+ def forward(self, x, x_mask, g=None, reverse=False):
640
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
641
+ h = self.pre(x0) * x_mask
642
+ h = self.enc(h, x_mask, g=g)
643
+ stats = self.post(h) * x_mask
644
+ if not self.mean_only:
645
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
646
+ else:
647
+ m = stats
648
+ logs = torch.zeros_like(m)
649
+
650
+ if not reverse:
651
+ x1 = m + x1 * torch.exp(logs) * x_mask
652
+ x = torch.cat([x0, x1], 1)
653
+ logdet = torch.sum(logs, [1, 2])
654
+ return x, logdet
655
+ else:
656
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
657
+ x = torch.cat([x0, x1], 1)
658
+ return x
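The coupling layer above is invertible by construction: half of the channels pass through unchanged and parameterize an affine transform of the other half, so the reverse pass undoes the forward pass without any matrix inversion. A quick check (a sketch with arbitrary hyperparameters; gin_channels is set to a nonzero value only so the optional conditioning branch can be constructed):

    import torch
    from GPT_SoVITS.module.attentions import TransformerCouplingLayer

    flow = TransformerCouplingLayer(
        channels=192, hidden_channels=192, kernel_size=5,
        n_layers=2, n_heads=2, filter_channels=768, gin_channels=256,
    ).eval()
    x = torch.randn(1, 192, 20)
    x_mask = torch.ones(1, 1, 20)
    with torch.no_grad():
        z, logdet = flow(x, x_mask)             # forward: x -> z, plus the log-determinant
        x_rec = flow(z, x_mask, reverse=True)   # reverse: z -> x
    print(torch.allclose(x, x_rec, atol=1e-5))  # True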
GPT_SoVITS/module/attentions_onnx.py ADDED
@@ -0,0 +1,385 @@
1
+ import logging
+ import math
2
+ from typing import Optional
3
+
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+ from . import commons
9
+
10
+
11
+ class LayerNorm(nn.Module):
12
+ def __init__(self, channels, eps=1e-5):
13
+ super().__init__()
14
+ self.channels = channels
15
+ self.eps = eps
16
+
17
+ self.gamma = nn.Parameter(torch.ones(channels))
18
+ self.beta = nn.Parameter(torch.zeros(channels))
19
+
20
+ def forward(self, x):
21
+ x = x.transpose(1, -1)
22
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
23
+ return x.transpose(1, -1)
24
+
25
+
26
+ @torch.jit.script
27
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
28
+ n_channels_int = n_channels[0]
29
+ in_act = input_a + input_b
30
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
31
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
32
+ acts = t_act * s_act
33
+ return acts
34
+
35
+
36
+ class Encoder(nn.Module):
37
+ def __init__(
38
+ self,
39
+ hidden_channels,
40
+ filter_channels,
41
+ n_heads,
42
+ n_layers,
43
+ kernel_size=1,
44
+ p_dropout=0.0,
45
+ window_size=4,
46
+ isflow=True,
47
+ **kwargs,
48
+ ):
49
+ super().__init__()
50
+ self.hidden_channels = hidden_channels
51
+ self.filter_channels = filter_channels
52
+ self.n_heads = n_heads
53
+ self.n_layers = n_layers
54
+ self.kernel_size = kernel_size
55
+ self.p_dropout = p_dropout
56
+ self.window_size = window_size
57
+ # if isflow:
58
+ # cond_layer = torch.nn.Conv1d(256, 2*hidden_channels*n_layers, 1)
59
+ # self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1)
60
+ # self.cond_layer = weight_norm(cond_layer, name='weight')
61
+ # self.gin_channels = 256
62
+ self.cond_layer_idx = self.n_layers
63
+ self.spk_emb_linear = nn.Linear(256, self.hidden_channels)
64
+ if "gin_channels" in kwargs:
65
+ self.gin_channels = kwargs["gin_channels"]
66
+ if self.gin_channels != 0:
67
+ self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
68
+ # vits2 says 3rd block, so idx is 2 by default
69
+ self.cond_layer_idx = kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2
70
+ logging.debug("gin_channels: %s, cond_layer_idx: %s", self.gin_channels, self.cond_layer_idx)
71
+ assert self.cond_layer_idx < self.n_layers, "cond_layer_idx should be less than n_layers"
72
+ self.drop = nn.Dropout(p_dropout)
73
+ self.attn_layers = nn.ModuleList()
74
+ self.norm_layers_1 = nn.ModuleList()
75
+ self.ffn_layers = nn.ModuleList()
76
+ self.norm_layers_2 = nn.ModuleList()
77
+ for i in range(self.n_layers):
78
+ self.attn_layers.append(
79
+ MultiHeadAttention(
80
+ hidden_channels,
81
+ hidden_channels,
82
+ n_heads,
83
+ p_dropout=p_dropout,
84
+ window_size=window_size,
85
+ )
86
+ )
87
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
88
+ self.ffn_layers.append(
89
+ FFN(
90
+ hidden_channels,
91
+ hidden_channels,
92
+ filter_channels,
93
+ kernel_size,
94
+ p_dropout=p_dropout,
95
+ )
96
+ )
97
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
98
+
99
+ # def forward(self, x, x_mask, g=None):
100
+ # attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
101
+ # x = x * x_mask
102
+ # for i in range(self.n_layers):
103
+ # if i == self.cond_layer_idx and g is not None:
104
+ # g = self.spk_emb_linear(g.transpose(1, 2))
105
+ # g = g.transpose(1, 2)
106
+ # x = x + g
107
+ # x = x * x_mask
108
+ # y = self.attn_layers[i](x, x, attn_mask)
109
+ # y = self.drop(y)
110
+ # x = self.norm_layers_1[i](x + y)
111
+
112
+ # y = self.ffn_layers[i](x, x_mask)
113
+ # y = self.drop(y)
114
+ # x = self.norm_layers_2[i](x + y)
115
+ # x = x * x_mask
116
+ # return x
117
+
118
+ def forward(self, x, x_mask):
119
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
120
+ x = x * x_mask
121
+ for attn_layers, norm_layers_1, ffn_layers, norm_layers_2 in zip(
122
+ self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2
123
+ ):
124
+ y = attn_layers(x, x, attn_mask)
125
+ y = self.drop(y)
126
+ x = norm_layers_1(x + y)
127
+
128
+ y = ffn_layers(x, x_mask)
129
+ y = self.drop(y)
130
+ x = norm_layers_2(x + y)
131
+ x = x * x_mask
132
+ return x
133
+
134
+
135
+ class MultiHeadAttention(nn.Module):
136
+ def __init__(
137
+ self,
138
+ channels,
139
+ out_channels,
140
+ n_heads,
141
+ p_dropout=0.0,
142
+ window_size=None,
143
+ heads_share=True,
144
+ block_length=None,
145
+ proximal_bias=False,
146
+ proximal_init=False,
147
+ ):
148
+ super().__init__()
149
+ assert channels % n_heads == 0
150
+
151
+ self.channels = channels
152
+ self.out_channels = out_channels
153
+ self.n_heads = n_heads
154
+ self.p_dropout = p_dropout
155
+ self.window_size = window_size
156
+ self.heads_share = heads_share
157
+ self.block_length = block_length
158
+ self.proximal_bias = proximal_bias
159
+ self.proximal_init = proximal_init
160
+ self.attn = None
161
+
162
+ self.k_channels = channels // n_heads
163
+ self.conv_q = nn.Conv1d(channels, channels, 1)
164
+ self.conv_k = nn.Conv1d(channels, channels, 1)
165
+ self.conv_v = nn.Conv1d(channels, channels, 1)
166
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
167
+ self.drop = nn.Dropout(p_dropout)
168
+
169
+ if window_size is not None:
170
+ n_heads_rel = 1 if heads_share else n_heads
171
+ rel_stddev = self.k_channels**-0.5
172
+ self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
173
+ self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
174
+
175
+ nn.init.xavier_uniform_(self.conv_q.weight)
176
+ nn.init.xavier_uniform_(self.conv_k.weight)
177
+ nn.init.xavier_uniform_(self.conv_v.weight)
178
+ if proximal_init:
179
+ with torch.no_grad():
180
+ self.conv_k.weight.copy_(self.conv_q.weight)
181
+ self.conv_k.bias.copy_(self.conv_q.bias)
182
+
183
+ def forward(self, x, c, attn_mask: Optional[torch.Tensor] = None):
184
+ q = self.conv_q(x)
185
+ k = self.conv_k(c)
186
+ v = self.conv_v(c)
187
+
188
+ # x, self.attn = self.attention(q, k, v, mask=attn_mask)
189
+ x, _ = self.attention(q, k, v, mask=attn_mask)
190
+
191
+ x = self.conv_o(x)
192
+ return x
193
+
194
+ def attention(self, query, key, value, mask: Optional[torch.Tensor] = None):
195
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
196
+ b, d, t_s, _ = (*key.size(), query.size(2))
197
+ query = query.view(b, self.n_heads, self.k_channels, -1).transpose(2, 3)
198
+ key = key.view(b, self.n_heads, self.k_channels, -1).transpose(2, 3)
199
+ value = value.view(b, self.n_heads, self.k_channels, -1).transpose(2, 3)
200
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
201
+
202
+ if self.window_size is not None:
203
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
204
+ rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
205
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
206
+ scores = scores + scores_local
207
+
208
+ if mask is not None:
209
+ scores = scores.masked_fill(mask == 0, -1e4)
210
+
211
+ p_attn = F.softmax(scores, dim=-1)
212
+ p_attn = self.drop(p_attn)
213
+ output = torch.matmul(p_attn, value)
214
+
215
+ if self.window_size is not None:
216
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
217
+ value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
218
+ output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
219
+
220
+ output = output.transpose(2, 3).contiguous().view(b, d, -1)
221
+ return output, p_attn
222
+
223
+ def _matmul_with_relative_values(self, x, y):
224
+ """
225
+ x: [b, h, l, m]
226
+ y: [h or 1, m, d]
227
+ ret: [b, h, l, d]
228
+ """
229
+ ret = torch.matmul(x, y.unsqueeze(0))
230
+ return ret
231
+
232
+ def _matmul_with_relative_keys(self, x, y):
233
+ """
234
+ x: [b, h, l, d]
235
+ y: [h or 1, m, d]
236
+ ret: [b, h, l, m]
237
+ """
238
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
239
+ return ret
240
+
241
+ def _get_relative_embeddings(self, relative_embeddings, length):
242
+ max_relative_position = 2 * self.window_size + 1
243
+ # Pad first before slice to avoid using cond ops.
244
+ pad_l = torch.zeros((1), dtype=torch.int64) + length - (self.window_size + 1)
245
+ pad_s = torch.zeros((1), dtype=torch.int64) + (self.window_size + 1) - length
246
+ pad_length = torch.max(pad_l, other=torch.zeros((1), dtype=torch.int64))
247
+ slice_start_position = torch.max(pad_s, other=torch.zeros((1), dtype=torch.int64))
248
+
249
+ slice_end_position = slice_start_position + 2 * length - 1
250
+ padded_relative_embeddings = F.pad(
251
+ relative_embeddings,
252
+ commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
253
+ )
254
+ used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
255
+ return used_relative_embeddings
256
+
257
+ def _relative_position_to_absolute_position(self, x):
258
+ """
259
+ x: [b, h, l, 2*l-1]
260
+ ret: [b, h, l, l]
261
+ """
262
+ batch, heads, length, _ = x.size()
263
+ # Concat columns of pad to shift from relative to absolute indexing.
264
+ x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
265
+
266
+ # Concat extra elements so as to add up to shape (len+1, 2*len-1).
267
+ x_flat = x.view([batch, heads, length * 2 * length])
268
+ x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))
269
+
270
+ # Reshape and slice out the padded elements.
271
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1 :]
272
+ return x_final
273
+
274
+ def _absolute_position_to_relative_position(self, x):
275
+ """
276
+ x: [b, h, l, l]
277
+ ret: [b, h, l, 2*l-1]
278
+ """
279
+ batch, heads, length, _ = x.size()
280
+ # pad along column
281
+ x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
282
+ x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
283
+ # add 0's in the beginning that will skew the elements after reshape
284
+ x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
285
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
286
+ return x_final
287
+
288
+ def _attention_bias_proximal(self, length):
289
+ """Bias for self-attention to encourage attention to close positions.
290
+ Args:
291
+ length: an integer scalar.
292
+ Returns:
293
+ a Tensor with shape [1, 1, length, length]
294
+ """
295
+ r = torch.arange(length, dtype=torch.float32)
296
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
297
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
298
+
299
+
300
+ class FFN(nn.Module):
301
+ def __init__(
302
+ self,
303
+ in_channels,
304
+ out_channels,
305
+ filter_channels,
306
+ kernel_size,
307
+ p_dropout=0.0,
308
+ activation="",
309
+ causal=False,
310
+ ):
311
+ super().__init__()
312
+ self.in_channels = in_channels
313
+ self.out_channels = out_channels
314
+ self.filter_channels = filter_channels
315
+ self.kernel_size = kernel_size
316
+ self.p_dropout = p_dropout
317
+ self.activation = activation
318
+ self.causal = causal
319
+
320
+ # Judging from the surrounding context, this is always False here
321
+ # if causal:
322
+ # self.padding = self._causal_padding
323
+ # else:
324
+ # self.padding = self._same_padding
325
+
326
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
327
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
328
+ self.drop = nn.Dropout(p_dropout)
329
+
330
+ def forward(self, x, x_mask):
331
+ x = self.conv_1(self.padding(x * x_mask))
332
+ if self.activation == "gelu":
333
+ x = x * torch.sigmoid(1.702 * x)
334
+ else:
335
+ x = torch.relu(x)
336
+ x = self.drop(x)
337
+ x = self.conv_2(self.padding(x * x_mask))
338
+ return x * x_mask
339
+
340
+ def padding(self, x):
341
+ return self._same_padding(x)
342
+
343
+ def _causal_padding(self, x):
344
+ if self.kernel_size == 1:
345
+ return x
346
+ pad_l = self.kernel_size - 1
347
+ pad_r = 0
348
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
349
+ x = F.pad(x, commons.convert_pad_shape(padding))
350
+ return x
351
+
352
+ def _same_padding(self, x):
353
+ if self.kernel_size == 1:
354
+ return x
355
+ pad_l = (self.kernel_size - 1) // 2
356
+ pad_r = self.kernel_size // 2
357
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
358
+ x = F.pad(x, commons.convert_pad_shape(padding))
359
+ return x
360
+
361
+
362
+ class MRTE(nn.Module):
363
+ def __init__(
364
+ self,
365
+ content_enc_channels=192,
366
+ hidden_size=512,
367
+ out_channels=192,
368
+ kernel_size=5,
369
+ n_heads=4,
370
+ ge_layer=2,
371
+ ):
372
+ super(MRTE, self).__init__()
373
+ self.cross_attention = MultiHeadAttention(hidden_size, hidden_size, n_heads)
374
+ self.c_pre = nn.Conv1d(content_enc_channels, hidden_size, 1)
375
+ self.text_pre = nn.Conv1d(content_enc_channels, hidden_size, 1)
376
+ self.c_post = nn.Conv1d(hidden_size, out_channels, 1)
377
+
378
+ def forward(self, ssl_enc, ssl_mask, text, text_mask, ge):
379
+ attn_mask = text_mask.unsqueeze(2) * ssl_mask.unsqueeze(-1)
380
+
381
+ ssl_enc = self.c_pre(ssl_enc * ssl_mask)
382
+ text_enc = self.text_pre(text * text_mask)
383
+ x = self.cross_attention(ssl_enc * ssl_mask, text_enc * text_mask, attn_mask) + ssl_enc + ge
384
+ x = self.c_post(x * ssl_mask)
385
+ return x
GPT_SoVITS/module/commons.py ADDED
@@ -0,0 +1,185 @@
1
+ import math
2
+ import torch
3
+ from torch.nn import functional as F
4
+
5
+
6
+ def init_weights(m, mean=0.0, std=0.01):
7
+ classname = m.__class__.__name__
8
+ if classname.find("Conv") != -1:
9
+ m.weight.data.normal_(mean, std)
10
+
11
+
12
+ def get_padding(kernel_size, dilation=1):
13
+ return int((kernel_size * dilation - dilation) / 2)
14
+
15
+
16
+ # def convert_pad_shape(pad_shape):
17
+ # l = pad_shape[::-1]
18
+ # pad_shape = [item for sublist in l for item in sublist]
19
+ # return pad_shape
20
+
21
+
22
+ def intersperse(lst, item):
23
+ result = [item] * (len(lst) * 2 + 1)
24
+ result[1::2] = lst
25
+ return result
26
+
27
+
28
+ def kl_divergence(m_p, logs_p, m_q, logs_q):
29
+ """KL(P||Q)"""
30
+ kl = (logs_q - logs_p) - 0.5
31
+ kl += 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
32
+ return kl
33
+
34
+
35
+ def rand_gumbel(shape):
36
+ """Sample from the Gumbel distribution, protect from overflows."""
37
+ uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
38
+ return -torch.log(-torch.log(uniform_samples))
39
+
40
+
41
+ def rand_gumbel_like(x):
42
+ g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
43
+ return g
44
+
45
+
46
+ def slice_segments(x, ids_str, segment_size=4):
47
+ ret = torch.zeros_like(x[:, :, :segment_size])
48
+ for i in range(x.size(0)):
49
+ idx_str = ids_str[i]
50
+ idx_end = idx_str + segment_size
51
+ ret[i] = x[i, :, idx_str:idx_end]
52
+ return ret
53
+
54
+
55
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
56
+ b, d, t = x.size()
57
+ if x_lengths is None:
58
+ x_lengths = t
59
+ ids_str_max = x_lengths - segment_size + 1
60
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
61
+ ret = slice_segments(x, ids_str, segment_size)
62
+ return ret, ids_str
63
+
64
+
65
+ def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
66
+ position = torch.arange(length, dtype=torch.float)
67
+ num_timescales = channels // 2
68
+ log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (num_timescales - 1)
69
+ inv_timescales = min_timescale * torch.exp(
70
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
71
+ )
72
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
73
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
74
+ signal = F.pad(signal, [0, 0, 0, channels % 2])
75
+ signal = signal.view(1, channels, length)
76
+ return signal
77
+
78
+
79
+ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
80
+ b, channels, length = x.size()
81
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
82
+ return x + signal.to(dtype=x.dtype, device=x.device)
83
+
84
+
85
+ def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
86
+ b, channels, length = x.size()
87
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
88
+ return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
89
+
90
+
91
+ def subsequent_mask(length):
92
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
93
+ return mask
94
+
95
+
96
+ @torch.jit.script
97
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
98
+ n_channels_int = n_channels[0]
99
+ in_act = input_a + input_b
100
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
101
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
102
+ acts = t_act * s_act
103
+ return acts
104
+
105
+
106
+ def convert_pad_shape(pad_shape):
107
+ l = pad_shape[::-1]
108
+ pad_shape = [item for sublist in l for item in sublist]
109
+ return pad_shape
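convert_pad_shape exists because F.pad expects padding amounts listed from the last dimension backwards, while the callers in this repo write them outermost-dimension first. A small worked example (illustrative values):

    import torch
    from torch.nn import functional as F
    from GPT_SoVITS.module.commons import convert_pad_shape

    pad = [[0, 0], [0, 0], [1, 0]]                   # per-dimension [before, after], outermost first
    print(convert_pad_shape(pad))                    # [1, 0, 0, 0, 0, 0]
    x = torch.zeros(2, 3, 4)
    print(F.pad(x, convert_pad_shape(pad)).shape)    # torch.Size([2, 3, 5]): only the last dim grew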
110
+
111
+
112
+ def shift_1d(x):
113
+ x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
114
+ return x
115
+
116
+
117
+ def sequence_mask(length, max_length=None):
118
+ if max_length is None:
119
+ max_length = length.max()
120
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
121
+ return x.unsqueeze(0) < length.unsqueeze(1)
122
+
123
+
124
+ def generate_path(duration, mask):
125
+ """
126
+ duration: [b, 1, t_x]
127
+ mask: [b, 1, t_y, t_x]
128
+ """
129
+ device = duration.device
130
+
131
+ b, _, t_y, t_x = mask.shape
132
+ cum_duration = torch.cumsum(duration, -1)
133
+
134
+ cum_duration_flat = cum_duration.view(b * t_x)
135
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
136
+ path = path.view(b, t_x, t_y)
137
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
138
+ path = path.unsqueeze(1).transpose(2, 3) * mask
139
+ return path
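sequence_mask turns lengths into boolean padding masks, and generate_path turns per-token durations into a hard monotonic alignment between text tokens (t_x) and output frames (t_y). A small worked example with illustrative values:

    import torch
    from GPT_SoVITS.module.commons import sequence_mask, generate_path

    print(sequence_mask(torch.tensor([3, 5])).long())
    # tensor([[1, 1, 1, 0, 0],
    #         [1, 1, 1, 1, 1]])

    duration = torch.tensor([[[2.0, 3.0]]])   # two text tokens lasting 2 and 3 frames
    attn_mask = torch.ones(1, 1, 5, 2)        # [b, 1, t_y, t_x]
    print(generate_path(duration, attn_mask)[0, 0])
    # tensor([[1., 0.],
    #         [1., 0.],
    #         [0., 1.],
    #         [0., 1.],
    #         [0., 1.]])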
140
+
141
+
142
+ def clip_grad_value_(parameters, clip_value, norm_type=2):
143
+ if isinstance(parameters, torch.Tensor):
144
+ parameters = [parameters]
145
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
146
+ norm_type = float(norm_type)
147
+ if clip_value is not None:
148
+ clip_value = float(clip_value)
149
+
150
+ total_norm = 0
151
+ for p in parameters:
152
+ param_norm = p.grad.data.norm(norm_type)
153
+ total_norm += param_norm.item() ** norm_type
154
+ if clip_value is not None:
155
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
156
+ total_norm = total_norm ** (1.0 / norm_type)
157
+ return total_norm
158
+
159
+
160
+ def squeeze(x, x_mask=None, n_sqz=2):
161
+ b, c, t = x.size()
162
+
163
+ t = (t // n_sqz) * n_sqz
164
+ x = x[:, :, :t]
165
+ x_sqz = x.view(b, c, t // n_sqz, n_sqz)
166
+ x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz)
167
+
168
+ if x_mask is not None:
169
+ x_mask = x_mask[:, :, n_sqz - 1 :: n_sqz]
170
+ else:
171
+ x_mask = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype)
172
+ return x_sqz * x_mask, x_mask
173
+
174
+
175
+ def unsqueeze(x, x_mask=None, n_sqz=2):
176
+ b, c, t = x.size()
177
+
178
+ x_unsqz = x.view(b, n_sqz, c // n_sqz, t)
179
+ x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz)
180
+
181
+ if x_mask is not None:
182
+ x_mask = x_mask.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz)
183
+ else:
184
+ x_mask = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype)
185
+ return x_unsqz * x_mask, x_mask
GPT_SoVITS/module/core_vq.py ADDED
@@ -0,0 +1,365 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ #
7
+ # This implementation is inspired from
8
+ # https://github.com/lucidrains/vector-quantize-pytorch
9
+ # which is released under MIT License. Hereafter, the original license:
10
+ # MIT License
11
+ #
12
+ # Copyright (c) 2020 Phil Wang
13
+ #
14
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
15
+ # of this software and associated documentation files (the "Software"), to deal
16
+ # in the Software without restriction, including without limitation the rights
17
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
18
+ # copies of the Software, and to permit persons to whom the Software is
19
+ # furnished to do so, subject to the following conditions:
20
+ #
21
+ # The above copyright notice and this permission notice shall be included in all
22
+ # copies or substantial portions of the Software.
23
+ #
24
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30
+ # SOFTWARE.
31
+
32
+ """Core vector quantization implementation."""
33
+
34
+ import typing as tp
35
+
36
+ from einops import rearrange, repeat
37
+ import torch
38
+ from torch import nn
39
+ import torch.nn.functional as F
40
+ from tqdm import tqdm
41
+
42
+
43
+ def default(val: tp.Any, d: tp.Any) -> tp.Any:
44
+ return val if val is not None else d
45
+
46
+
47
+ def ema_inplace(moving_avg, new, decay: float):
48
+ moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
49
+
50
+
51
+ def laplace_smoothing(x, n_categories: int, epsilon: float = 1e-5):
52
+ return (x + epsilon) / (x.sum() + n_categories * epsilon)
53
+
54
+
55
+ def uniform_init(*shape: int):
56
+ t = torch.empty(shape)
57
+ nn.init.kaiming_uniform_(t)
58
+ return t
59
+
60
+
61
+ def sample_vectors(samples, num: int):
62
+ num_samples, device = samples.shape[0], samples.device
63
+
64
+ if num_samples >= num:
65
+ indices = torch.randperm(num_samples, device=device)[:num]
66
+ else:
67
+ indices = torch.randint(0, num_samples, (num,), device=device)
68
+
69
+ return samples[indices]
70
+
71
+
72
+ def kmeans(samples, num_clusters: int, num_iters: int = 10):
73
+ dim, dtype = samples.shape[-1], samples.dtype
74
+ max_kmeans_samples = 500
75
+ samples = samples[:max_kmeans_samples, :]
76
+ means = sample_vectors(samples, num_clusters)
77
+
78
+ print("kmeans start ... ")
79
+ for _ in tqdm(range(num_iters)):
80
+ diffs = rearrange(samples, "n d -> n () d") - rearrange(means, "c d -> () c d")
81
+ dists = -(diffs**2).sum(dim=-1)
82
+
83
+ buckets = dists.max(dim=-1).indices
84
+ bins = torch.bincount(buckets, minlength=num_clusters)
85
+ zero_mask = bins == 0
86
+ bins_min_clamped = bins.masked_fill(zero_mask, 1)
87
+
88
+ new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype)
89
+ new_means.scatter_add_(0, repeat(buckets, "n -> n d", d=dim), samples)
90
+ new_means = new_means / bins_min_clamped[..., None]
91
+
92
+ means = torch.where(zero_mask[..., None], means, new_means)
93
+
94
+ return means, bins
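A quick smoke test of the k-means helper (a sketch with arbitrary sizes; it prints a short tqdm progress bar, and only the first 500 samples are ever used, per the cap above):

    import torch
    from GPT_SoVITS.module.core_vq import kmeans

    samples = torch.randn(300, 8)
    means, bins = kmeans(samples, num_clusters=16, num_iters=5)
    print(means.shape, bins.shape)   # torch.Size([16, 8]) torch.Size([16])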
95
+
96
+
97
+ class EuclideanCodebook(nn.Module):
98
+ """Codebook with Euclidean distance.
99
+ Args:
100
+ dim (int): Dimension.
101
+ codebook_size (int): Codebook size.
102
+ kmeans_init (bool): Whether to use k-means to initialize the codebooks.
103
+ If set to true, run the k-means algorithm on the first training batch and use
104
+ the learned centroids as initialization.
105
+ kmeans_iters (int): Number of iterations used for k-means algorithm at initialization.
106
+ decay (float): Decay for exponential moving average over the codebooks.
107
+ epsilon (float): Epsilon value for numerical stability.
108
+ threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
109
+ that have an exponential moving average cluster size less than the specified threshold with
110
+ randomly selected vector from the current batch.
111
+ """
112
+
113
+ def __init__(
114
+ self,
115
+ dim: int,
116
+ codebook_size: int,
117
+ kmeans_init: int = False,
118
+ kmeans_iters: int = 10,
119
+ decay: float = 0.99,
120
+ epsilon: float = 1e-5,
121
+ threshold_ema_dead_code: int = 2,
122
+ ):
123
+ super().__init__()
124
+ self.decay = decay
125
+ init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = uniform_init if not kmeans_init else torch.zeros
126
+ embed = init_fn(codebook_size, dim)
127
+
128
+ self.codebook_size = codebook_size
129
+
130
+ self.kmeans_iters = kmeans_iters
131
+ self.epsilon = epsilon
132
+ self.threshold_ema_dead_code = threshold_ema_dead_code
133
+
134
+ self.register_buffer("inited", torch.Tensor([not kmeans_init]))
135
+ self.register_buffer("cluster_size", torch.zeros(codebook_size))
136
+ self.register_buffer("embed", embed)
137
+ self.register_buffer("embed_avg", embed.clone())
138
+
139
+ @torch.jit.ignore
140
+ def init_embed_(self, data):
141
+ if self.inited:
142
+ return
143
+
144
+ embed, cluster_size = kmeans(data, self.codebook_size, self.kmeans_iters)
145
+ self.embed.data.copy_(embed)
146
+ self.embed_avg.data.copy_(embed.clone())
147
+ self.cluster_size.data.copy_(cluster_size)
148
+ self.inited.data.copy_(torch.Tensor([True]))
149
+ # Make sure all buffers across workers are in sync after initialization
150
+ # broadcast_tensors(self.buffers())
151
+
152
+ def replace_(self, samples, mask):
153
+ modified_codebook = torch.where(mask[..., None], sample_vectors(samples, self.codebook_size), self.embed)
154
+ self.embed.data.copy_(modified_codebook)
155
+
156
+ def expire_codes_(self, batch_samples):
157
+ if self.threshold_ema_dead_code == 0:
158
+ return
159
+
160
+ expired_codes = self.cluster_size < self.threshold_ema_dead_code
161
+ if not torch.any(expired_codes):
162
+ return
163
+
164
+ batch_samples = rearrange(batch_samples, "... d -> (...) d")
165
+ self.replace_(batch_samples, mask=expired_codes)
166
+ # broadcast_tensors(self.buffers())
167
+
168
+ def preprocess(self, x):
169
+ x = rearrange(x, "... d -> (...) d")
170
+ return x
171
+
172
+ def quantize(self, x):
173
+ embed = self.embed.t()
174
+ dist = -(x.pow(2).sum(1, keepdim=True) - 2 * x @ embed + embed.pow(2).sum(0, keepdim=True))
175
+ embed_ind = dist.max(dim=-1).indices
176
+ return embed_ind
177
+
178
+ def postprocess_emb(self, embed_ind, shape):
179
+ return embed_ind.view(*shape[:-1])
180
+
181
+ def dequantize(self, embed_ind):
182
+ quantize = F.embedding(embed_ind, self.embed)
183
+ return quantize
184
+
185
+ def encode(self, x):
186
+ shape = x.shape
187
+ # pre-process
188
+ x = self.preprocess(x)
189
+ # quantize
190
+ embed_ind = self.quantize(x)
191
+ # post-process
192
+ embed_ind = self.postprocess_emb(embed_ind, shape)
193
+ return embed_ind
194
+
195
+ def decode(self, embed_ind):
196
+ quantize = self.dequantize(embed_ind)
197
+ return quantize
198
+
199
+ def forward(self, x):
200
+ shape, dtype = x.shape, x.dtype
201
+ x = self.preprocess(x)
202
+
203
+ self.init_embed_(x)
204
+
205
+ embed_ind = self.quantize(x)
206
+ embed_onehot = F.one_hot(embed_ind, self.codebook_size).type(dtype)
207
+ embed_ind = self.postprocess_emb(embed_ind, shape)
208
+ quantize = self.dequantize(embed_ind)
209
+
210
+ if self.training:
211
+ # We do the expiry of code at that point as buffers are in sync
212
+ # and all the workers will take the same decision.
213
+ self.expire_codes_(x)
214
+ ema_inplace(self.cluster_size, embed_onehot.sum(0), self.decay)
215
+ embed_sum = x.t() @ embed_onehot
216
+ ema_inplace(self.embed_avg, embed_sum.t(), self.decay)
217
+ cluster_size = (
218
+ laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon) * self.cluster_size.sum()
219
+ )
220
+ embed_normalized = self.embed_avg / cluster_size.unsqueeze(1)
221
+ self.embed.data.copy_(embed_normalized)
222
+
223
+ return quantize, embed_ind
224
+
225
+
226
+ class VectorQuantization(nn.Module):
227
+ """Vector quantization implementation.
228
+ Currently supports only euclidean distance.
229
+ Args:
230
+ dim (int): Dimension
231
+ codebook_size (int): Codebook size
232
+ codebook_dim (int): Codebook dimension. If not defined, uses the specified dimension in dim.
233
+ decay (float): Decay for exponential moving average over the codebooks.
234
+ epsilon (float): Epsilon value for numerical stability.
235
+ kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
236
+ kmeans_iters (int): Number of iterations used for kmeans initialization.
237
+ threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
238
+ that have an exponential moving average cluster size less than the specified threshold with
239
+ a randomly selected vector from the current batch.
240
+ commitment_weight (float): Weight for commitment loss.
241
+ """
242
+
243
+ def __init__(
244
+ self,
245
+ dim: int,
246
+ codebook_size: int,
247
+ codebook_dim: tp.Optional[int] = None,
248
+ decay: float = 0.99,
249
+ epsilon: float = 1e-5,
250
+ kmeans_init: bool = True,
251
+ kmeans_iters: int = 50,
252
+ threshold_ema_dead_code: int = 2,
253
+ commitment_weight: float = 1.0,
254
+ ):
255
+ super().__init__()
256
+ _codebook_dim: int = default(codebook_dim, dim)
257
+
258
+ requires_projection = _codebook_dim != dim
259
+ self.project_in = nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity()
260
+ self.project_out = nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity()
261
+
262
+ self.epsilon = epsilon
263
+ self.commitment_weight = commitment_weight
264
+
265
+ self._codebook = EuclideanCodebook(
266
+ dim=_codebook_dim,
267
+ codebook_size=codebook_size,
268
+ kmeans_init=kmeans_init,
269
+ kmeans_iters=kmeans_iters,
270
+ decay=decay,
271
+ epsilon=epsilon,
272
+ threshold_ema_dead_code=threshold_ema_dead_code,
273
+ )
274
+ self.codebook_size = codebook_size
275
+
276
+ @property
277
+ def codebook(self):
278
+ return self._codebook.embed
279
+
280
+ def encode(self, x):
281
+ x = rearrange(x, "b d n -> b n d")
282
+ x = self.project_in(x)
283
+ embed_in = self._codebook.encode(x)
284
+ return embed_in
285
+
286
+ def decode(self, embed_ind):
287
+ quantize = self._codebook.decode(embed_ind)
288
+ quantize = self.project_out(quantize)
289
+ quantize = rearrange(quantize, "b n d -> b d n")
290
+ return quantize
291
+
292
+ def forward(self, x):
293
+ device = x.device
294
+ x = rearrange(x, "b d n -> b n d")
295
+ x = self.project_in(x)
296
+
297
+ quantize, embed_ind = self._codebook(x)
298
+
299
+ if self.training:
300
+ quantize = x + (quantize - x).detach()
301
+
302
+ loss = torch.tensor([0.0], device=device, requires_grad=self.training)
303
+
304
+ if self.training:
305
+ if self.commitment_weight > 0:
306
+ commit_loss = F.mse_loss(quantize.detach(), x)
307
+ loss = loss + commit_loss * self.commitment_weight
308
+
309
+ quantize = self.project_out(quantize)
310
+ quantize = rearrange(quantize, "b n d -> b d n")
311
+ return quantize, embed_ind, loss
312
+
313
+
314
+ class ResidualVectorQuantization(nn.Module):
315
+ """Residual vector quantization implementation.
316
+ Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
317
+ """
318
+
319
+ def __init__(self, *, num_quantizers, **kwargs):
320
+ super().__init__()
321
+ self.layers = nn.ModuleList([VectorQuantization(**kwargs) for _ in range(num_quantizers)])
322
+
323
+ def forward(self, x, n_q: tp.Optional[int] = None, layers: tp.Optional[list] = None):
324
+ quantized_out = 0.0
325
+ residual = x
326
+
327
+ all_losses = []
328
+ all_indices = []
329
+ out_quantized = []
330
+
331
+ n_q = n_q or len(self.layers)
332
+
333
+ for i, layer in enumerate(self.layers[:n_q]):
334
+ quantized, indices, loss = layer(residual)
335
+ residual = residual - quantized
336
+ quantized_out = quantized_out + quantized
337
+
338
+ all_indices.append(indices)
339
+ all_losses.append(loss)
340
+ if layers and i in layers:
341
+ out_quantized.append(quantized)
342
+
343
+ out_losses, out_indices = map(torch.stack, (all_losses, all_indices))
344
+ return quantized_out, out_indices, out_losses, out_quantized
345
+
346
+ def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None) -> torch.Tensor:
347
+ residual = x
348
+ all_indices = []
349
+ n_q = n_q or len(self.layers)
350
+ st = st or 0
351
+ for layer in self.layers[st:n_q]:
352
+ indices = layer.encode(residual)
353
+ quantized = layer.decode(indices)
354
+ residual = residual - quantized
355
+ all_indices.append(indices)
356
+ out_indices = torch.stack(all_indices)
357
+ return out_indices
358
+
359
+ def decode(self, q_indices: torch.Tensor, st: int = 0) -> torch.Tensor:
360
+ quantized_out = torch.tensor(0.0, device=q_indices.device)
361
+ for i, indices in enumerate(q_indices):
362
+ layer = self.layers[st + i]
363
+ quantized = layer.decode(indices)
364
+ quantized_out = quantized_out + quantized
365
+ return quantized_out
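The residual VQ stack above can be exercised end to end with dummy tensors. The sketch below is illustrative only: the import path follows this commit's file layout (`GPT_SoVITS/module/core_vq.py`), and every hyperparameter and shape is an assumption rather than a value used by the training code.

```python
# Minimal sketch: quantize, encode and decode a random feature sequence with the RVQ above.
import torch

from GPT_SoVITS.module.core_vq import ResidualVectorQuantization  # assumed import path

rvq = ResidualVectorQuantization(
    num_quantizers=8,   # number of residual codebooks (illustrative)
    dim=512,            # feature dimension (illustrative)
    codebook_size=1024,
    kmeans_init=False,  # uniform init so the example runs without a "first training batch"
)

x = torch.randn(2, 512, 50)              # (batch, dim, time)
quantized, indices, losses, _ = rvq(x)   # indices: (num_quantizers, batch, time)
codes = rvq.encode(x, n_q=4)             # restrict to the first 4 quantizers
recon = rvq.decode(codes)                # (batch, dim, time)
print(quantized.shape, codes.shape, recon.shape)
```

Decoding with fewer codebooks simply sums fewer residual contributions, which is why `encode` and `decode` accept the `n_q` and `st` offsets.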
GPT_SoVITS/module/data_utils.py ADDED
@@ -0,0 +1,1073 @@
 
1
+ import os
2
+ import random
3
+ import traceback
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import torch.utils.data
8
+ from tqdm import tqdm
9
+
10
+ from GPT_SoVITS.text import cleaned_text_to_sequence
11
+ from tools.my_utils import load_audio
12
+
13
+ from .mel_processing import spec_to_mel_torch, spectrogram_torch
14
+
15
+ version = os.environ.get("version", None)
16
+
17
+
18
+ # ZeroDivisionError fixed by Tybost (https://github.com/RVC-Boss/GPT-SoVITS/issues/79)
19
+ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
20
+ """
21
+ 1) loads audio, speaker_id, text pairs
22
+ 2) normalizes text and converts them to sequences of integers
23
+ 3) computes spectrograms from audio files.
24
+ """
25
+
26
+ def __init__(self, hparams, version=None, val=False):
27
+ exp_dir = hparams.exp_dir
28
+ self.path2 = "%s/2-name2text.txt" % exp_dir
29
+ self.path4 = "%s/4-cnhubert" % exp_dir
30
+ self.path5 = "%s/5-wav32k" % exp_dir
31
+ assert os.path.exists(self.path2)
32
+ assert os.path.exists(self.path4)
33
+ assert os.path.exists(self.path5)
34
+ self.is_v2Pro = version in {"v2Pro", "v2ProPlus"}
35
+ if self.is_v2Pro:
36
+ self.path7 = "%s/7-sv_cn" % exp_dir
37
+ assert os.path.exists(self.path7)
38
+ names4 = set([name[:-3] for name in list(os.listdir(self.path4))]) # strip the .pt suffix
39
+ names5 = set(os.listdir(self.path5))
40
+ if self.is_v2Pro:
41
+ names6 = set([name[:-3] for name in list(os.listdir(self.path7))]) # strip the .pt suffix
42
+ self.phoneme_data = {}
43
+ with open(self.path2, "r", encoding="utf8") as f:
44
+ lines = f.read().strip("\n").split("\n")
45
+
46
+ for line in lines:
47
+ tmp = line.split("\t")
48
+ if len(tmp) != 4:
49
+ continue
50
+ self.phoneme_data[tmp[0]] = [tmp[1]]
51
+ if self.is_v2Pro:
52
+ self.audiopaths_sid_text = list(set(self.phoneme_data) & names4 & names5 & names6)
53
+ else:
54
+ self.audiopaths_sid_text = list(set(self.phoneme_data) & names4 & names5)
55
+ tmp = self.audiopaths_sid_text
56
+ leng = len(tmp)
57
+ min_num = 100
58
+ if leng < min_num:
59
+ self.audiopaths_sid_text = []
60
+ for _ in range(max(2, int(min_num / leng))):
61
+ self.audiopaths_sid_text += tmp
62
+ self.max_wav_value = hparams.max_wav_value
63
+ self.sampling_rate = hparams.sampling_rate
64
+ self.filter_length = hparams.filter_length
65
+ self.hop_length = hparams.hop_length
66
+ self.win_length = hparams.win_length
67
+ self.sampling_rate = hparams.sampling_rate
68
+ self.val = val
69
+
70
+ random.seed(1234)
71
+ random.shuffle(self.audiopaths_sid_text)
72
+
73
+ print("phoneme_data_len:", len(self.phoneme_data.keys()))
74
+ print("wav_data_len:", len(self.audiopaths_sid_text))
75
+
76
+ audiopaths_sid_text_new = []
77
+ lengths = []
78
+ skipped_phone = 0
79
+ skipped_dur = 0
80
+ for audiopath in tqdm(self.audiopaths_sid_text):
81
+ try:
82
+ phoneme = self.phoneme_data[audiopath][0]
83
+ phoneme = phoneme.split(" ")
84
+ phoneme_ids = cleaned_text_to_sequence(phoneme, version)
85
+ except Exception:
86
+ print(f"{audiopath} not in self.phoneme_data !")
87
+ skipped_phone += 1
88
+ continue
89
+
90
+ size = os.path.getsize("%s/%s" % (self.path5, audiopath))
91
+ duration = size / self.sampling_rate / 2
92
+
93
+ if duration == 0:
94
+ print(f"Zero duration for {audiopath}, skipping...")
95
+ skipped_dur += 1
96
+ continue
97
+
98
+ if 54 > duration > 0.6 or self.val:
99
+ audiopaths_sid_text_new.append([audiopath, phoneme_ids])
100
+ lengths.append(size // (2 * self.hop_length))
101
+ else:
102
+ skipped_dur += 1
103
+ continue
104
+
105
+ print("skipped_phone: ", skipped_phone, ", skipped_dur: ", skipped_dur)
106
+ print("total left: ", len(audiopaths_sid_text_new))
107
+ assert len(audiopaths_sid_text_new) > 1 # must be enough to fill at least one batch; TODO
108
+ self.audiopaths_sid_text = audiopaths_sid_text_new
109
+ self.lengths = lengths
110
+
111
+ def get_audio_text_speaker_pair(self, audiopath_sid_text):
112
+ audiopath, phoneme_ids = audiopath_sid_text
113
+ text = torch.FloatTensor(phoneme_ids)
114
+ try:
115
+ spec, wav = self.get_audio("%s/%s" % (self.path5, audiopath))
116
+ with torch.no_grad():
117
+ ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
118
+ if ssl.shape[-1] != spec.shape[-1]:
119
+ typee = ssl.dtype
120
+ ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
121
+ ssl.requires_grad = False
122
+ if self.is_v2Pro:
123
+ sv_emb = torch.load("%s/%s.pt" % (self.path7, audiopath), map_location="cpu")
124
+ except Exception:
125
+ traceback.print_exc()
126
+ spec = torch.zeros(1025, 100)
127
+ wav = torch.zeros(1, 100 * self.hop_length)
128
+ ssl = torch.zeros(1, 768, 100)
129
+ text = text[-1:]
130
+ if self.is_v2Pro:
131
+ sv_emb = torch.zeros(1, 20480)
132
+ print("load audio or ssl error!!!!!!", audiopath)
133
+ if self.is_v2Pro:
134
+ return (ssl, spec, wav, text, sv_emb)
135
+ else:
136
+ return (ssl, spec, wav, text)
137
+
138
+ def get_audio(self, filename):
139
+ audio_array = load_audio(filename, self.sampling_rate) # load_audio already normalizes to [-1, 1], no further /32768 needed
140
+ audio = torch.FloatTensor(audio_array) # /32768
141
+ audio_norm = audio
142
+ audio_norm = audio_norm.unsqueeze(0)
143
+ spec = spectrogram_torch(
144
+ audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False
145
+ )
146
+ spec = torch.squeeze(spec, 0)
147
+ return spec, audio_norm
148
+
149
+ def get_sid(self, sid):
150
+ sid = torch.LongTensor([int(sid)])
151
+ return sid
152
+
153
+ def __getitem__(self, index):
154
+ # with torch.no_grad():
155
+ return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index])
156
+
157
+ def __len__(self):
158
+ return len(self.audiopaths_sid_text)
159
+
160
+ def random_slice(self, ssl, wav, mel):
161
+ assert abs(ssl.shape[-1] - wav.shape[-1] // self.hop_length) < 3, ("first", ssl.shape, wav.shape)
162
+
163
+ len_mel = mel.shape[1]
164
+ if self.val:
165
+ reference_mel = mel[:, : len_mel // 3]
166
+ return reference_mel, ssl, wav, mel
167
+ dir = random.randint(0, 1)
168
+ sep_point = random.randint(int(len_mel // 3), int(len_mel // 3 * 2))
169
+
170
+ if dir == 0:
171
+ reference_mel = mel[:, :sep_point]
172
+ ssl = ssl[:, :, sep_point:]
173
+ wav2 = wav[:, sep_point * self.hop_length :]
174
+ mel = mel[:, sep_point:]
175
+ else:
176
+ reference_mel = mel[:, sep_point:]
177
+ ssl = ssl[:, :, :sep_point]
178
+ wav2 = wav[:, : sep_point * self.hop_length]
179
+ mel = mel[:, :sep_point]
180
+
181
+ assert abs(ssl.shape[-1] - wav2.shape[-1] // self.hop_length) < 3, (
182
+ ssl.shape,
183
+ wav.shape,
184
+ wav2.shape,
185
+ mel.shape,
186
+ sep_point,
187
+ self.hop_length,
188
+ sep_point * self.hop_length,
189
+ dir,
190
+ )
191
+ return reference_mel, ssl, wav2, mel
192
+
193
+
194
+ class TextAudioSpeakerCollate:
195
+ """Zero-pads model inputs and targets"""
196
+
197
+ def __init__(self, return_ids=False, version=None):
198
+ self.return_ids = return_ids
199
+ self.is_v2Pro = version in {"v2Pro", "v2ProPlus"}
200
+
201
+ def __call__(self, batch):
202
+ """Collates a training batch of SSL features, spectrograms, waveforms and phoneme sequences
203
+ PARAMS
204
+ ------
205
+ batch: list of (ssl, spec, wav, text) tuples, or (ssl, spec, wav, text, sv_emb) for v2Pro/v2ProPlus
206
+ """
207
+ # Right zero-pad all one-hot text sequences to max input length
208
+ _, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True)
209
+
210
+ max_ssl_len = max([x[0].size(2) for x in batch])
211
+ max_ssl_len = int(2 * ((max_ssl_len // 2) + 1))
212
+ max_spec_len = max([x[1].size(1) for x in batch])
213
+ max_spec_len = int(2 * ((max_spec_len // 2) + 1))
214
+ max_wav_len = max([x[2].size(1) for x in batch])
215
+ max_text_len = max([x[3].size(0) for x in batch])
216
+
217
+ ssl_lengths = torch.LongTensor(len(batch))
218
+ spec_lengths = torch.LongTensor(len(batch))
219
+ wav_lengths = torch.LongTensor(len(batch))
220
+ text_lengths = torch.LongTensor(len(batch))
221
+
222
+ spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
223
+ wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
224
+ ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len)
225
+ text_padded = torch.LongTensor(len(batch), max_text_len)
226
+
227
+ spec_padded.zero_()
228
+ wav_padded.zero_()
229
+ ssl_padded.zero_()
230
+ text_padded.zero_()
231
+
232
+ if self.is_v2Pro:
233
+ sv_embs = torch.FloatTensor(len(batch), 20480)
234
+
235
+ for i in range(len(ids_sorted_decreasing)):
236
+ row = batch[ids_sorted_decreasing[i]]
237
+
238
+ ssl = row[0]
239
+ ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :]
240
+ ssl_lengths[i] = ssl.size(2)
241
+
242
+ spec = row[1]
243
+ spec_padded[i, :, : spec.size(1)] = spec
244
+ spec_lengths[i] = spec.size(1)
245
+
246
+ wav = row[2]
247
+ wav_padded[i, :, : wav.size(1)] = wav
248
+ wav_lengths[i] = wav.size(1)
249
+
250
+ text = row[3]
251
+ text_padded[i, : text.size(0)] = text
252
+ text_lengths[i] = text.size(0)
253
+
254
+ if self.is_v2Pro:
255
+ sv_embs[i] = row[4]
256
+ if self.is_v2Pro:
257
+ return (
258
+ ssl_padded,
259
+ ssl_lengths,
260
+ spec_padded,
261
+ spec_lengths,
262
+ wav_padded,
263
+ wav_lengths,
264
+ text_padded,
265
+ text_lengths,
266
+ sv_embs,
267
+ )
268
+ else:
269
+ return (
270
+ ssl_padded,
271
+ ssl_lengths,
272
+ spec_padded,
273
+ spec_lengths,
274
+ wav_padded,
275
+ wav_lengths,
276
+ text_padded,
277
+ text_lengths,
278
+ )
279
+
280
+
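Before the V3/V4 variants below, here is a hedged sketch of how `TextAudioSpeakerCollate` pads a batch. It needs no data on disk: the fake items mimic the shapes produced by the loader above (768-dim SSL features, a 1025-bin spectrogram, waveform samples, integer phoneme ids); every number is an illustrative assumption.

```python
# Minimal sketch: run TextAudioSpeakerCollate on a hand-built batch and inspect padded shapes.
import torch

from GPT_SoVITS.module.data_utils import TextAudioSpeakerCollate  # assumed import path

def fake_item(n_frames: int, n_phones: int, hop: int = 640):
    ssl = torch.randn(1, 768, n_frames)    # (1, ssl_dim, T)
    spec = torch.randn(1025, n_frames)     # (n_fft // 2 + 1, T)
    wav = torch.randn(1, n_frames * hop)   # (1, samples)
    text = torch.arange(n_phones)          # phoneme ids
    return ssl, spec, wav, text

collate = TextAudioSpeakerCollate(version=None)  # non-v2Pro path, so no sv_emb column
ssl_p, ssl_len, spec_p, spec_len, wav_p, wav_len, text_p, text_len = collate(
    [fake_item(80, 12), fake_item(50, 7)]
)
print(ssl_p.shape, spec_p.shape, wav_p.shape, text_p.shape)  # padded to the longest item
```

Items are sorted by spectrogram length before padding, and the SSL/spectrogram axes are rounded up to an even frame count, matching the `max_*_len` arithmetic above.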
281
+ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
282
+ """
283
+ 1) loads audio, speaker_id, text pairs
284
+ 2) normalizes text and converts them to sequences of integers
285
+ 3) computes spectrograms from audio files.
286
+ """
287
+
288
+ def __init__(self, hparams, val=False):
289
+ exp_dir = hparams.exp_dir
290
+ self.path2 = "%s/2-name2text.txt" % exp_dir
291
+ self.path4 = "%s/4-cnhubert" % exp_dir
292
+ self.path5 = "%s/5-wav32k" % exp_dir
293
+ assert os.path.exists(self.path2)
294
+ assert os.path.exists(self.path4)
295
+ assert os.path.exists(self.path5)
296
+ names4 = set([name[:-3] for name in list(os.listdir(self.path4))]) # strip the .pt suffix
297
+ names5 = set(os.listdir(self.path5))
298
+ self.phoneme_data = {}
299
+ with open(self.path2, "r", encoding="utf8") as f:
300
+ lines = f.read().strip("\n").split("\n")
301
+
302
+ for line in lines:
303
+ tmp = line.split("\t")
304
+ if len(tmp) != 4:
305
+ continue
306
+ self.phoneme_data[tmp[0]] = [tmp[1]]
307
+
308
+ self.audiopaths_sid_text = list(set(self.phoneme_data) & names4 & names5)
309
+ tmp = self.audiopaths_sid_text
310
+ leng = len(tmp)
311
+ min_num = 100
312
+ if leng < min_num:
313
+ self.audiopaths_sid_text = []
314
+ for _ in range(max(2, int(min_num / leng))):
315
+ self.audiopaths_sid_text += tmp
316
+ self.max_wav_value = hparams.max_wav_value
317
+ self.sampling_rate = hparams.sampling_rate
318
+ self.filter_length = hparams.filter_length
319
+ self.hop_length = hparams.hop_length
320
+ self.win_length = hparams.win_length
321
+ self.sampling_rate = hparams.sampling_rate
322
+ self.val = val
323
+
324
+ random.seed(1234)
325
+ random.shuffle(self.audiopaths_sid_text)
326
+
327
+ print("phoneme_data_len:", len(self.phoneme_data.keys()))
328
+ print("wav_data_len:", len(self.audiopaths_sid_text))
329
+
330
+ audiopaths_sid_text_new = []
331
+ lengths = []
332
+ skipped_phone = 0
333
+ skipped_dur = 0
334
+ for audiopath in tqdm(self.audiopaths_sid_text):
335
+ try:
336
+ phoneme = self.phoneme_data[audiopath][0]
337
+ phoneme = phoneme.split(" ")
338
+ phoneme_ids = cleaned_text_to_sequence(phoneme, version)
339
+ except Exception:
340
+ print(f"{audiopath} not in self.phoneme_data !")
341
+ skipped_phone += 1
342
+ continue
343
+
344
+ size = os.path.getsize("%s/%s" % (self.path5, audiopath))
345
+ duration = size / self.sampling_rate / 2
346
+
347
+ if duration == 0:
348
+ print(f"Zero duration for {audiopath}, skipping...")
349
+ skipped_dur += 1
350
+ continue
351
+
352
+ if 54 > duration > 0.6 or self.val:
353
+ audiopaths_sid_text_new.append([audiopath, phoneme_ids])
354
+ lengths.append(size // (2 * self.hop_length))
355
+ else:
356
+ skipped_dur += 1
357
+ continue
358
+
359
+ print("skipped_phone: ", skipped_phone, ", skipped_dur: ", skipped_dur)
360
+ print("total left: ", len(audiopaths_sid_text_new))
361
+ assert len(audiopaths_sid_text_new) > 1 # must be enough to fill at least one batch; TODO
362
+ self.audiopaths_sid_text = audiopaths_sid_text_new
363
+ self.lengths = lengths
364
+ self.spec_min = -12
365
+ self.spec_max = 2
366
+
367
+ self.filter_length_mel = self.win_length_mel = 1024
368
+ self.hop_length_mel = 256
369
+ self.n_mel_channels = 100
370
+ self.sampling_rate_mel = 24000
371
+ self.mel_fmin = 0
372
+ self.mel_fmax = None
373
+
374
+ def norm_spec(self, x):
375
+ return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1
376
+
377
+ def get_audio_text_speaker_pair(self, audiopath_sid_text):
378
+ audiopath, phoneme_ids = audiopath_sid_text
379
+ text = torch.FloatTensor(phoneme_ids)
380
+ try:
381
+ spec, mel = self.get_audio("%s/%s" % (self.path5, audiopath))
382
+ with torch.no_grad():
383
+ ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
384
+ if ssl.shape[-1] != spec.shape[-1]:
385
+ typee = ssl.dtype
386
+ ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
387
+ ssl.requires_grad = False
388
+ except Exception:
389
+ traceback.print_exc()
390
+ mel = torch.zeros(100, 180)
391
+ # wav = torch.zeros(1, 96 * self.hop_length)
392
+ spec = torch.zeros(1025, 96)
393
+ ssl = torch.zeros(1, 768, 96)
394
+ text = text[-1:]
395
+ print("load audio or ssl error!!!!!!", audiopath)
396
+ return (ssl, spec, mel, text)
397
+
398
+ def get_audio(self, filename):
399
+ audio_array = load_audio(filename, self.sampling_rate) # load_audio already normalizes to [-1, 1], no further /32768 needed
400
+ audio = torch.FloatTensor(audio_array) # /32768
401
+ audio_norm = audio
402
+ audio_norm = audio_norm.unsqueeze(0)
403
+ audio_array24 = load_audio(
404
+ filename, 24000
405
+ ) # load_audio already normalizes to [-1, 1], no further /32768 needed; resampling here could be accelerated on GPU
406
+ audio24 = torch.FloatTensor(audio_array24) # /32768
407
+ audio_norm24 = audio24
408
+ audio_norm24 = audio_norm24.unsqueeze(0)
409
+
410
+ spec = spectrogram_torch(
411
+ audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False
412
+ )
413
+ spec = torch.squeeze(spec, 0)
414
+
415
+ spec1 = spectrogram_torch(
416
+ audio_norm24,
417
+ self.filter_length_mel,
418
+ self.sampling_rate_mel,
419
+ self.hop_length_mel,
420
+ self.win_length_mel,
421
+ center=False,
422
+ )
423
+ mel = spec_to_mel_torch(
424
+ spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax
425
+ )
426
+ mel = torch.squeeze(mel, 0)
427
+ mel = self.norm_spec(mel)
428
+ # print(1111111,spec.shape,mel.shape)
429
+ return spec, mel
430
+
431
+ def get_sid(self, sid):
432
+ sid = torch.LongTensor([int(sid)])
433
+ return sid
434
+
435
+ def __getitem__(self, index):
436
+ # with torch.no_grad():
437
+ return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index])
438
+
439
+ def __len__(self):
440
+ return len(self.audiopaths_sid_text)
441
+
442
+
443
+ class TextAudioSpeakerCollateV3:
444
+ """Zero-pads model inputs and targets"""
445
+
446
+ def __init__(self, return_ids=False):
447
+ self.return_ids = return_ids
448
+
449
+ def __call__(self, batch):
450
+ """Collates a training batch of SSL features, spectrograms, mel-spectrograms and phoneme sequences
451
+ PARAMS
452
+ ------
453
+ batch: list of (ssl, spec, mel, text) tuples
454
+ """
455
+ # ssl, spec, wav,mel, text
456
+ # Right zero-pad all one-hot text sequences to max input length
457
+ _, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True)
458
+ # (ssl, spec,mel, text)
459
+ max_ssl_len = max([x[0].size(2) for x in batch])
460
+
461
+ max_ssl_len1 = int(8 * ((max_ssl_len // 8) + 1))
462
+ max_ssl_len = int(2 * ((max_ssl_len // 2) + 1))
463
+
464
+ # max_ssl_len = int(8 * ((max_ssl_len // 8) + 1))
465
+ # max_ssl_len1=max_ssl_len
466
+
467
+ max_spec_len = max([x[1].size(1) for x in batch])
468
+ max_spec_len = int(2 * ((max_spec_len // 2) + 1))
469
+ # max_wav_len = max([x[2].size(1) for x in batch])
470
+
471
+ max_text_len = max([x[3].size(0) for x in batch])
472
+ max_mel_len = int(max_ssl_len1 * 1.25 * 1.5) ###24000/256,32000/640=16000/320
473
+
474
+ ssl_lengths = torch.LongTensor(len(batch))
475
+ spec_lengths = torch.LongTensor(len(batch))
476
+ text_lengths = torch.LongTensor(len(batch))
477
+ # wav_lengths = torch.LongTensor(len(batch))
478
+ mel_lengths = torch.LongTensor(len(batch))
479
+
480
+ spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
481
+ mel_padded = torch.FloatTensor(len(batch), batch[0][2].size(0), max_mel_len)
482
+ ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len)
483
+ text_padded = torch.LongTensor(len(batch), max_text_len)
484
+ # wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
485
+
486
+ spec_padded.zero_()
487
+ mel_padded.zero_()
488
+ ssl_padded.zero_()
489
+ text_padded.zero_()
490
+ # wav_padded.zero_()
491
+
492
+ for i in range(len(ids_sorted_decreasing)):
493
+ row = batch[ids_sorted_decreasing[i]]
494
+ # ssl, spec, wav,mel, text
495
+ ssl = row[0]
496
+ ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :]
497
+ ssl_lengths[i] = ssl.size(2)
498
+
499
+ spec = row[1]
500
+ spec_padded[i, :, : spec.size(1)] = spec
501
+ spec_lengths[i] = spec.size(1)
502
+
503
+ # wav = row[2]
504
+ # wav_padded[i, :, :wav.size(1)] = wav
505
+ # wav_lengths[i] = wav.size(1)
506
+
507
+ mel = row[2]
508
+ mel_padded[i, :, : mel.size(1)] = mel
509
+ mel_lengths[i] = mel.size(1)
510
+
511
+ text = row[3]
512
+ text_padded[i, : text.size(0)] = text
513
+ text_lengths[i] = text.size(0)
514
+
515
+ # return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, wav_padded, wav_lengths,mel_lengths
516
+ return ssl_padded, spec_padded, mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, mel_lengths
517
+
518
+
519
+ class TextAudioSpeakerLoaderV4(torch.utils.data.Dataset):
520
+ """
521
+ 1) loads audio, speaker_id, text pairs
522
+ 2) normalizes text and converts them to sequences of integers
523
+ 3) computes spectrograms from audio files.
524
+ """
525
+
526
+ def __init__(self, hparams, val=False):
527
+ exp_dir = hparams.exp_dir
528
+ self.path2 = "%s/2-name2text.txt" % exp_dir
529
+ self.path4 = "%s/4-cnhubert" % exp_dir
530
+ self.path5 = "%s/5-wav32k" % exp_dir
531
+ assert os.path.exists(self.path2)
532
+ assert os.path.exists(self.path4)
533
+ assert os.path.exists(self.path5)
534
+ names4 = set([name[:-3] for name in list(os.listdir(self.path4))]) # strip the .pt suffix
535
+ names5 = set(os.listdir(self.path5))
536
+ self.phoneme_data = {}
537
+ with open(self.path2, "r", encoding="utf8") as f:
538
+ lines = f.read().strip("\n").split("\n")
539
+
540
+ for line in lines:
541
+ tmp = line.split("\t")
542
+ if len(tmp) != 4:
543
+ continue
544
+ self.phoneme_data[tmp[0]] = [tmp[1]]
545
+
546
+ self.audiopaths_sid_text = list(set(self.phoneme_data) & names4 & names5)
547
+ tmp = self.audiopaths_sid_text
548
+ leng = len(tmp)
549
+ min_num = 100
550
+ if leng < min_num:
551
+ self.audiopaths_sid_text = []
552
+ for _ in range(max(2, int(min_num / leng))):
553
+ self.audiopaths_sid_text += tmp
554
+ self.max_wav_value = hparams.max_wav_value
555
+ self.sampling_rate = hparams.sampling_rate
556
+ self.filter_length = hparams.filter_length
557
+ self.hop_length = hparams.hop_length
558
+ self.win_length = hparams.win_length
559
+ self.sampling_rate = hparams.sampling_rate
560
+ self.val = val
561
+
562
+ random.seed(1234)
563
+ random.shuffle(self.audiopaths_sid_text)
564
+
565
+ print("phoneme_data_len:", len(self.phoneme_data.keys()))
566
+ print("wav_data_len:", len(self.audiopaths_sid_text))
567
+
568
+ audiopaths_sid_text_new = []
569
+ lengths = []
570
+ skipped_phone = 0
571
+ skipped_dur = 0
572
+ for audiopath in tqdm(self.audiopaths_sid_text):
573
+ try:
574
+ phoneme = self.phoneme_data[audiopath][0]
575
+ phoneme = phoneme.split(" ")
576
+ phoneme_ids = cleaned_text_to_sequence(phoneme, version)
577
+ except Exception:
578
+ print(f"{audiopath} not in self.phoneme_data !")
579
+ skipped_phone += 1
580
+ continue
581
+
582
+ size = os.path.getsize("%s/%s" % (self.path5, audiopath))
583
+ duration = size / self.sampling_rate / 2
584
+
585
+ if duration == 0:
586
+ print(f"Zero duration for {audiopath}, skipping...")
587
+ skipped_dur += 1
588
+ continue
589
+
590
+ if 54 > duration > 0.6 or self.val:
591
+ audiopaths_sid_text_new.append([audiopath, phoneme_ids])
592
+ lengths.append(size // (2 * self.hop_length))
593
+ else:
594
+ skipped_dur += 1
595
+ continue
596
+
597
+ print("skipped_phone: ", skipped_phone, ", skipped_dur: ", skipped_dur)
598
+ print("total left: ", len(audiopaths_sid_text_new))
599
+ assert len(audiopaths_sid_text_new) > 1 # must be enough to fill at least one batch; TODO
600
+ self.audiopaths_sid_text = audiopaths_sid_text_new
601
+ self.lengths = lengths
602
+ self.spec_min = -12
603
+ self.spec_max = 2
604
+
605
+ self.filter_length_mel = self.win_length_mel = 1280
606
+ self.hop_length_mel = 320
607
+ self.n_mel_channels = 100
608
+ self.sampling_rate_mel = 32000
609
+ self.mel_fmin = 0
610
+ self.mel_fmax = None
611
+
612
+ def norm_spec(self, x):
613
+ return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1
614
+
615
+ def get_audio_text_speaker_pair(self, audiopath_sid_text):
616
+ audiopath, phoneme_ids = audiopath_sid_text
617
+ text = torch.FloatTensor(phoneme_ids)
618
+ try:
619
+ spec, mel = self.get_audio("%s/%s" % (self.path5, audiopath))
620
+ with torch.no_grad():
621
+ ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
622
+ if ssl.shape[-1] != spec.shape[-1]:
623
+ typee = ssl.dtype
624
+ ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
625
+ ssl.requires_grad = False
626
+ except Exception:
627
+ traceback.print_exc()
628
+ mel = torch.zeros(100, 192)
629
+ # wav = torch.zeros(1, 96 * self.hop_length)
630
+ spec = torch.zeros(1025, 96)
631
+ ssl = torch.zeros(1, 768, 96)
632
+ text = text[-1:]
633
+ print("load audio or ssl error!!!!!!", audiopath)
634
+ return (ssl, spec, mel, text)
635
+
636
+ def get_audio(self, filename):
637
+ audio_array = load_audio(filename, self.sampling_rate) # load_audio already normalizes to [-1, 1], no further /32768 needed
638
+ audio = torch.FloatTensor(audio_array) # /32768
639
+ audio_norm = audio
640
+ audio_norm = audio_norm.unsqueeze(0)
641
+ spec = spectrogram_torch(
642
+ audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False
643
+ )
644
+ spec = torch.squeeze(spec, 0)
645
+ spec1 = spectrogram_torch(audio_norm, 1280, 32000, 320, 1280, center=False)
646
+ mel = spec_to_mel_torch(spec1, 1280, 100, 32000, 0, None)
647
+ mel = self.norm_spec(torch.squeeze(mel, 0))
648
+ return spec, mel
649
+
650
+ def get_sid(self, sid):
651
+ sid = torch.LongTensor([int(sid)])
652
+ return sid
653
+
654
+ def __getitem__(self, index):
655
+ # with torch.no_grad():
656
+ return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index])
657
+
658
+ def __len__(self):
659
+ return len(self.audiopaths_sid_text)
660
+
661
+
662
+ class TextAudioSpeakerCollateV4:
663
+ """Zero-pads model inputs and targets"""
664
+
665
+ def __init__(self, return_ids=False):
666
+ self.return_ids = return_ids
667
+
668
+ def __call__(self, batch):
669
+ """Collates a training batch of SSL features, spectrograms, mel-spectrograms and phoneme sequences
670
+ PARAMS
671
+ ------
672
+ batch: list of (ssl, spec, mel, text) tuples
673
+ """
674
+ # ssl, spec, wav,mel, text
675
+ # Right zero-pad all one-hot text sequences to max input length
676
+ _, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True)
677
+ # (ssl, spec,mel, text)
678
+ max_ssl_len = max([x[0].size(2) for x in batch])
679
+ max_ssl_len = int(2 * ((max_ssl_len // 2) + 1))
680
+ max_spec_len = max([x[1].size(1) for x in batch])
681
+ max_spec_len = int(2 * ((max_spec_len // 2) + 1))
682
+ # max_wav_len = max([x[2].size(1) for x in batch])
683
+ max_text_len = max([x[3].size(0) for x in batch])
684
+
685
+ ssl_lengths = torch.LongTensor(len(batch))
686
+ spec_lengths = torch.LongTensor(len(batch))
687
+ text_lengths = torch.LongTensor(len(batch))
688
+ # wav_lengths = torch.LongTensor(len(batch))
689
+ mel_lengths = torch.LongTensor(len(batch))
690
+
691
+ spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
692
+ mel_padded = torch.FloatTensor(len(batch), batch[0][2].size(0), max_spec_len * 2)
693
+ ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len)
694
+ text_padded = torch.LongTensor(len(batch), max_text_len)
695
+ # wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
696
+
697
+ spec_padded.zero_()
698
+ mel_padded.zero_()
699
+ ssl_padded.zero_()
700
+ text_padded.zero_()
701
+ # wav_padded.zero_()
702
+
703
+ for i in range(len(ids_sorted_decreasing)):
704
+ row = batch[ids_sorted_decreasing[i]]
705
+ # ssl, spec, wav,mel, text
706
+ ssl = row[0]
707
+ ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :]
708
+ ssl_lengths[i] = ssl.size(2)
709
+
710
+ spec = row[1]
711
+ spec_padded[i, :, : spec.size(1)] = spec
712
+ spec_lengths[i] = spec.size(1)
713
+
714
+ # wav = row[2]
715
+ # wav_padded[i, :, :wav.size(1)] = wav
716
+ # wav_lengths[i] = wav.size(1)
717
+
718
+ mel = row[2]
719
+ mel_padded[i, :, : mel.size(1)] = mel
720
+ mel_lengths[i] = mel.size(1)
721
+
722
+ text = row[3]
723
+ text_padded[i, : text.size(0)] = text
724
+ text_lengths[i] = text.size(0)
725
+
726
+ # return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, wav_padded, wav_lengths,mel_lengths
727
+ return ssl_padded, spec_padded, mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, mel_lengths
728
+
729
+
730
+ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
731
+ """
732
+ 1) loads audio, speaker_id, text pairs
733
+ 2) normalizes text and converts them to sequences of integers
734
+ 3) computes spectrograms from audio files.
735
+ """
736
+
737
+ def __init__(self, hparams, val=False):
738
+ exp_dir = hparams.exp_dir
739
+ self.path2 = "%s/2-name2text.txt" % exp_dir
740
+ self.path4 = "%s/4-cnhubert" % exp_dir
741
+ self.path5 = "%s/5-wav32k" % exp_dir
742
+ assert os.path.exists(self.path2)
743
+ assert os.path.exists(self.path4)
744
+ assert os.path.exists(self.path5)
745
+ names4 = set([name[:-3] for name in list(os.listdir(self.path4))]) # strip the .pt suffix
746
+ names5 = set(os.listdir(self.path5))
747
+ self.phoneme_data = {}
748
+ with open(self.path2, "r", encoding="utf8") as f:
749
+ lines = f.read().strip("\n").split("\n")
750
+
751
+ for line in lines:
752
+ tmp = line.split("\t")
753
+ if len(tmp) != 4:
754
+ continue
755
+ self.phoneme_data[tmp[0]] = [tmp[1]]
756
+
757
+ self.audiopaths_sid_text = list(set(self.phoneme_data) & names4 & names5)
758
+ tmp = self.audiopaths_sid_text
759
+ leng = len(tmp)
760
+ min_num = 100
761
+ if leng < min_num:
762
+ self.audiopaths_sid_text = []
763
+ for _ in range(max(2, int(min_num / leng))):
764
+ self.audiopaths_sid_text += tmp
765
+ self.max_wav_value = hparams.max_wav_value
766
+ self.sampling_rate = hparams.sampling_rate
767
+ self.filter_length = hparams.filter_length
768
+ self.hop_length = hparams.hop_length
769
+ self.win_length = hparams.win_length
770
+ self.sampling_rate = hparams.sampling_rate
771
+ self.val = val
772
+
773
+ random.seed(1234)
774
+ random.shuffle(self.audiopaths_sid_text)
775
+
776
+ print("phoneme_data_len:", len(self.phoneme_data.keys()))
777
+ print("wav_data_len:", len(self.audiopaths_sid_text))
778
+
779
+ audiopaths_sid_text_new = []
780
+ lengths = []
781
+ skipped_phone = 0
782
+ skipped_dur = 0
783
+ for audiopath in tqdm(self.audiopaths_sid_text):
784
+ try:
785
+ phoneme = self.phoneme_data[audiopath][0]
786
+ phoneme = phoneme.split(" ")
787
+ phoneme_ids = cleaned_text_to_sequence(phoneme, version)
788
+ except Exception:
789
+ print(f"{audiopath} not in self.phoneme_data !")
790
+ skipped_phone += 1
791
+ continue
792
+
793
+ size = os.path.getsize("%s/%s" % (self.path5, audiopath))
794
+ duration = size / self.sampling_rate / 2
795
+
796
+ if duration == 0:
797
+ print(f"Zero duration for {audiopath}, skipping...")
798
+ skipped_dur += 1
799
+ continue
800
+
801
+ if 54 > duration > 0.6 or self.val:
802
+ audiopaths_sid_text_new.append([audiopath, phoneme_ids])
803
+ lengths.append(size // (2 * self.hop_length))
804
+ else:
805
+ skipped_dur += 1
806
+ continue
807
+
808
+ print("skipped_phone: ", skipped_phone, ", skipped_dur: ", skipped_dur)
809
+ print("total left: ", len(audiopaths_sid_text_new))
810
+ assert len(audiopaths_sid_text_new) > 1 # must be enough to fill at least one batch; TODO
811
+ self.audiopaths_sid_text = audiopaths_sid_text_new
812
+ self.lengths = lengths
813
+ self.spec_min = -12
814
+ self.spec_max = 2
815
+
816
+ self.filter_length_mel = self.win_length_mel = 1024
817
+ self.hop_length_mel = 256
818
+ self.n_mel_channels = 100
819
+ self.sampling_rate_mel = 24000
820
+ self.mel_fmin = 0
821
+ self.mel_fmax = None
822
+
823
+ def norm_spec(self, x):
824
+ return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1
825
+
826
+ def get_audio_text_speaker_pair(self, audiopath_sid_text):
827
+ audiopath, phoneme_ids = audiopath_sid_text
828
+ text = torch.FloatTensor(phoneme_ids)
829
+ try:
830
+ spec, mel, wav = self.get_audio("%s/%s" % (self.path5, audiopath))
831
+ with torch.no_grad():
832
+ ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
833
+ if ssl.shape[-1] != spec.shape[-1]:
834
+ typee = ssl.dtype
835
+ ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
836
+ ssl.requires_grad = False
837
+ except Exception:
838
+ traceback.print_exc()
839
+ mel = torch.zeros(100, 180)
840
+ wav = torch.zeros(1, 96 * self.hop_length)
841
+ spec = torch.zeros(1025, 96)
842
+ ssl = torch.zeros(1, 768, 96)
843
+ text = text[-1:]
844
+ print("load audio or ssl error!!!!!!", audiopath)
845
+ return (ssl, spec, wav, mel, text)
846
+
847
+ def get_audio(self, filename):
848
+ audio_array = load_audio(filename, self.sampling_rate) # load_audio already normalizes to [-1, 1], no further /32768 needed
849
+ audio = torch.FloatTensor(audio_array) # /32768
850
+ audio_norm = audio
851
+ audio_norm = audio_norm.unsqueeze(0)
852
+ audio_array24 = load_audio(
853
+ filename, 24000
854
+ ) # load_audio already normalizes to [-1, 1], no further /32768 needed; resampling here could be accelerated on GPU
855
+ audio24 = torch.FloatTensor(audio_array24) # /32768
856
+ audio_norm24 = audio24
857
+ audio_norm24 = audio_norm24.unsqueeze(0)
858
+
859
+ spec = spectrogram_torch(
860
+ audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False
861
+ )
862
+ spec = torch.squeeze(spec, 0)
863
+
864
+ spec1 = spectrogram_torch(
865
+ audio_norm24,
866
+ self.filter_length_mel,
867
+ self.sampling_rate_mel,
868
+ self.hop_length_mel,
869
+ self.win_length_mel,
870
+ center=False,
871
+ )
872
+ mel = spec_to_mel_torch(
873
+ spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax
874
+ )
875
+ mel = torch.squeeze(mel, 0)
876
+ mel = self.norm_spec(mel)
877
+ # print(1111111,spec.shape,mel.shape)
878
+ return spec, mel, audio_norm
879
+
880
+ def get_sid(self, sid):
881
+ sid = torch.LongTensor([int(sid)])
882
+ return sid
883
+
884
+ def __getitem__(self, index):
885
+ # with torch.no_grad():
886
+ return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index])
887
+
888
+ def __len__(self):
889
+ return len(self.audiopaths_sid_text)
890
+
891
+
892
+ class TextAudioSpeakerCollateV3b:
893
+ """Zero-pads model inputs and targets"""
894
+
895
+ def __init__(self, return_ids=False):
896
+ self.return_ids = return_ids
897
+
898
+ def __call__(self, batch):
899
+ """Collates a training batch of SSL features, spectrograms, waveforms, mel-spectrograms and phoneme sequences
900
+ PARAMS
901
+ ------
902
+ batch: list of (ssl, spec, wav, mel, text) tuples
903
+ """
904
+ # ssl, spec, wav,mel, text
905
+ # Right zero-pad all one-hot text sequences to max input length
906
+ _, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True)
907
+ # (ssl, spec,mel, text)
908
+ max_ssl_len = max([x[0].size(2) for x in batch])
909
+
910
+ max_ssl_len1 = int(8 * ((max_ssl_len // 8) + 1))
911
+ max_ssl_len = int(2 * ((max_ssl_len // 2) + 1))
912
+
913
+ # max_ssl_len = int(8 * ((max_ssl_len // 8) + 1))
914
+ # max_ssl_len1=max_ssl_len
915
+
916
+ max_spec_len = max([x[1].size(1) for x in batch])
917
+ max_spec_len = int(2 * ((max_spec_len // 2) + 1))
918
+ max_wav_len = max([x[2].size(1) for x in batch])
919
+ max_text_len = max([x[4].size(0) for x in batch])
920
+ max_mel_len = int(max_ssl_len1 * 1.25 * 1.5) ###24000/256,32000/640=16000/320
921
+
922
+ ssl_lengths = torch.LongTensor(len(batch))
923
+ spec_lengths = torch.LongTensor(len(batch))
924
+ text_lengths = torch.LongTensor(len(batch))
925
+ wav_lengths = torch.LongTensor(len(batch))
926
+ mel_lengths = torch.LongTensor(len(batch))
927
+
928
+ spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
929
+ mel_padded = torch.FloatTensor(len(batch), batch[0][3].size(0), max_mel_len)
930
+ ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len)
931
+ text_padded = torch.LongTensor(len(batch), max_text_len)
932
+ wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
933
+
934
+ spec_padded.zero_()
935
+ mel_padded.zero_()
936
+ ssl_padded.zero_()
937
+ text_padded.zero_()
938
+ wav_padded.zero_()
939
+
940
+ for i in range(len(ids_sorted_decreasing)):
941
+ row = batch[ids_sorted_decreasing[i]]
942
+ # ssl, spec, wav,mel, text
943
+ ssl = row[0]
944
+ ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :]
945
+ ssl_lengths[i] = ssl.size(2)
946
+
947
+ spec = row[1]
948
+ spec_padded[i, :, : spec.size(1)] = spec
949
+ spec_lengths[i] = spec.size(1)
950
+
951
+ wav = row[2]
952
+ wav_padded[i, :, : wav.size(1)] = wav
953
+ wav_lengths[i] = wav.size(1)
954
+
955
+ mel = row[3]
956
+ mel_padded[i, :, : mel.size(1)] = mel
957
+ mel_lengths[i] = mel.size(1)
958
+
959
+ text = row[4]
960
+ text_padded[i, : text.size(0)] = text
961
+ text_lengths[i] = text.size(0)
962
+
963
+ return (
964
+ ssl_padded,
965
+ spec_padded,
966
+ mel_padded,
967
+ ssl_lengths,
968
+ spec_lengths,
969
+ text_padded,
970
+ text_lengths,
971
+ wav_padded,
972
+ wav_lengths,
973
+ mel_lengths,
974
+ )
975
+ # return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths,mel_lengths
976
+
977
+
978
+ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
979
+ """
980
+ Maintain similar input lengths in a batch.
981
+ Length groups are specified by boundaries.
982
+ Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}.
983
+
984
+ It removes samples which are not included in the boundaries.
985
+ Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
986
+ """
987
+
988
+ def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True):
989
+ super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
990
+ self.lengths = dataset.lengths
991
+ self.batch_size = batch_size
992
+ self.boundaries = boundaries
993
+
994
+ self.buckets, self.num_samples_per_bucket = self._create_buckets()
995
+ self.total_size = sum(self.num_samples_per_bucket)
996
+ self.num_samples = self.total_size // self.num_replicas
997
+
998
+ def _create_buckets(self):
999
+ buckets = [[] for _ in range(len(self.boundaries) - 1)]
1000
+ for i in range(len(self.lengths)):
1001
+ length = self.lengths[i]
1002
+ idx_bucket = self._bisect(length)
1003
+ if idx_bucket != -1:
1004
+ buckets[idx_bucket].append(i)
1005
+
1006
+ i = len(buckets) - 1
1007
+ while i >= 0:
1008
+ if len(buckets[i]) == 0:
1009
+ buckets.pop(i)
1010
+ self.boundaries.pop(i + 1)
1011
+ i -= 1
1012
+
1013
+ num_samples_per_bucket = []
1014
+ for i in range(len(buckets)):
1015
+ len_bucket = len(buckets[i])
1016
+ total_batch_size = self.num_replicas * self.batch_size
1017
+ rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size
1018
+ num_samples_per_bucket.append(len_bucket + rem)
1019
+ return buckets, num_samples_per_bucket
1020
+
1021
+ def __iter__(self):
1022
+ g = torch.Generator()
1023
+ g.manual_seed(self.epoch)
1024
+
1025
+ indices = []
1026
+ if self.shuffle:
1027
+ for bucket in self.buckets:
1028
+ indices.append(torch.randperm(len(bucket), generator=g).tolist())
1029
+ else:
1030
+ for bucket in self.buckets:
1031
+ indices.append(list(range(len(bucket))))
1032
+
1033
+ batches = []
1034
+ for i in range(len(self.buckets)):
1035
+ bucket = self.buckets[i]
1036
+ len_bucket = len(bucket)
1037
+ ids_bucket = indices[i]
1038
+ num_samples_bucket = self.num_samples_per_bucket[i]
1039
+
1040
+ rem = num_samples_bucket - len_bucket
1041
+ ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[: (rem % len_bucket)]
1042
+
1043
+ ids_bucket = ids_bucket[self.rank :: self.num_replicas]
1044
+
1045
+ for j in range(len(ids_bucket) // self.batch_size):
1046
+ batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size : (j + 1) * self.batch_size]]
1047
+ batches.append(batch)
1048
+
1049
+ if self.shuffle:
1050
+ batch_ids = torch.randperm(len(batches), generator=g).tolist()
1051
+ batches = [batches[i] for i in batch_ids]
1052
+ self.batches = batches
1053
+
1054
+ assert len(self.batches) * self.batch_size == self.num_samples
1055
+ return iter(self.batches)
1056
+
1057
+ def _bisect(self, x, lo=0, hi=None):
1058
+ if hi is None:
1059
+ hi = len(self.boundaries) - 1
1060
+
1061
+ if hi > lo:
1062
+ mid = (hi + lo) // 2
1063
+ if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
1064
+ return mid
1065
+ elif x <= self.boundaries[mid]:
1066
+ return self._bisect(x, lo, mid)
1067
+ else:
1068
+ return self._bisect(x, mid + 1, hi)
1069
+ else:
1070
+ return -1
1071
+
1072
+ def __len__(self):
1073
+ return self.num_samples // self.batch_size
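`DistributedBucketSampler` only requires a dataset that exposes a `lengths` list, so its bucketing can be checked outside a real DDP run by passing `num_replicas` and `rank` explicitly. The sketch below is an assumption-laden illustration, not the project's actual training wiring.

```python
# Minimal sketch: bucket similar-length items into fixed-size batches on a single process.
import random

import torch

from GPT_SoVITS.module.data_utils import DistributedBucketSampler  # assumed import path

class DummyLengthDataset(torch.utils.data.Dataset):
    def __init__(self, lengths):
        self.lengths = lengths                     # the sampler reads this attribute
    def __getitem__(self, idx):
        return torch.zeros(self.lengths[idx])
    def __len__(self):
        return len(self.lengths)

dataset = DummyLengthDataset([random.randint(32, 900) for _ in range(200)])
sampler = DistributedBucketSampler(
    dataset,
    batch_size=4,
    boundaries=[16, 64, 256, 1024],  # lengths outside (16, 1024] are discarded
    num_replicas=1,                  # explicit values avoid needing torch.distributed init
    rank=0,
    shuffle=True,
)
sampler.set_epoch(0)                 # deterministic reshuffle per epoch
first_batch = next(iter(sampler))    # a list of dataset indices from a single length bucket
print(first_batch, [dataset.lengths[i] for i in first_batch])
```

In training this would presumably be passed to `DataLoader(dataset, batch_sampler=sampler, collate_fn=...)` together with one of the collate classes above.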
GPT_SoVITS/module/losses.py ADDED
@@ -0,0 +1,70 @@
 
1
+ import math
2
+
3
+ import torch
4
+
5
+
6
+ def feature_loss(fmap_r, fmap_g):
7
+ loss = torch.tensor(0).to(fmap_r[0][0].device)
8
+ for dr, dg in zip(fmap_r, fmap_g):
9
+ for rl, gl in zip(dr, dg):
10
+ rl = rl.float().detach()
11
+ gl = gl.float()
12
+ loss = torch.mean(torch.abs(rl - gl)) + loss
13
+
14
+ return loss * 2
15
+
16
+
17
+ def discriminator_loss(disc_real_outputs, disc_generated_outputs):
18
+ loss = torch.tensor(0).to(disc_real_outputs[0].device)
19
+ r_losses = []
20
+ g_losses = []
21
+ for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
22
+ dr = dr.float()
23
+ dg = dg.float()
24
+ r_loss = torch.mean((1 - dr) ** 2)
25
+ g_loss = torch.mean(dg**2)
26
+ loss = r_loss + g_loss + loss
27
+ r_losses.append(r_loss.item())
28
+ g_losses.append(g_loss.item())
29
+
30
+ return loss, r_losses, g_losses
31
+
32
+
33
+ def generator_loss(disc_outputs):
34
+ loss = torch.tensor(0).to(disc_outputs[0].device)
35
+ gen_losses = []
36
+ for dg in disc_outputs:
37
+ dg = dg.float()
38
+ l = torch.mean((1 - dg) ** 2)
39
+ gen_losses.append(l)
40
+ loss = l + loss
41
+
42
+ return loss, gen_losses
43
+
44
+
45
+ def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
46
+ """
47
+ z_p, logs_q: [b, h, t_t]
48
+ m_p, logs_p: [b, h, t_t]
49
+ """
50
+ z_p = z_p.float()
51
+ logs_q = logs_q.float()
52
+ m_p = m_p.float()
53
+ logs_p = logs_p.float()
54
+ z_mask = z_mask.float()
55
+
56
+ kl = logs_p - logs_q - 0.5
57
+ kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
58
+ kl = torch.sum(kl * z_mask)
59
+ l = kl / torch.sum(z_mask)
60
+ return l
61
+
62
+
63
+ def mle_loss(z, m, logs, logdet, mask):
64
+ l = torch.sum(logs) + 0.5 * torch.sum(
65
+ torch.exp(-2 * logs) * ((z - m) ** 2)
66
+ ) # neg normal likelihood w/o the constant term
67
+ l = l - torch.sum(logdet) # log jacobian determinant
68
+ l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes
69
+ l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term
70
+ return l
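All of the GAN loss helpers above operate on Python lists, one entry per sub-discriminator, and `feature_loss` additionally expects a list of feature-map lists. A hedged sketch with random stand-ins, purely to show the expected nesting and return values:

```python
# Minimal sketch: call the loss helpers with dummy discriminator outputs and feature maps.
import torch

from GPT_SoVITS.module.losses import discriminator_loss, feature_loss, generator_loss, kl_loss  # assumed path

real_outs = [torch.rand(2, 1, 100) for _ in range(3)]  # one tensor per sub-discriminator
fake_outs = [torch.rand(2, 1, 100) for _ in range(3)]

d_loss, r_losses, g_losses = discriminator_loss(real_outs, fake_outs)  # scalar + per-D floats
adv_loss, gen_losses = generator_loss(fake_outs)

fmap_r = [[torch.rand(2, 8, 50), torch.rand(2, 8, 25)] for _ in range(3)]  # feature maps per layer
fmap_g = [[torch.rand(2, 8, 50), torch.rand(2, 8, 25)] for _ in range(3)]
fm_loss = feature_loss(fmap_r, fmap_g)

z_p, logs_q, m_p, logs_p = (torch.randn(2, 192, 40) for _ in range(4))  # (batch, channels, frames)
kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask=torch.ones(2, 1, 40))
```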
GPT_SoVITS/module/mel_processing.py ADDED
@@ -0,0 +1,142 @@
 
1
+ import torch
2
+ from librosa.filters import mel as librosa_mel_fn
3
+
4
+ MAX_WAV_VALUE = 32768.0
5
+
6
+
7
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
8
+ """
9
+ PARAMS
10
+ ------
11
+ C: compression factor
12
+ """
13
+ return torch.log(torch.clamp(x, min=clip_val) * C)
14
+
15
+
16
+ def dynamic_range_decompression_torch(x, C=1):
17
+ """
18
+ PARAMS
19
+ ------
20
+ C: compression factor used to compress
21
+ """
22
+ return torch.exp(x) / C
23
+
24
+
25
+ def spectral_normalize_torch(magnitudes):
26
+ output = dynamic_range_compression_torch(magnitudes)
27
+ return output
28
+
29
+
30
+ def spectral_de_normalize_torch(magnitudes):
31
+ output = dynamic_range_decompression_torch(magnitudes)
32
+ return output
33
+
34
+
35
+ mel_basis = {}
36
+ hann_window = {}
37
+
38
+
39
+ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
40
+ if torch.min(y) < -1.2:
41
+ print("min value is ", torch.min(y))
42
+ if torch.max(y) > 1.2:
43
+ print("max value is ", torch.max(y))
44
+
45
+ global hann_window
46
+ dtype_device = str(y.dtype) + "_" + str(y.device)
47
+ # wnsize_dtype_device = str(win_size) + '_' + dtype_device
48
+ key = "%s-%s-%s-%s-%s" % (dtype_device, n_fft, sampling_rate, hop_size, win_size)
49
+ # if wnsize_dtype_device not in hann_window:
50
+ if key not in hann_window:
51
+ # hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
52
+ hann_window[key] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
53
+
54
+ y = torch.nn.functional.pad(
55
+ y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect"
56
+ )
57
+ y = y.squeeze(1)
58
+ # spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
59
+ spec = torch.stft(
60
+ y,
61
+ n_fft,
62
+ hop_length=hop_size,
63
+ win_length=win_size,
64
+ window=hann_window[key],
65
+ center=center,
66
+ pad_mode="reflect",
67
+ normalized=False,
68
+ onesided=True,
69
+ return_complex=True,
70
+ )
71
+
72
+ spec = spec.abs().pow_(2).add_(1e-8).sqrt_()
73
+ return spec
74
+
75
+
76
+ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
77
+ global mel_basis
78
+ dtype_device = str(spec.dtype) + "_" + str(spec.device)
79
+ # fmax_dtype_device = str(fmax) + '_' + dtype_device
80
+ key = "%s-%s-%s-%s-%s-%s" % (dtype_device, n_fft, num_mels, sampling_rate, fmin, fmax)
81
+ # if fmax_dtype_device not in mel_basis:
82
+ if key not in mel_basis:
83
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
84
+ # mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
85
+ mel_basis[key] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
86
+ # spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
87
+ spec = torch.matmul(mel_basis[key], spec)
88
+ spec = spectral_normalize_torch(spec)
89
+ return spec
90
+
91
+
92
+ def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
93
+ if torch.min(y) < -1.2:
94
+ print("min value is ", torch.min(y))
95
+ if torch.max(y) > 1.2:
96
+ print("max value is ", torch.max(y))
97
+
98
+ global mel_basis, hann_window
99
+ dtype_device = str(y.dtype) + "_" + str(y.device)
100
+ # fmax_dtype_device = str(fmax) + '_' + dtype_device
101
+ fmax_dtype_device = "%s-%s-%s-%s-%s-%s-%s-%s" % (
102
+ dtype_device,
103
+ n_fft,
104
+ num_mels,
105
+ sampling_rate,
106
+ hop_size,
107
+ win_size,
108
+ fmin,
109
+ fmax,
110
+ )
111
+ # wnsize_dtype_device = str(win_size) + '_' + dtype_device
112
+ wnsize_dtype_device = fmax_dtype_device
113
+ if fmax_dtype_device not in mel_basis:
114
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
115
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
116
+ if wnsize_dtype_device not in hann_window:
117
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
118
+
119
+ y = torch.nn.functional.pad(
120
+ y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect"
121
+ )
122
+ y = y.squeeze(1)
123
+
124
+ spec = torch.stft(
125
+ y,
126
+ n_fft,
127
+ hop_length=hop_size,
128
+ win_length=win_size,
129
+ window=hann_window[wnsize_dtype_device],
130
+ center=center,
131
+ pad_mode="reflect",
132
+ normalized=False,
133
+ onesided=True,
134
+ return_complex=True,
135
+ )
136
+
137
+ spec = spec.abs().pow_(2).add_(1e-8).sqrt_()
138
+
139
+ spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
140
+ spec = spectral_normalize_torch(spec)
141
+
142
+ return spec
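A hedged sketch of the spectrogram path above, reusing the 32 kHz parameters that `TextAudioSpeakerLoaderV4.get_audio` passes (n_fft 1280, hop 320, 100 mel bins); the waveform is random noise, used only to check shapes.

```python
# Minimal sketch: linear spectrogram -> log-compressed mel spectrogram.
import torch

from GPT_SoVITS.module.mel_processing import spec_to_mel_torch, spectrogram_torch  # assumed path

wav = torch.rand(1, 32000) * 2 - 1  # (batch, samples), roughly in [-1, 1] like load_audio output

spec = spectrogram_torch(
    wav, n_fft=1280, sampling_rate=32000, hop_size=320, win_size=1280, center=False
)                                    # (batch, 1280 // 2 + 1, frames)

mel = spec_to_mel_torch(
    spec, n_fft=1280, num_mels=100, sampling_rate=32000, fmin=0, fmax=None
)                                    # (batch, 100, frames), after dynamic range compression
print(spec.shape, mel.shape)
```

Note that both functions memoize the Hann window and mel filterbank per dtype/device/parameter combination in the module-level `hann_window` and `mel_basis` dicts, so repeated calls with the same settings reuse them.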
GPT_SoVITS/module/models.py ADDED
@@ -0,0 +1,1411 @@
 
1
+ import contextlib
2
+ import math
3
+ import random
4
+
5
+ import torch
6
+ from torch import nn
7
+ from torch.cuda.amp import autocast
8
+ from torch.nn import Conv1d, Conv2d, ConvTranspose1d
9
+ from torch.nn import functional as F
10
+ from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
11
+
12
+ from GPT_SoVITS.f5_tts.model import DiT
13
+ from GPT_SoVITS.text import symbols as symbols_v1
14
+ from GPT_SoVITS.text import symbols2 as symbols_v2
15
+ from GPT_SoVITS.utils import HParams
16
+ from tools.my_utils import _open_file
17
+
18
+ from . import attentions, commons, modules
19
+ from .commons import get_padding, init_weights
20
+ from .mrte_model import MRTE
21
+ from .quantize import ResidualVectorQuantizer
22
+
23
+
24
+ def set_serialization():
25
+ torch.serialization.add_safe_globals([(HParams, "utils.HParams")])
26
+ torch.serialization._open_file = _open_file
27
+
28
+
29
+ set_serialization()
30
+
31
+
32
+ class StochasticDurationPredictor(nn.Module):
33
+ def __init__(
34
+ self,
35
+ in_channels,
36
+ filter_channels,
37
+ kernel_size,
38
+ p_dropout,
39
+ n_flows=4,
40
+ gin_channels=0,
41
+ ):
42
+ super().__init__()
43
+ filter_channels = in_channels  # this override should be removed in a future version
44
+ self.in_channels = in_channels
45
+ self.filter_channels = filter_channels
46
+ self.kernel_size = kernel_size
47
+ self.p_dropout = p_dropout
48
+ self.n_flows = n_flows
49
+ self.gin_channels = gin_channels
50
+
51
+ self.log_flow = modules.Log()
52
+ self.flows = nn.ModuleList()
53
+ self.flows.append(modules.ElementwiseAffine(2))
54
+ for i in range(n_flows):
55
+ self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
56
+ self.flows.append(modules.Flip())
57
+
58
+ self.post_pre = nn.Conv1d(1, filter_channels, 1)
59
+ self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
60
+ self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
61
+ self.post_flows = nn.ModuleList()
62
+ self.post_flows.append(modules.ElementwiseAffine(2))
63
+ for i in range(4):
64
+ self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
65
+ self.post_flows.append(modules.Flip())
66
+
67
+ self.pre = nn.Conv1d(in_channels, filter_channels, 1)
68
+ self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
69
+ self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
70
+ if gin_channels != 0:
71
+ self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
72
+
73
+ def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
74
+ x = torch.detach(x)
75
+ x = self.pre(x)
76
+ if g is not None:
77
+ g = torch.detach(g)
78
+ x = x + self.cond(g)
79
+ x = self.convs(x, x_mask)
80
+ x = self.proj(x) * x_mask
81
+
82
+ if not reverse:
83
+ flows = self.flows
84
+ assert w is not None
85
+
86
+ logdet_tot_q = 0
87
+ h_w = self.post_pre(w)
88
+ h_w = self.post_convs(h_w, x_mask)
89
+ h_w = self.post_proj(h_w) * x_mask
90
+ e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
91
+ z_q = e_q
92
+ for flow in self.post_flows:
93
+ z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
94
+ logdet_tot_q += logdet_q
95
+ z_u, z1 = torch.split(z_q, [1, 1], 1)
96
+ u = torch.sigmoid(z_u) * x_mask
97
+ z0 = (w - u) * x_mask
98
+ logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2])
99
+ logq = torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2]) - logdet_tot_q
100
+
101
+ logdet_tot = 0
102
+ z0, logdet = self.log_flow(z0, x_mask)
103
+ logdet_tot += logdet
104
+ z = torch.cat([z0, z1], 1)
105
+ for flow in flows:
106
+ z, logdet = flow(z, x_mask, g=x, reverse=reverse)
107
+ logdet_tot = logdet_tot + logdet
108
+ nll = torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2]) - logdet_tot
109
+ return nll + logq # [b]
110
+ else:
111
+ flows = list(reversed(self.flows))
112
+ flows = flows[:-2] + [flows[-1]] # remove a useless vflow
113
+ z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
114
+ for flow in flows:
115
+ z = flow(z, x_mask, g=x, reverse=reverse)
116
+ z0, z1 = torch.split(z, [1, 1], 1)
117
+ logw = z0
118
+ return logw
119
+
120
+
121
+ class DurationPredictor(nn.Module):
122
+ def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
123
+ super().__init__()
124
+
125
+ self.in_channels = in_channels
126
+ self.filter_channels = filter_channels
127
+ self.kernel_size = kernel_size
128
+ self.p_dropout = p_dropout
129
+ self.gin_channels = gin_channels
130
+
131
+ self.drop = nn.Dropout(p_dropout)
132
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2)
133
+ self.norm_1 = modules.LayerNorm(filter_channels)
134
+ self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2)
135
+ self.norm_2 = modules.LayerNorm(filter_channels)
136
+ self.proj = nn.Conv1d(filter_channels, 1, 1)
137
+
138
+ if gin_channels != 0:
139
+ self.cond = nn.Conv1d(gin_channels, in_channels, 1)
140
+
141
+ def forward(self, x, x_mask, g=None):
142
+ x = torch.detach(x)
143
+ if g is not None:
144
+ g = torch.detach(g)
145
+ x = x + self.cond(g)
146
+ x = self.conv_1(x * x_mask)
147
+ x = torch.relu(x)
148
+ x = self.norm_1(x)
149
+ x = self.drop(x)
150
+ x = self.conv_2(x * x_mask)
151
+ x = torch.relu(x)
152
+ x = self.norm_2(x)
153
+ x = self.drop(x)
154
+ x = self.proj(x * x_mask)
155
+ return x * x_mask
156
+
157
+
158
+ class TextEncoder(nn.Module):
159
+ def __init__(
160
+ self,
161
+ out_channels,
162
+ hidden_channels,
163
+ filter_channels,
164
+ n_heads,
165
+ n_layers,
166
+ kernel_size,
167
+ p_dropout,
168
+ latent_channels=192,
169
+ version="v2",
170
+ ):
171
+ super().__init__()
172
+ self.out_channels = out_channels
173
+ self.hidden_channels = hidden_channels
174
+ self.filter_channels = filter_channels
175
+ self.n_heads = n_heads
176
+ self.n_layers = n_layers
177
+ self.kernel_size = kernel_size
178
+ self.p_dropout = p_dropout
179
+ self.latent_channels = latent_channels
180
+ self.version = version
181
+
182
+ self.ssl_proj = nn.Conv1d(768, hidden_channels, 1)
183
+
184
+ self.encoder_ssl = attentions.Encoder(
185
+ hidden_channels,
186
+ filter_channels,
187
+ n_heads,
188
+ n_layers // 2,
189
+ kernel_size,
190
+ p_dropout,
191
+ )
192
+
193
+ self.encoder_text = attentions.Encoder(
194
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
195
+ )
196
+
197
+ if self.version == "v1":
198
+ symbols = symbols_v1.symbols
199
+ else:
200
+ symbols = symbols_v2.symbols
201
+ self.text_embedding = nn.Embedding(len(symbols), hidden_channels)
202
+
203
+ self.mrte = MRTE()
204
+
205
+ self.encoder2 = attentions.Encoder(
206
+ hidden_channels,
207
+ filter_channels,
208
+ n_heads,
209
+ n_layers // 2,
210
+ kernel_size,
211
+ p_dropout,
212
+ )
213
+
214
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
215
+
216
+ def forward(self, y, y_lengths, text, text_lengths, ge, speed=1, test=None):
217
+ y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype)
218
+
219
+ y = self.ssl_proj(y * y_mask) * y_mask
220
+
221
+ y = self.encoder_ssl(y * y_mask, y_mask)
222
+
223
+ text_mask = torch.unsqueeze(commons.sequence_mask(text_lengths, text.size(1)), 1).to(y.dtype)
224
+ if test == 1:
225
+ text[:, :] = 0
226
+ text = self.text_embedding(text).transpose(1, 2)
227
+ text = self.encoder_text(text * text_mask, text_mask)
228
+ y = self.mrte(y, y_mask, text, text_mask, ge)
229
+ y = self.encoder2(y * y_mask, y_mask)
230
+ if speed != 1:
231
+ y = F.interpolate(y, size=int(y.shape[-1] / speed) + 1, mode="linear")
232
+ y_mask = F.interpolate(y_mask, size=y.shape[-1], mode="nearest")
233
+ stats = self.proj(y) * y_mask
234
+ m, logs = torch.split(stats, self.out_channels, dim=1)
235
+ return y, m, logs, y_mask
236
+
237
+
238
+ class ResidualCouplingBlock(nn.Module):
239
+ def __init__(
240
+ self,
241
+ channels,
242
+ hidden_channels,
243
+ kernel_size,
244
+ dilation_rate,
245
+ n_layers,
246
+ n_flows=4,
247
+ gin_channels=0,
248
+ ):
249
+ super().__init__()
250
+ self.channels = channels
251
+ self.hidden_channels = hidden_channels
252
+ self.kernel_size = kernel_size
253
+ self.dilation_rate = dilation_rate
254
+ self.n_layers = n_layers
255
+ self.n_flows = n_flows
256
+ self.gin_channels = gin_channels
257
+
258
+ self.flows = nn.ModuleList()
259
+ for i in range(n_flows):
260
+ self.flows.append(
261
+ modules.ResidualCouplingLayer(
262
+ channels,
263
+ hidden_channels,
264
+ kernel_size,
265
+ dilation_rate,
266
+ n_layers,
267
+ gin_channels=gin_channels,
268
+ mean_only=True,
269
+ )
270
+ )
271
+ self.flows.append(modules.Flip())
272
+
273
+ def forward(self, x, x_mask, g=None, reverse=False):
274
+ if not reverse:
275
+ for flow in self.flows:
276
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
277
+ else:
278
+ for flow in reversed(self.flows):
279
+ x = flow(x, x_mask, g=g, reverse=reverse)
280
+ return x
281
+
282
+
283
+ class PosteriorEncoder(nn.Module):
284
+ def __init__(
285
+ self,
286
+ in_channels,
287
+ out_channels,
288
+ hidden_channels,
289
+ kernel_size,
290
+ dilation_rate,
291
+ n_layers,
292
+ gin_channels=0,
293
+ ):
294
+ super().__init__()
295
+ self.in_channels = in_channels
296
+ self.out_channels = out_channels
297
+ self.hidden_channels = hidden_channels
298
+ self.kernel_size = kernel_size
299
+ self.dilation_rate = dilation_rate
300
+ self.n_layers = n_layers
301
+ self.gin_channels = gin_channels
302
+
303
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
304
+ self.enc = modules.WN(
305
+ hidden_channels,
306
+ kernel_size,
307
+ dilation_rate,
308
+ n_layers,
309
+ gin_channels=gin_channels,
310
+ )
311
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
312
+
313
+ def forward(self, x, x_lengths, g=None):
314
+ if g is not None:
315
+ g = g.detach()
316
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
317
+ x = self.pre(x) * x_mask
318
+ x = self.enc(x, x_mask, g=g)
319
+ stats = self.proj(x) * x_mask
320
+ m, logs = torch.split(stats, self.out_channels, dim=1)
321
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
322
+ return z, m, logs, x_mask
323
+
324
+
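The posterior encoder above samples its latent with the reparameterization trick, z = (m + ε · exp(logs)) · mask, so gradients flow through m and logs while the noise stays external. A tiny sketch of just that step (shapes are illustrative):

```python
# Sketch of the masked reparameterization used in PosteriorEncoder.forward (shapes illustrative).
import torch

m = torch.zeros(2, 192, 50)      # predicted mean,    (B, C, T)
logs = torch.zeros(2, 192, 50)   # predicted log-std, (B, C, T)
x_mask = torch.ones(2, 1, 50)    # 1 for valid frames, 0 for padding

eps = torch.randn_like(m)                  # noise is sampled outside the graph
z = (m + eps * torch.exp(logs)) * x_mask   # differentiable w.r.t. m and logs
print(z.shape)                             # torch.Size([2, 192, 50])
```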
325
+ class Encoder(nn.Module):
326
+ def __init__(
327
+ self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0
328
+ ):
329
+ super().__init__()
330
+ self.in_channels = in_channels
331
+ self.out_channels = out_channels
332
+ self.hidden_channels = hidden_channels
333
+ self.kernel_size = kernel_size
334
+ self.dilation_rate = dilation_rate
335
+ self.n_layers = n_layers
336
+ self.gin_channels = gin_channels
337
+
338
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
339
+ self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
340
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
341
+
342
+ def forward(self, x, x_lengths, g=None):
343
+ if g is not None:
344
+ g = g.detach()
345
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
346
+ x = self.pre(x) * x_mask
347
+ x = self.enc(x, x_mask, g=g)
348
+ stats = self.proj(x) * x_mask
349
+ return stats, x_mask
350
+
351
+
352
+ class WNEncoder(nn.Module):
353
+ def __init__(
354
+ self,
355
+ in_channels,
356
+ out_channels,
357
+ hidden_channels,
358
+ kernel_size,
359
+ dilation_rate,
360
+ n_layers,
361
+ gin_channels=0,
362
+ ):
363
+ super().__init__()
364
+ self.in_channels = in_channels
365
+ self.out_channels = out_channels
366
+ self.hidden_channels = hidden_channels
367
+ self.kernel_size = kernel_size
368
+ self.dilation_rate = dilation_rate
369
+ self.n_layers = n_layers
370
+ self.gin_channels = gin_channels
371
+
372
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
373
+ self.enc = modules.WN(
374
+ hidden_channels,
375
+ kernel_size,
376
+ dilation_rate,
377
+ n_layers,
378
+ gin_channels=gin_channels,
379
+ )
380
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
381
+ self.norm = modules.LayerNorm(out_channels)
382
+
383
+ def forward(self, x, x_lengths, g=None):
384
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
385
+ x = self.pre(x) * x_mask
386
+ x = self.enc(x, x_mask, g=g)
387
+ out = self.proj(x) * x_mask
388
+ out = self.norm(out)
389
+ return out
390
+
391
+
392
+ class Generator(torch.nn.Module):
393
+ def __init__(
394
+ self,
395
+ initial_channel,
396
+ resblock,
397
+ resblock_kernel_sizes,
398
+ resblock_dilation_sizes,
399
+ upsample_rates,
400
+ upsample_initial_channel,
401
+ upsample_kernel_sizes,
402
+ gin_channels=0,
403
+ is_bias=False,
404
+ ):
405
+ super(Generator, self).__init__()
406
+ self.num_kernels = len(resblock_kernel_sizes)
407
+ self.num_upsamples = len(upsample_rates)
408
+ self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
409
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
410
+
411
+ self.ups = nn.ModuleList()
412
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
413
+ self.ups.append(
414
+ weight_norm(
415
+ ConvTranspose1d(
416
+ upsample_initial_channel // (2**i),
417
+ upsample_initial_channel // (2 ** (i + 1)),
418
+ k,
419
+ u,
420
+ padding=(k - u) // 2,
421
+ )
422
+ )
423
+ )
424
+
425
+ self.resblocks = nn.ModuleList()
426
+ for i in range(len(self.ups)):
427
+ ch = upsample_initial_channel // (2 ** (i + 1))
428
+ for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
429
+ self.resblocks.append(resblock(ch, k, d))
430
+
431
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=is_bias)
432
+ self.ups.apply(init_weights)
433
+
434
+ if gin_channels != 0:
435
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
436
+
437
+ def forward(self, x, g=None):
438
+ x = self.conv_pre(x)
439
+ if g is not None:
440
+ x = x + self.cond(g)
441
+
442
+ for i in range(self.num_upsamples):
443
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
444
+ x = self.ups[i](x)
445
+ xs = None
446
+ for j in range(self.num_kernels):
447
+ if xs is None:
448
+ xs = self.resblocks[i * self.num_kernels + j](x)
449
+ else:
450
+ xs += self.resblocks[i * self.num_kernels + j](x)
451
+ x = xs / self.num_kernels
452
+ x = F.leaky_relu(x)
453
+ x = self.conv_post(x)
454
+ x = torch.tanh(x)
455
+
456
+ return x
457
+
458
+ def remove_weight_norm(self):
459
+ print("Removing weight norm...")
460
+ for l in self.ups:
461
+ remove_weight_norm(l)
462
+ for l in self.resblocks:
463
+ l.remove_weight_norm()
464
+
465
+
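The Generator's overall upsampling factor is the product of `upsample_rates`, so each input frame becomes that many waveform samples. A quick check with made-up rates (the shipped configs may differ):

```python
# Illustrative check of the Generator's total upsampling factor (rates here are examples only).
import math

upsample_rates = [10, 8, 2, 2]      # example values, not necessarily the repo's config
factor = math.prod(upsample_rates)  # 320 waveform samples per input frame
frames = 100
print(frames, "frames ->", frames * factor, "waveform samples")
```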
466
+ class DiscriminatorP(torch.nn.Module):
467
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
468
+ super(DiscriminatorP, self).__init__()
469
+ self.period = period
470
+ self.use_spectral_norm = use_spectral_norm
471
+ norm_f = weight_norm if use_spectral_norm is False else spectral_norm
472
+ self.convs = nn.ModuleList(
473
+ [
474
+ norm_f(
475
+ Conv2d(
476
+ 1,
477
+ 32,
478
+ (kernel_size, 1),
479
+ (stride, 1),
480
+ padding=(get_padding(kernel_size, 1), 0),
481
+ )
482
+ ),
483
+ norm_f(
484
+ Conv2d(
485
+ 32,
486
+ 128,
487
+ (kernel_size, 1),
488
+ (stride, 1),
489
+ padding=(get_padding(kernel_size, 1), 0),
490
+ )
491
+ ),
492
+ norm_f(
493
+ Conv2d(
494
+ 128,
495
+ 512,
496
+ (kernel_size, 1),
497
+ (stride, 1),
498
+ padding=(get_padding(kernel_size, 1), 0),
499
+ )
500
+ ),
501
+ norm_f(
502
+ Conv2d(
503
+ 512,
504
+ 1024,
505
+ (kernel_size, 1),
506
+ (stride, 1),
507
+ padding=(get_padding(kernel_size, 1), 0),
508
+ )
509
+ ),
510
+ norm_f(
511
+ Conv2d(
512
+ 1024,
513
+ 1024,
514
+ (kernel_size, 1),
515
+ 1,
516
+ padding=(get_padding(kernel_size, 1), 0),
517
+ )
518
+ ),
519
+ ]
520
+ )
521
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
522
+
523
+ def forward(self, x):
524
+ fmap = []
525
+
526
+ # 1d to 2d
527
+ b, c, t = x.shape
528
+ if t % self.period != 0: # pad first
529
+ n_pad = self.period - (t % self.period)
530
+ x = F.pad(x, (0, n_pad), "reflect")
531
+ t = t + n_pad
532
+ x = x.view(b, c, t // self.period, self.period)
533
+
534
+ for l in self.convs:
535
+ x = l(x)
536
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
537
+ fmap.append(x)
538
+ x = self.conv_post(x)
539
+ fmap.append(x)
540
+ x = torch.flatten(x, 1, -1)
541
+
542
+ return x, fmap
543
+
544
+
545
+ class DiscriminatorS(torch.nn.Module):
546
+ def __init__(self, use_spectral_norm=False):
547
+ super(DiscriminatorS, self).__init__()
548
+ norm_f = weight_norm if use_spectral_norm is False else spectral_norm
549
+ self.convs = nn.ModuleList(
550
+ [
551
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
552
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
553
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
554
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
555
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
556
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
557
+ ]
558
+ )
559
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
560
+
561
+ def forward(self, x):
562
+ fmap = []
563
+
564
+ for l in self.convs:
565
+ x = l(x)
566
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
567
+ fmap.append(x)
568
+ x = self.conv_post(x)
569
+ fmap.append(x)
570
+ x = torch.flatten(x, 1, -1)
571
+
572
+ return x, fmap
573
+
574
+
575
+ v2pro_set = {"v2Pro", "v2ProPlus"}
576
+
577
+
578
+ class MultiPeriodDiscriminator(torch.nn.Module):
579
+ def __init__(self, use_spectral_norm=False, version=None):
580
+ super(MultiPeriodDiscriminator, self).__init__()
581
+ if version in v2pro_set:
582
+ periods = [2, 3, 5, 7, 11, 17, 23]
583
+ else:
584
+ periods = [2, 3, 5, 7, 11]
585
+
586
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
587
+ discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
588
+ self.discriminators = nn.ModuleList(discs)
589
+
590
+ def forward(self, y, y_hat):
591
+ y_d_rs = []
592
+ y_d_gs = []
593
+ fmap_rs = []
594
+ fmap_gs = []
595
+ for i, d in enumerate(self.discriminators):
596
+ y_d_r, fmap_r = d(y)
597
+ y_d_g, fmap_g = d(y_hat)
598
+ y_d_rs.append(y_d_r)
599
+ y_d_gs.append(y_d_g)
600
+ fmap_rs.append(fmap_r)
601
+ fmap_gs.append(fmap_g)
602
+
603
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
604
+
605
+
606
+ class ReferenceEncoder(nn.Module):
607
+ """
608
+ inputs --- [N, Ty/r, n_mels*r] mels
609
+ outputs --- [N, ref_enc_gru_size]
610
+ """
611
+
612
+ def __init__(self, spec_channels, gin_channels=0):
613
+ super().__init__()
614
+ self.spec_channels = spec_channels
615
+ ref_enc_filters = [32, 32, 64, 64, 128, 128]
616
+ K = len(ref_enc_filters)
617
+ filters = [1] + ref_enc_filters
618
+ convs = [
619
+ weight_norm(
620
+ nn.Conv2d(
621
+ in_channels=filters[i],
622
+ out_channels=filters[i + 1],
623
+ kernel_size=(3, 3),
624
+ stride=(2, 2),
625
+ padding=(1, 1),
626
+ )
627
+ )
628
+ for i in range(K)
629
+ ]
630
+ self.convs = nn.ModuleList(convs)
631
+ # self.wns = nn.ModuleList([weight_norm(num_features=ref_enc_filters[i]) for i in range(K)])
632
+
633
+ out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K)
634
+ self.gru = nn.GRU(
635
+ input_size=ref_enc_filters[-1] * out_channels,
636
+ hidden_size=256 // 2,
637
+ batch_first=True,
638
+ )
639
+ self.proj = nn.Linear(128, gin_channels)
640
+
641
+ def forward(self, inputs):
642
+ N = inputs.size(0)
643
+ out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs]
644
+ for conv in self.convs:
645
+ out = conv(out)
646
+ # out = wn(out)
647
+ out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K]
648
+
649
+ out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K]
650
+ T = out.size(1)
651
+ N = out.size(0)
652
+ out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K]
653
+
654
+ self.gru.flatten_parameters()
655
+ memory, out = self.gru(out) # out --- [1, N, 128]
656
+
657
+ return self.proj(out.squeeze(0)).unsqueeze(-1)
658
+
659
+ def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
660
+ for i in range(n_convs):
661
+ L = (L - kernel_size + 2 * pad) // stride + 1
662
+ return L
663
+
664
+
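`calculate_channels` above just iterates the standard conv output-length formula, L' = ⌊(L − k + 2p) / s⌋ + 1, once per conv layer (k=3, s=2, p=1, six layers), i.e. roughly halving each time. For example, with a 704-bin input (704 is just an example size):

```python
# Worked example of the recursion in ReferenceEncoder.calculate_channels
# (kernel=3, stride=2, pad=1, six conv layers; 704 is just an example input size).
L = 704
for _ in range(6):
    L = (L - 3 + 2 * 1) // 2 + 1
    print(L)  # 352, 176, 88, 44, 22, 11
```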
665
+ class Quantizer_module(torch.nn.Module):
666
+ def __init__(self, n_e, e_dim):
667
+ super(Quantizer_module, self).__init__()
668
+ self.embedding = nn.Embedding(n_e, e_dim)
669
+ self.embedding.weight.data.uniform_(-1.0 / n_e, 1.0 / n_e)
670
+
671
+ def forward(self, x):
672
+ d = (
673
+ torch.sum(x**2, 1, keepdim=True)
674
+ + torch.sum(self.embedding.weight**2, 1)
675
+ - 2 * torch.matmul(x, self.embedding.weight.T)
676
+ )
677
+ min_indicies = torch.argmin(d, 1)
678
+ z_q = self.embedding(min_indicies)
679
+ return z_q, min_indicies
680
+
681
+
682
+ class Quantizer(torch.nn.Module):
683
+ def __init__(self, embed_dim=512, n_code_groups=4, n_codes=160):
684
+ super(Quantizer, self).__init__()
685
+ assert embed_dim % n_code_groups == 0
686
+ self.quantizer_modules = nn.ModuleList(
687
+ [Quantizer_module(n_codes, embed_dim // n_code_groups) for _ in range(n_code_groups)]
688
+ )
689
+ self.n_code_groups = n_code_groups
690
+ self.embed_dim = embed_dim
691
+
692
+ def forward(self, xin):
693
+ # B, C, T
694
+ B, C, T = xin.shape
695
+ xin = xin.transpose(1, 2)
696
+ x = xin.reshape(-1, self.embed_dim)
697
+ x = torch.split(x, self.embed_dim // self.n_code_groups, dim=-1)
698
+ min_indicies = []
699
+ z_q = []
700
+ for _x, m in zip(x, self.quantizer_modules):
701
+ _z_q, _min_indicies = m(_x)
702
+ z_q.append(_z_q)
703
+ min_indicies.append(_min_indicies) # B * T,
704
+ z_q = torch.cat(z_q, -1).reshape(xin.shape)
705
+ loss = 0.25 * torch.mean((z_q.detach() - xin) ** 2) + torch.mean((z_q - xin.detach()) ** 2)
706
+ z_q = xin + (z_q - xin).detach()
707
+ z_q = z_q.transpose(1, 2)
708
+ codes = torch.stack(min_indicies, -1).reshape(B, T, self.n_code_groups)
709
+ return z_q, loss, codes.transpose(1, 2)
710
+
711
+ def embed(self, x):
712
+ # idx: N, 4, T
713
+ x = x.transpose(1, 2)
714
+ x = torch.split(x, 1, 2)
715
+ ret = []
716
+ for q, embed in zip(x, self.quantizer_modules):
717
+ q = embed.embedding(q.squeeze(-1))
718
+ ret.append(q)
719
+ ret = torch.cat(ret, -1)
720
+ return ret.transpose(1, 2) # N, C, T
721
+
722
+
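`Quantizer_module` above selects the nearest codebook entry via the expansion ‖x − e‖² = ‖x‖² + ‖e‖² − 2·x·eᵀ, which is exactly the three-term `d`. A compact numerical check:

```python
# Check that the expanded distance in Quantizer_module.forward equals an explicit ||x - e||^2.
import torch

x = torch.randn(5, 128)           # 5 vectors to quantize
codebook = torch.randn(160, 128)  # 160 codes of dimension 128

d_expanded = x.pow(2).sum(1, keepdim=True) + codebook.pow(2).sum(1) - 2 * x @ codebook.T
d_direct = torch.cdist(x, codebook).pow(2)

print(torch.allclose(d_expanded, d_direct, atol=1e-3))  # True, up to float error
print(d_expanded.argmin(1))                             # nearest-code indices, as in forward()
```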
723
+ class CodePredictor(nn.Module):
724
+ def __init__(
725
+ self,
726
+ hidden_channels,
727
+ filter_channels,
728
+ n_heads,
729
+ n_layers,
730
+ kernel_size,
731
+ p_dropout,
732
+ n_q=8,
733
+ dims=1024,
734
+ ssl_dim=768,
735
+ ):
736
+ super().__init__()
737
+ self.hidden_channels = hidden_channels
738
+ self.filter_channels = filter_channels
739
+ self.n_heads = n_heads
740
+ self.n_layers = n_layers
741
+ self.kernel_size = kernel_size
742
+ self.p_dropout = p_dropout
743
+
744
+ self.vq_proj = nn.Conv1d(ssl_dim, hidden_channels, 1)
745
+ self.ref_enc = modules.MelStyleEncoder(ssl_dim, style_vector_dim=hidden_channels)
746
+
747
+ self.encoder = attentions.Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout)
748
+
749
+ self.out_proj = nn.Conv1d(hidden_channels, (n_q - 1) * dims, 1)
750
+ self.n_q = n_q
751
+ self.dims = dims
752
+
753
+ def forward(self, x, x_mask, refer, codes, infer=False):
754
+ x = x.detach()
755
+ x = self.vq_proj(x * x_mask) * x_mask
756
+ g = self.ref_enc(refer, x_mask)
757
+ x = x + g
758
+ x = self.encoder(x * x_mask, x_mask)
759
+ x = self.out_proj(x * x_mask) * x_mask
760
+ logits = x.reshape(x.shape[0], self.n_q - 1, self.dims, x.shape[-1]).transpose(2, 3)
761
+ target = codes[1:].transpose(0, 1)
762
+ if not infer:
763
+ logits = logits.reshape(-1, self.dims)
764
+ target = target.reshape(-1)
765
+ loss = torch.nn.functional.cross_entropy(logits, target)
766
+ return loss
767
+ else:
768
+ _, top10_preds = torch.topk(logits, 10, dim=-1)
769
+ correct_top10 = torch.any(top10_preds == target.unsqueeze(-1), dim=-1)
770
+ top10_acc = 100 * torch.mean(correct_top10.float()).detach().cpu().item()
772
+
773
+ print("Top-10 Accuracy:", top10_acc, "%")
773
+
774
+ pred_codes = torch.argmax(logits, dim=-1)
775
+ acc = 100 * torch.mean((pred_codes == target).float()).detach().cpu().item()
776
+ print("Top-1 Accuracy:", acc, "%")
777
+
778
+ return pred_codes.transpose(0, 1)
779
+
780
+
781
+ class SynthesizerTrn(nn.Module):
782
+ """
783
+ Synthesizer for Training
784
+ """
785
+
786
+ def __init__(
787
+ self,
788
+ spec_channels,
789
+ segment_size,
790
+ inter_channels,
791
+ hidden_channels,
792
+ filter_channels,
793
+ n_heads,
794
+ n_layers,
795
+ kernel_size,
796
+ p_dropout,
797
+ resblock,
798
+ resblock_kernel_sizes,
799
+ resblock_dilation_sizes,
800
+ upsample_rates,
801
+ upsample_initial_channel,
802
+ upsample_kernel_sizes,
803
+ n_speakers=0,
804
+ gin_channels=0,
805
+ use_sdp=True,
806
+ semantic_frame_rate=None,
807
+ freeze_quantizer=None,
808
+ version="v2",
809
+ **kwargs,
810
+ ):
811
+ super().__init__()
812
+ self.spec_channels = spec_channels
813
+ self.inter_channels = inter_channels
814
+ self.hidden_channels = hidden_channels
815
+ self.filter_channels = filter_channels
816
+ self.n_heads = n_heads
817
+ self.n_layers = n_layers
818
+ self.kernel_size = kernel_size
819
+ self.p_dropout = p_dropout
820
+ self.resblock = resblock
821
+ self.resblock_kernel_sizes = resblock_kernel_sizes
822
+ self.resblock_dilation_sizes = resblock_dilation_sizes
823
+ self.upsample_rates = upsample_rates
824
+ self.upsample_initial_channel = upsample_initial_channel
825
+ self.upsample_kernel_sizes = upsample_kernel_sizes
826
+ self.segment_size = segment_size
827
+ self.n_speakers = n_speakers
828
+ self.gin_channels = gin_channels
829
+ self.version = version
830
+
831
+ self.use_sdp = use_sdp
832
+ self.enc_p = TextEncoder(
833
+ inter_channels,
834
+ hidden_channels,
835
+ filter_channels,
836
+ n_heads,
837
+ n_layers,
838
+ kernel_size,
839
+ p_dropout,
840
+ version=version,
841
+ )
842
+ self.dec = Generator(
843
+ inter_channels,
844
+ resblock,
845
+ resblock_kernel_sizes,
846
+ resblock_dilation_sizes,
847
+ upsample_rates,
848
+ upsample_initial_channel,
849
+ upsample_kernel_sizes,
850
+ gin_channels=gin_channels,
851
+ )
852
+ self.enc_q = PosteriorEncoder(
853
+ spec_channels,
854
+ inter_channels,
855
+ hidden_channels,
856
+ 5,
857
+ 1,
858
+ 16,
859
+ gin_channels=gin_channels,
860
+ )
861
+ self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
862
+
863
+ # self.version=os.environ.get("version","v1")
864
+ if self.version == "v1":
865
+ self.ref_enc = modules.MelStyleEncoder(spec_channels, style_vector_dim=gin_channels)
866
+ else:
867
+ self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels)
868
+
869
+ ssl_dim = 768
870
+ assert semantic_frame_rate in ["25hz", "50hz"]
871
+ self.semantic_frame_rate = semantic_frame_rate
872
+ if semantic_frame_rate == "25hz":
873
+ self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 2, stride=2)
874
+ else:
875
+ self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 1, stride=1)
876
+
877
+ self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024)
878
+ self.freeze_quantizer = freeze_quantizer
879
+
880
+ self.is_v2pro = self.version in v2pro_set
881
+ if self.is_v2pro:
882
+ self.sv_emb = nn.Linear(20480, gin_channels)
883
+ self.ge_to512 = nn.Linear(gin_channels, 512)
884
+ self.prelu = nn.PReLU(num_parameters=gin_channels)
885
+
886
+ def forward(self, ssl, y, y_lengths, text, text_lengths, sv_emb=None):
887
+ y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype)
888
+ if self.version == "v1":
889
+ ge = self.ref_enc(y * y_mask, y_mask)
890
+ else:
891
+ ge = self.ref_enc(y[:, :704] * y_mask, y_mask)
892
+ if self.is_v2pro:
893
+ sv_emb = self.sv_emb(sv_emb) # B*20480->B*512
894
+ ge += sv_emb.unsqueeze(-1)
895
+ ge = self.prelu(ge)
896
+ ge512 = self.ge_to512(ge.transpose(2, 1)).transpose(2, 1)
897
+ with autocast(enabled=False):
898
+ maybe_no_grad = torch.no_grad() if self.freeze_quantizer else contextlib.nullcontext()
899
+ with maybe_no_grad:
900
+ if self.freeze_quantizer:
901
+ self.ssl_proj.eval()
902
+ self.quantizer.eval()
903
+ ssl = self.ssl_proj(ssl)
904
+ quantized, codes, commit_loss, quantized_list = self.quantizer(ssl, layers=[0])
905
+
906
+ if self.semantic_frame_rate == "25hz":
907
+ quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest")
908
+
909
+ x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge512 if self.is_v2pro else ge)
910
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=ge)
911
+ z_p = self.flow(z, y_mask, g=ge)
912
+
913
+ z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size)
914
+ o = self.dec(z_slice, g=ge)
915
+ return (
916
+ o,
917
+ commit_loss,
918
+ ids_slice,
919
+ y_mask,
920
+ y_mask,
921
+ (z, z_p, m_p, logs_p, m_q, logs_q),
922
+ quantized,
923
+ )
924
+
925
+ def infer(self, ssl, y, y_lengths, text, text_lengths, test=None, noise_scale=0.5):
926
+ y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype)
927
+ if self.version == "v1":
928
+ ge = self.ref_enc(y * y_mask, y_mask)
929
+ else:
930
+ ge = self.ref_enc(y[:, :704] * y_mask, y_mask)
931
+
932
+ ssl = self.ssl_proj(ssl)
933
+ quantized, codes, commit_loss, _ = self.quantizer(ssl, layers=[0])
934
+ if self.semantic_frame_rate == "25hz":
935
+ quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest")
936
+
937
+ x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge, test=test)
938
+ z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
939
+
940
+ z = self.flow(z_p, y_mask, g=ge, reverse=True)
941
+
942
+ o = self.dec((z * y_mask)[:, :, :], g=ge)
943
+ return o, y_mask, (z, z_p, m_p, logs_p)
944
+
945
+ def decode(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=None):
946
+ def get_ge(refer, sv_emb):
947
+ ge = None
948
+ if refer is not None:
949
+ refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device)
950
+ refer_mask = torch.unsqueeze(commons.sequence_mask(refer_lengths, refer.size(2)), 1).to(refer.dtype)
951
+ if self.version == "v1":
952
+ ge = self.ref_enc(refer * refer_mask, refer_mask)
953
+ else:
954
+ ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask)
955
+ if self.is_v2pro:
956
+ sv_emb = self.sv_emb(sv_emb) # B*20480->B*512
957
+ ge += sv_emb.unsqueeze(-1)
958
+ ge = self.prelu(ge)
959
+ return ge
960
+
961
+ if isinstance(refer, list):
962
+ ges = []
963
+ for idx, _refer in enumerate(refer):
964
+ ge = get_ge(_refer, sv_emb[idx] if self.is_v2pro else None)
965
+ ges.append(ge)
966
+ ge = torch.stack(ges, 0).mean(0)
967
+ else:
968
+ ge = get_ge(refer, sv_emb)
969
+
970
+ y_lengths = torch.LongTensor([codes.size(2) * 2]).to(codes.device)
971
+ text_lengths = torch.LongTensor([text.size(-1)]).to(text.device)
972
+
973
+ quantized = self.quantizer.decode(codes)
974
+ if self.semantic_frame_rate == "25hz":
975
+ quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest")
976
+ x, m_p, logs_p, y_mask = self.enc_p(
977
+ quantized,
978
+ y_lengths,
979
+ text,
980
+ text_lengths,
981
+ self.ge_to512(ge.transpose(2, 1)).transpose(2, 1) if self.is_v2pro else ge,
982
+ speed,
983
+ )
984
+ z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
985
+
986
+ z = self.flow(z_p, y_mask, g=ge, reverse=True)
987
+
988
+ o = self.dec((z * y_mask)[:, :, :], g=ge)
989
+ return o
990
+
991
+ def extract_latent(self, x) -> torch.Tensor:
992
+ ssl = self.ssl_proj(x)
993
+ quantized, codes, commit_loss, quantized_list = self.quantizer(ssl)
994
+ return codes.transpose(0, 1)
995
+
996
+
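At inference time `SynthesizerTrn` is normally driven through `extract_latent` (SSL features of the prompt → semantic codes) and `decode` (codes + phoneme ids + reference spectrogram → waveform). Below is a hedged sketch of that call order; the tensor shapes and preprocessing are assumptions, not pinned down by this file.

```python
# Hedged sketch of the usual SynthesizerTrn inference path; shapes/preprocessing are assumptions.
import torch


def synthesize(vq_model, ssl_feats, phoneme_ids, refer_spec, sv_emb=None):
    """ssl_feats: (1, 768, T_ssl) SSL features of the prompt audio;
    phoneme_ids: (1, T_text) symbol ids; refer_spec: (1, spec_channels, T_ref) spectrogram."""
    with torch.no_grad():
        codes = vq_model.extract_latent(ssl_feats)  # (1, n_q=1, T_code) semantic tokens
        # In the full pipeline an AR text-to-semantic model predicts codes for the target text;
        # decoding the prompt's own codes, as done here, just reconstructs the prompt audio.
        audio = vq_model.decode(codes, phoneme_ids, refer_spec, noise_scale=0.5, sv_emb=sv_emb)
    return audio  # (1, 1, n_samples)
```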
997
+ class CFM(torch.nn.Module):
998
+ def __init__(self, in_channels, dit):
999
+ super().__init__()
1000
+ self.sigma_min = 1e-6
1001
+
1002
+ self.estimator = dit
1003
+
1004
+ self.in_channels = in_channels
1005
+
1006
+ self.criterion = torch.nn.MSELoss()
1007
+
1008
+ self.use_conditioner_cache = True
1009
+
1010
+ @torch.inference_mode()
1011
+ def inference(self, mu, x_lens, prompt, n_timesteps, temperature=1.0, inference_cfg_rate=0):
1012
+ """Sample by integrating the learned flow (Euler steps, optional classifier-free guidance)."""
1013
+ B, T = mu.size(0), mu.size(1)
1014
+ x = torch.randn([B, self.in_channels, T], device=mu.device, dtype=mu.dtype) * temperature
1015
+ prompt_len = prompt.size(-1)
1016
+ prompt_x = torch.zeros_like(x, dtype=mu.dtype)
1017
+ prompt_x[..., :prompt_len] = prompt[..., :prompt_len]
1018
+ x[..., :prompt_len] = 0
1019
+ mu = mu.transpose(2, 1)
1020
+ t = 0
1021
+ d = 1 / n_timesteps
1022
+ text_cache = None
1023
+ text_cfg_cache = None
1024
+ dt_cache = None
1025
+ d_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * d
1026
+ for j in range(n_timesteps):
1027
+ t_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * t
1028
+ # v_pred = model(x, t_tensor, d_tensor, **extra_args)
1029
+ v_pred, text_emb, dt = self.estimator(
1030
+ x,
1031
+ prompt_x,
1032
+ x_lens,
1033
+ t_tensor,
1034
+ d_tensor,
1035
+ mu,
1036
+ use_grad_ckpt=False,
1037
+ drop_audio_cond=False,
1038
+ drop_text=False,
1039
+ infer=True,
1040
+ text_cache=text_cache,
1041
+ dt_cache=dt_cache,
1042
+ )
1043
+ v_pred = v_pred.transpose(2, 1)
1044
+ if self.use_conditioner_cache:
1045
+ text_cache = text_emb
1046
+ dt_cache = dt
1047
+ if inference_cfg_rate > 1e-5:
1048
+ neg, text_cfg_emb, _ = self.estimator(
1049
+ x,
1050
+ prompt_x,
1051
+ x_lens,
1052
+ t_tensor,
1053
+ d_tensor,
1054
+ mu,
1055
+ use_grad_ckpt=False,
1056
+ drop_audio_cond=True,
1057
+ drop_text=True,
1058
+ infer=True,
1059
+ text_cache=text_cfg_cache,
1060
+ dt_cache=dt_cache,
1061
+ )
1062
+ neg = neg.transpose(2, 1)
1063
+ if self.use_conditioner_cache:
1064
+ text_cfg_cache = text_cfg_emb
1065
+ v_pred = v_pred + (v_pred - neg) * inference_cfg_rate
1066
+ x = x + d * v_pred
1067
+ t = t + d
1068
+ x[:, :, :prompt_len] = 0
1069
+ return x
1070
+
1071
+ def forward(self, x1, x_lens, prompt_lens, mu, use_grad_ckpt):
1072
+ b, _, t = x1.shape
1073
+ t = torch.rand([b], device=mu.device, dtype=x1.dtype)
1074
+ x0 = torch.randn_like(x1, device=mu.device)
1075
+ vt = x1 - x0
1076
+ xt = x0 + t[:, None, None] * vt
1077
+ dt = torch.zeros_like(t, device=mu.device)
1078
+ prompt = torch.zeros_like(x1)
1079
+ for i in range(b):
1080
+ prompt[i, :, : prompt_lens[i]] = x1[i, :, : prompt_lens[i]]
1081
+ xt[i, :, : prompt_lens[i]] = 0
1082
+ gailv = 0.3  # gailv ("probability"): chance of using the two-step averaged velocity target below  # if ttime()>1736250488 else 0.1
1083
+ if random.random() < gailv:
1084
+ base = torch.randint(2, 8, (t.shape[0],), device=mu.device)
1085
+ d = 1 / torch.pow(2, base)
1086
+ d_input = d.clone()
1087
+ d_input[d_input < 1e-2] = 0
1088
+ # with torch.no_grad():
1089
+ v_pred_1 = self.estimator(xt, prompt, x_lens, t, d_input, mu, use_grad_ckpt).transpose(2, 1).detach()
1090
+ # v_pred_1 = self.diffusion(xt, t, d_input, cond=conditioning).detach()
1091
+ x_mid = xt + d[:, None, None] * v_pred_1
1092
+ # v_pred_2 = self.diffusion(x_mid, t+d, d_input, cond=conditioning).detach()
1093
+ v_pred_2 = self.estimator(x_mid, prompt, x_lens, t + d, d_input, mu, use_grad_ckpt).transpose(2, 1).detach()
1094
+ vt = (v_pred_1 + v_pred_2) / 2
1095
+ vt = vt.detach()
1096
+ dt = 2 * d
1097
+
1098
+ vt_pred = self.estimator(xt, prompt, x_lens, t, dt, mu, use_grad_ckpt).transpose(2, 1)
1099
+ loss = 0
1100
+ for i in range(b):
1101
+ loss += self.criterion(vt_pred[i, :, prompt_lens[i] : x_lens[i]], vt[i, :, prompt_lens[i] : x_lens[i]])
1102
+ loss /= b
1103
+
1104
+ return loss
1105
+
1106
+
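`CFM.inference` above integrates the learned velocity field with `n_timesteps` explicit Euler steps, optionally applying classifier-free guidance by extrapolating away from the unconditional prediction. The per-step update in isolation looks like this (toy tensors, no model):

```python
# The per-step update from CFM.inference, isolated for clarity (toy tensors, no model).
import torch


def euler_cfg_step(x, v_cond, v_uncond, d, cfg_rate):
    v = v_cond
    if cfg_rate > 1e-5:
        v = v + (v - v_uncond) * cfg_rate  # classifier-free guidance, as in the loop above
    return x + d * v                       # explicit Euler step, d = 1 / n_timesteps


x = torch.zeros(1, 100, 50)  # (B, mel_channels, T)
x = euler_cfg_step(x, torch.ones_like(x), torch.zeros_like(x), d=0.25, cfg_rate=0.5)
print(x.mean().item())       # 0.375 = 0.25 * (1 + (1 - 0) * 0.5)
```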
1107
+ def set_no_grad(net_g):
1108
+ for name, param in net_g.named_parameters():
1109
+ param.requires_grad = False
1110
+
1111
+
1112
+ class SynthesizerTrnV3(nn.Module):
1113
+ """
1114
+ Synthesizer for Training
1115
+ """
1116
+
1117
+ def __init__(
1118
+ self,
1119
+ spec_channels,
1120
+ segment_size,
1121
+ inter_channels,
1122
+ hidden_channels,
1123
+ filter_channels,
1124
+ n_heads,
1125
+ n_layers,
1126
+ kernel_size,
1127
+ p_dropout,
1128
+ resblock,
1129
+ resblock_kernel_sizes,
1130
+ resblock_dilation_sizes,
1131
+ upsample_rates,
1132
+ upsample_initial_channel,
1133
+ upsample_kernel_sizes,
1134
+ n_speakers=0,
1135
+ gin_channels=0,
1136
+ use_sdp=True,
1137
+ semantic_frame_rate=None,
1138
+ freeze_quantizer=None,
1139
+ version="v3",
1140
+ **kwargs,
1141
+ ):
1142
+ super().__init__()
1143
+ self.spec_channels = spec_channels
1144
+ self.inter_channels = inter_channels
1145
+ self.hidden_channels = hidden_channels
1146
+ self.filter_channels = filter_channels
1147
+ self.n_heads = n_heads
1148
+ self.n_layers = n_layers
1149
+ self.kernel_size = kernel_size
1150
+ self.p_dropout = p_dropout
1151
+ self.resblock = resblock
1152
+ self.resblock_kernel_sizes = resblock_kernel_sizes
1153
+ self.resblock_dilation_sizes = resblock_dilation_sizes
1154
+ self.upsample_rates = upsample_rates
1155
+ self.upsample_initial_channel = upsample_initial_channel
1156
+ self.upsample_kernel_sizes = upsample_kernel_sizes
1157
+ self.segment_size = segment_size
1158
+ self.n_speakers = n_speakers
1159
+ self.gin_channels = gin_channels
1160
+ self.version = version
1161
+
1162
+ self.model_dim = 512
1163
+ self.use_sdp = use_sdp
1164
+ self.enc_p = TextEncoder(
1165
+ inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
1166
+ )
1167
+ self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels)
1168
+
1169
+ ssl_dim = 768
1170
+ assert semantic_frame_rate in ["25hz", "50hz"]
1171
+ self.semantic_frame_rate = semantic_frame_rate
1172
+ if semantic_frame_rate == "25hz":
1173
+ self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 2, stride=2)
1174
+ else:
1175
+ self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 1, stride=1)
1176
+
1177
+ self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024)
1178
+ self.freeze_quantizer = freeze_quantizer
1179
+ inter_channels2 = 512
1180
+ self.bridge = nn.Sequential(nn.Conv1d(inter_channels, inter_channels2, 1, stride=1), nn.LeakyReLU())
1181
+ self.wns1 = Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8, gin_channels=gin_channels)
1182
+ self.linear_mel = nn.Conv1d(inter_channels2, 100, 1, stride=1)
1183
+ self.cfm = CFM(
1184
+ 100,
1185
+ DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)),
1186
+ ) # text_dim is condition feature dim
1187
+ if self.freeze_quantizer is True:
1188
+ set_no_grad(self.ssl_proj)
1189
+ set_no_grad(self.quantizer)
1190
+ set_no_grad(self.enc_p)
1191
+
1192
+ def forward(
1193
+ self, ssl, y, mel, ssl_lengths, y_lengths, text, text_lengths, mel_lengths, use_grad_ckpt
1194
+ ): # ssl_lengths no need now
1195
+ with autocast(enabled=False):
1196
+ y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype)
1197
+ ge = self.ref_enc(y[:, :704] * y_mask, y_mask)
1198
+ maybe_no_grad = torch.no_grad() if self.freeze_quantizer else contextlib.nullcontext()
1199
+ with maybe_no_grad:
1200
+ if self.freeze_quantizer:
1201
+ self.ssl_proj.eval() #
1202
+ self.quantizer.eval()
1203
+ self.enc_p.eval()
1204
+ ssl = self.ssl_proj(ssl)
1205
+ quantized, codes, commit_loss, quantized_list = self.quantizer(ssl, layers=[0])
1206
+ quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT
1207
+ x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge)
1208
+ fea = self.bridge(x)
1209
+ fea = F.interpolate(fea, scale_factor=(1.875 if self.version == "v3" else 2), mode="nearest") ##BCT
1210
+ fea, y_mask_ = self.wns1(
1211
+ fea, mel_lengths, ge
1212
+ ) ##If the 1-minute fine-tuning works fine, no need to manually adjust the learning rate.
1213
+ B = ssl.shape[0]
1214
+ prompt_len_max = mel_lengths * 2 / 3
1215
+ prompt_len = (torch.rand([B], device=fea.device) * prompt_len_max).floor().to(dtype=torch.long)
1216
+ minn = min(mel.shape[-1], fea.shape[-1])
1217
+ mel = mel[:, :, :minn]
1218
+ fea = fea[:, :, :minn]
1219
+ cfm_loss = self.cfm(mel, mel_lengths, prompt_len, fea, use_grad_ckpt)
1220
+ return cfm_loss
1221
+
1222
+ @torch.no_grad()
1223
+ def decode_encp(self, codes, text, refer, ge=None, speed=1):
1224
+ # print(2333333,refer.shape)
1225
+ # ge=None
1226
+ if ge is None:
1227
+ refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device)
1228
+ refer_mask = torch.unsqueeze(commons.sequence_mask(refer_lengths, refer.size(2)), 1).to(refer.dtype)
1229
+ ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask)
1230
+ y_lengths = torch.LongTensor([int(codes.size(2) * 2)]).to(codes.device)
1231
+ if speed == 1:
1232
+ sizee = int(codes.size(2) * (3.875 if self.version == "v3" else 4))
1233
+ else:
1234
+ sizee = int(codes.size(2) * (3.875 if self.version == "v3" else 4) / speed) + 1
1235
+ y_lengths1 = torch.LongTensor([sizee]).to(codes.device)
1236
+ text_lengths = torch.LongTensor([text.size(-1)]).to(text.device)
1237
+
1238
+ quantized = self.quantizer.decode(codes)
1239
+ if self.semantic_frame_rate == "25hz":
1240
+ quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT
1241
+ x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge, speed)
1242
+ fea = self.bridge(x)
1243
+ fea = F.interpolate(fea, scale_factor=(1.875 if self.version == "v3" else 2), mode="nearest") ##BCT
1244
+ #### more WN parameters to learn the mel
1245
+ fea, y_mask_ = self.wns1(fea, y_lengths1, ge)
1246
+ return fea, ge
1247
+
1248
+ def extract_latent(self, x):
1249
+ ssl = self.ssl_proj(x)
1250
+ quantized, codes, commit_loss, quantized_list = self.quantizer(ssl)
1251
+ return codes.transpose(0, 1)
1252
+
1253
+
1254
+ class SynthesizerTrnV3b(nn.Module):
1255
+ """
1256
+ Synthesizer for Training
1257
+ """
1258
+
1259
+ def __init__(
1260
+ self,
1261
+ spec_channels,
1262
+ segment_size,
1263
+ inter_channels,
1264
+ hidden_channels,
1265
+ filter_channels,
1266
+ n_heads,
1267
+ n_layers,
1268
+ kernel_size,
1269
+ p_dropout,
1270
+ resblock,
1271
+ resblock_kernel_sizes,
1272
+ resblock_dilation_sizes,
1273
+ upsample_rates,
1274
+ upsample_initial_channel,
1275
+ upsample_kernel_sizes,
1276
+ n_speakers=0,
1277
+ gin_channels=0,
1278
+ use_sdp=True,
1279
+ semantic_frame_rate=None,
1280
+ freeze_quantizer=None,
1281
+ **kwargs,
1282
+ ):
1283
+ super().__init__()
1284
+ self.spec_channels = spec_channels
1285
+ self.inter_channels = inter_channels
1286
+ self.hidden_channels = hidden_channels
1287
+ self.filter_channels = filter_channels
1288
+ self.n_heads = n_heads
1289
+ self.n_layers = n_layers
1290
+ self.kernel_size = kernel_size
1291
+ self.p_dropout = p_dropout
1292
+ self.resblock = resblock
1293
+ self.resblock_kernel_sizes = resblock_kernel_sizes
1294
+ self.resblock_dilation_sizes = resblock_dilation_sizes
1295
+ self.upsample_rates = upsample_rates
1296
+ self.upsample_initial_channel = upsample_initial_channel
1297
+ self.upsample_kernel_sizes = upsample_kernel_sizes
1298
+ self.segment_size = segment_size
1299
+ self.n_speakers = n_speakers
1300
+ self.gin_channels = gin_channels
1301
+
1302
+ self.model_dim = 512
1303
+ self.use_sdp = use_sdp
1304
+ self.enc_p = TextEncoder(
1305
+ inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
1306
+ )
1307
+ # self.ref_enc = modules.MelStyleEncoder(spec_channels, style_vector_dim=gin_channels)###Rollback
1308
+ self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels) ###Rollback
1309
+ self.dec = Generator(
1310
+ inter_channels,
1311
+ resblock,
1312
+ resblock_kernel_sizes,
1313
+ resblock_dilation_sizes,
1314
+ upsample_rates,
1315
+ upsample_initial_channel,
1316
+ upsample_kernel_sizes,
1317
+ gin_channels=gin_channels,
1318
+ )
1319
+ self.enc_q = PosteriorEncoder(
1320
+ spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels
1321
+ )
1322
+ self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
1323
+
1324
+ ssl_dim = 768
1325
+ assert semantic_frame_rate in ["25hz", "50hz"]
1326
+ self.semantic_frame_rate = semantic_frame_rate
1327
+ if semantic_frame_rate == "25hz":
1328
+ self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 2, stride=2)
1329
+ else:
1330
+ self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 1, stride=1)
1331
+
1332
+ self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024)
1333
+ self.freeze_quantizer = freeze_quantizer
1334
+
1335
+ inter_channels2 = 512
1336
+ self.bridge = nn.Sequential(nn.Conv1d(inter_channels, inter_channels2, 1, stride=1), nn.LeakyReLU())
1337
+ self.wns1 = Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8, gin_channels=gin_channels)
1338
+ self.linear_mel = nn.Conv1d(inter_channels2, 100, 1, stride=1)
1339
+ self.cfm = CFM(
1340
+ 100,
1341
+ DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)),
1342
+ ) # text_dim is condition feature dim
1343
+
1344
+ def forward(self, ssl, y, mel, ssl_lengths, y_lengths, text, text_lengths, mel_lengths): # ssl_lengths no need now
1345
+ with autocast(enabled=False):
1346
+ y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype)
1347
+ ge = self.ref_enc(y[:, :704] * y_mask, y_mask)
1348
+ # ge = self.ref_enc(y * y_mask, y_mask)#change back, new spec setting is whole 24k
1349
+ # ge=None
1350
+ maybe_no_grad = torch.no_grad() if self.freeze_quantizer else contextlib.nullcontext()
1351
+ with maybe_no_grad:
1352
+ if self.freeze_quantizer:
1353
+ self.ssl_proj.eval()
1354
+ self.quantizer.eval()
1355
+ ssl = self.ssl_proj(ssl)
1356
+ quantized, codes, commit_loss, quantized_list = self.quantizer(ssl, layers=[0])
1357
+ quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT
1358
+ x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge)
1359
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=ge)
1360
+ z_p = self.flow(z, y_mask, g=ge)
1361
+ z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size)
1362
+ o = self.dec(z_slice, g=ge)
1363
+ fea = self.bridge(x)
1364
+ fea = F.interpolate(fea, scale_factor=1.875, mode="nearest") ##BCT
1365
+ fea, y_mask_ = self.wns1(fea, mel_lengths, ge)
1366
+ learned_mel = self.linear_mel(fea)
1367
+ B = ssl.shape[0]
1368
+ prompt_len_max = mel_lengths * 2 / 3
1369
+ prompt_len = (torch.rand([B], device=fea.device) * prompt_len_max).floor().to(dtype=torch.long) #
1370
+ minn = min(mel.shape[-1], fea.shape[-1])
1371
+ mel = mel[:, :, :minn]
1372
+ fea = fea[:, :, :minn]
1373
+ cfm_loss = self.cfm(mel, mel_lengths, prompt_len, fea, use_grad_ckpt=False)  # fea == cond, mel_lengths == target mel lengths; ge not needed
1374
+ return (
1375
+ commit_loss,
1376
+ cfm_loss,
1377
+ F.mse_loss(learned_mel, mel),
1378
+ o,
1379
+ ids_slice,
1380
+ y_mask,
1381
+ y_mask,
1382
+ (z, z_p, m_p, logs_p, m_q, logs_q),
1383
+ quantized,
1384
+ )
1385
+
1386
+ @torch.no_grad()
1387
+ def decode_encp(self, codes, text, refer, ge=None):
1388
+ # print(2333333,refer.shape)
1389
+ # ge=None
1390
+ if ge is None:
1391
+ refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device)
1392
+ refer_mask = torch.unsqueeze(commons.sequence_mask(refer_lengths, refer.size(2)), 1).to(refer.dtype)
1393
+ ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask)
1394
+ y_lengths = torch.LongTensor([int(codes.size(2) * 2)]).to(codes.device)
1395
+ y_lengths1 = torch.LongTensor([int(codes.size(2) * 2.5 * 1.5)]).to(codes.device)
1396
+ text_lengths = torch.LongTensor([text.size(-1)]).to(text.device)
1397
+
1398
+ quantized = self.quantizer.decode(codes)
1399
+ if self.semantic_frame_rate == "25hz":
1400
+ quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT
1401
+ x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge)
1402
+ fea = self.bridge(x)
1403
+ fea = F.interpolate(fea, scale_factor=1.875, mode="nearest") ##BCT
1404
+ #### more WN parameters to learn the mel
1405
+ fea, y_mask_ = self.wns1(fea, y_lengths1, ge)
1406
+ return fea, ge
1407
+
1408
+ def extract_latent(self, x) -> torch.Tensor:
1409
+ ssl = self.ssl_proj(x)
1410
+ quantized, codes, commit_loss, quantized_list = self.quantizer(ssl)
1411
+ return codes.transpose(0, 1)