jiaqili3
/

valle_demo

Model card Files Files and versions Community

HarryHe committed on May 12, 2024

Commit

f7c417a

1 Parent(s): 745a0a7

add files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

models/__init__.py +0 -0
models/valle_ar.py +265 -0
models/valle_nar.py +303 -0
modules/__init__.py +0 -0
modules/__pycache__/__init__.cpython-39.pyc +0 -0
modules/activation_functions/__init__.py +7 -0
modules/activation_functions/__pycache__/__init__.cpython-39.pyc +0 -0
modules/activation_functions/__pycache__/gated_activation_unit.cpython-39.pyc +0 -0
modules/activation_functions/__pycache__/snake.cpython-39.pyc +0 -0
modules/activation_functions/gated_activation_unit.py +61 -0
modules/activation_functions/snake.py +122 -0
modules/anti_aliasing/__init__.py +8 -0
modules/anti_aliasing/__pycache__/__init__.cpython-39.pyc +0 -0
modules/anti_aliasing/__pycache__/act.cpython-39.pyc +0 -0
modules/anti_aliasing/__pycache__/filter.cpython-39.pyc +0 -0
modules/anti_aliasing/__pycache__/resample.cpython-39.pyc +0 -0
modules/anti_aliasing/act.py +36 -0
modules/anti_aliasing/filter.py +99 -0
modules/anti_aliasing/resample.py +65 -0
modules/base/base_module.py +75 -0
modules/diffusion/__init__.py +7 -0
modules/diffusion/bidilconv/bidilated_conv.py +102 -0
modules/diffusion/bidilconv/residual_block.py +73 -0
modules/diffusion/karras/karras_diffusion.py +977 -0
modules/diffusion/karras/random_utils.py +177 -0
modules/diffusion/karras/sample.py +185 -0
modules/diffusion/unet/attention.py +241 -0
modules/diffusion/unet/basic.py +15 -0
modules/diffusion/unet/resblock.py +178 -0
modules/diffusion/unet/unet.py +310 -0
modules/distributions/__init__.py +0 -0
modules/distributions/distributions.py +107 -0
modules/duration_predictor/__init__.py +0 -0
modules/duration_predictor/standard_duration_predictor.py +53 -0
modules/duration_predictor/stochastic_duration_predictor.py +120 -0
modules/encoder/__init__.py +1 -0
modules/encoder/__pycache__/__init__.cpython-39.pyc +0 -0
modules/encoder/__pycache__/token_encoder.cpython-39.pyc +0 -0
modules/encoder/condition_encoder.py +251 -0
modules/encoder/conv_encoder.py +103 -0
modules/encoder/position_encoder.py +85 -0
modules/encoder/token_encoder.py +25 -0
modules/flow/modules.py +457 -0
modules/general/__init__.py +3 -0
modules/general/__pycache__/__init__.cpython-39.pyc +0 -0
modules/general/__pycache__/input_strategies.cpython-39.pyc +0 -0
modules/general/__pycache__/scaling.cpython-39.pyc +0 -0
modules/general/__pycache__/utils.cpython-39.pyc +0 -0
modules/general/input_strategies.py +130 -0
modules/general/scaling.py +1349 -0

models/__init__.py ADDED Viewed

File without changes

models/valle_ar.py ADDED Viewed

	@@ -0,0 +1,265 @@

+# python -m models.tts.valle_gpt.valle_ar
+from transformers import LlamaConfig, LlamaForCausalLM, LlamaModel
+import torch
+import torch.nn.functional as F
+import numpy as np
+import os
+import torch.nn as nn
+class ValleAR(nn.Module):
+    def __init__(
+        self,
+        phone_vocab_size=256,
+        target_vocab_size=1024,
+        hidden_size=1024,
+        intermediate_size=4096,
+        num_hidden_layers=12,
+        num_attention_heads=16,
+        pad_token_id=1281,
+        bos_target_id=1282,
+        eos_target_id=1283,
+        bos_phone_id=1284,
+        eos_phone_id=1285,
+        use_input_embeds=False,
+        emb_dim=256,
+    ):
+        super(ValleAR, self).__init__()
+        self.config = LlamaConfig(
+            vocab_size=phone_vocab_size + target_vocab_size + 10,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            num_hidden_layers=num_hidden_layers,
+            num_attention_heads=num_attention_heads,
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_target_id,
+            eos_token_id=eos_target_id,
+        )
+        self.phone_vocab_size = phone_vocab_size
+        self.target_vocab_size = target_vocab_size
+        self.pad_token_id = pad_token_id
+        self.bos_target_id = bos_target_id
+        self.eos_target_id = eos_target_id
+        self.bos_phone_id = bos_phone_id
+        self.eos_phone_id = eos_phone_id
+        self.model = LlamaForCausalLM(self.config)
+        self.use_input_embeds = use_input_embeds
+        # no input embedding is used to provide speaker information
+        if self.use_input_embeds:
+            self.emb_linear = nn.Linear(emb_dim, hidden_size)
+            self.emb_linear.weight.data.normal_(mean=0.0, std=0.01)
+            self.emb_linear.bias.data.zero_()
+    def forward(
+        self, phone_ids, phone_mask, target_ids, target_mask, input_embeds=None
+    ):
+        if input_embeds is not None:
+            input_embeds = self.emb_linear(input_embeds)
+        phone_ids, phone_mask, phone_label = self.add_phone_eos_bos_label(
+            phone_ids,
+            phone_mask,
+            self.eos_phone_id,
+            self.bos_phone_id,
+            self.pad_token_id,
+        )
+        target_ids, target_mask, target_label = self.add_target_eos_bos_label(
+            target_ids,
+            target_mask,
+            self.eos_target_id,
+            self.bos_target_id,
+            self.pad_token_id,
+        )
+        input_token_ids = torch.cat([phone_ids, target_ids], dim=-1)
+        attention_mask = torch.cat([phone_mask, target_mask], dim=-1)
+        if input_embeds is not None:
+            raise NotImplementedError
+            attention_mask = torch.cat(
+                [
+                    torch.ones(
+                        (input_embeds.shape[0], input_embeds.shape[1]),
+                        dtype=attention_mask.dtype,
+                        device=attention_mask.device,
+                    ),
+                    attention_mask,
+                ],
+                dim=-1,
+            )
+        labels = torch.cat([phone_label, target_label], dim=-1)
+        if input_embeds is not None:
+            raise NotImplementedError
+            labels = torch.cat(
+                [
+                    -100
+                    * torch.ones(
+                        (input_embeds.shape[0], input_embeds.shape[1]),
+                        dtype=labels.dtype,
+                        device=labels.device,
+                    ),
+                    labels,
+                ],
+                dim=-1,
+            )
+        if input_embeds is not None:
+            raise NotImplementedError
+            inputs_embeds = torch.cat(
+                [input_embeds, self.model.model.embed_tokens(input_token_ids)], dim=1
+            )
+            out = self.model(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                labels=labels,
+                return_dict=True,
+            )
+            return out
+        out = self.model(
+            input_token_ids,
+            attention_mask=attention_mask,
+            labels=labels,
+            return_dict=True,
+        )
+        return out
+    def add_phone_eos_bos_label(
+        self, phone_ids, phone_mask, phone_eos_id, phone_bos_id, pad_token_id
+    ):
+        # phone_ids: [B, T]
+        # phone_mask: [B, T]
+        phone_ids = phone_ids + self.target_vocab_size * phone_mask
+        phone_ids = phone_ids * phone_mask
+        phone_ids = F.pad(phone_ids, (0, 1), value=0) + phone_eos_id * F.pad(
+            1 - phone_mask, (0, 1), value=1
+        ) # make pad token eos token, add eos token at the end
+        phone_mask = F.pad(phone_mask, (1, 0), value=1) # add eos mask
+        phone_ids = phone_ids * phone_mask + pad_token_id * (1 - phone_mask) # restore pad token ids
+        phone_ids = F.pad(phone_ids, (1, 0), value=phone_bos_id) # add bos token
+        phone_mask = F.pad(phone_mask, (1, 0), value=1) # add bos mask
+        phone_label = -100 * torch.ones_like(phone_ids) # loss for entire phone is not computed (passed to llama)
+        return phone_ids, phone_mask, phone_label
+    def add_target_eos_bos_label(
+        self, target_ids, target_mask, target_eos_id, target_bos_id, pad_token_id
+    ):
+        # target_ids: [B, T]
+        # target_mask: [B, T]
+        target_ids = target_ids * target_mask
+        target_ids = F.pad(target_ids, (0, 1), value=0) + target_eos_id * F.pad(
+            1 - target_mask, (0, 1), value=1
+        )
+        target_mask = F.pad(target_mask, (1, 0), value=1)
+        target_ids = target_ids * target_mask + pad_token_id * (1 - target_mask)
+        target_ids = F.pad(target_ids, (1, 0), value=target_bos_id)
+        target_mask = F.pad(target_mask, (1, 0), value=1)
+        target_label = target_ids * target_mask + (-100) * (1 - target_mask) # loss for target is computed on unmasked tokens
+        return target_ids, target_mask, target_label
+    def sample_hf(
+        self,
+        phone_ids, # the phones of prompt and target should be concatenated together
+        prompt_ids,
+        inputs_embeds=None,
+        max_length=2000,
+        temperature=1.0,
+        top_k=100,
+        top_p=0.9,
+        repeat_penalty=1.0,
+    ):
+        if inputs_embeds is not None:
+            inputs_embeds = self.emb_linear(inputs_embeds)
+        phone_mask = torch.ones_like(phone_ids)
+        prompt_mask = torch.ones_like(prompt_ids)
+        phone_ids, _, _ = self.add_phone_eos_bos_label(
+            phone_ids,
+            phone_mask,
+            self.eos_phone_id,
+            self.bos_phone_id,
+            self.pad_token_id,
+        )
+        prompt_ids, _, _ = self.add_target_eos_bos_label(
+            prompt_ids,
+            prompt_mask,
+            self.eos_target_id,
+            self.bos_target_id,
+            self.pad_token_id,
+        )
+        prompt_ids = prompt_ids[:, :-1] # remove end token. Make it continue mode
+        input_token_ids = torch.cat([phone_ids, prompt_ids], dim=-1)
+        if inputs_embeds is not None:
+            raise NotImplementedError
+            inputs_embeds = torch.cat(
+                [inputs_embeds, self.model.model.embed_tokens(input_token_ids)], dim=1
+            )
+            generated_ids = self.model.generate(
+                inputs_embeds=inputs_embeds,
+                do_sample=True,
+                max_length=max_length,
+                pad_token_id=self.pad_token_id,
+                eos_token_id=self.eos_target_id,
+                temperature=temperature,
+                top_k=top_k,
+                top_p=top_p,
+                repetition_penalty=repeat_penalty,
+            )
+            gen_tokens = generated_ids[:, :-1]
+            return gen_tokens
+        input_length = input_token_ids.shape[1]
+        generated_ids = self.model.generate(
+            input_token_ids,
+            do_sample=True,
+            max_length=max_length,
+            pad_token_id=self.pad_token_id,
+            eos_token_id=self.eos_target_id,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            repetition_penalty=repeat_penalty,
+        )
+        gen_tokens = generated_ids[:, input_length:-1]
+        return gen_tokens
+def test():
+    model = ValleAR()
+    phone_ids = torch.LongTensor([[1,2,3,4,5,0],
+                                  [1,2,3,4,5,6]])
+    phone_mask = torch.LongTensor([[1,1,1,0,0,0],
+                                   [1,1,1,0,0,0]])
+    target_ids = torch.LongTensor([765, 234, 123, 234, 123,599]).expand(2,-1)
+    target_mask = torch.LongTensor([1,1,1,1,0,0]).expand(2,-1)
+    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
+    for i in range(15):
+        optimizer.zero_grad()
+        out = model(
+            phone_ids=phone_ids,
+            phone_mask=phone_mask,
+            target_ids=target_ids,
+            target_mask=target_mask,
+        )
+        loss = out.loss
+        loss.backward()
+        optimizer.step()
+        print(f"iter={i}, {loss}.")
+    phone_ids = torch.LongTensor([1,2,3]).reshape(1,-1)
+    target_ids = torch.LongTensor([765, 234]).reshape(1,-1)
+    sampled = model.sample_hf(phone_ids, target_ids)
+    breakpoint()
+if __name__ == '__main__':
+    test()

models/valle_nar.py ADDED Viewed

	@@ -0,0 +1,303 @@

+from transformers import LlamaConfig, LlamaForCausalLM, LlamaModel
+import torch
+import torch.nn.functional as F
+import numpy as np
+import os
+import torch.nn as nn
+from typing import List, Optional, Tuple, Union
+from transformers.models.llama.modeling_llama import LlamaDecoderLayer
+from transformers.models.bert.modeling_bert import BertEncoder
+from models.transformer.position_embedding import SinePositionalEmbedding
+# Number of leading time steps of the acoustic tokens that ValleNAR slices off and treats as the prompt.
+NUM_PROMPT_TOKENS=225
+def initialize(module):
+    if isinstance(module, (nn.Linear, nn.Embedding, nn.modules.linear.NonDynamicallyQuantizableLinear)):
+        module.weight.data.normal_(mean=0.0, std=0.02)
+    if isinstance(module, nn.Linear) and module.bias is not None:
+        module.bias.data.zero_()
+from transformers.models.llama.modeling_llama import CrossEntropyLoss
+from easydict import EasyDict as edict
+from modules.encoder import TokenEmbedding
+from modules.norms import AdaptiveLayerNorm, LayerNorm
+class ValleNAR(nn.Module):
+    def __init__(
+        self,
+        phone_vocab_size=256,
+        target_vocab_size=1024,
+        hidden_size=1024,
+        intermediate_size=4096,
+        num_hidden_layers=12,
+        num_attention_heads=16,
+        pad_token_id=1024+256,
+        bos_target_id=1282,
+        eos_target_id=1283,
+        bos_phone_id=1284,
+        eos_phone_id=1285,
+        bos_prompt_id=1286,
+        eos_prompt_id=1287,
+        use_input_embeds=False,
+        emb_dim=256,
+        num_quantizers=8,
+    ):
+        super(ValleNAR, self).__init__()
+        self.phone_vocab_size = phone_vocab_size
+        self.target_vocab_size = target_vocab_size
+        self.pad_token_id = pad_token_id
+        self.bos_target_id = bos_target_id
+        self.eos_target_id = eos_target_id
+        self.bos_phone_id = bos_phone_id
+        self.eos_phone_id = eos_phone_id
+        self.bos_prompt_id = bos_prompt_id
+        self.eos_prompt_id = eos_prompt_id
+        self.phone_embedder = TokenEmbedding(hidden_size, phone_vocab_size)
+        self.audio_embeddings = nn.ModuleList(
+            [
+                TokenEmbedding(hidden_size, target_vocab_size+1)
+            ] + [
+                TokenEmbedding(hidden_size, target_vocab_size)
+                for i in range(num_quantizers-1)
+            ]
+        )
+        from modules.transformer.transformer import TransformerEncoder, TransformerEncoderLayer
+        self.decoder = TransformerEncoder(
+            TransformerEncoderLayer(
+                hidden_size,
+                num_attention_heads,
+                dim_feedforward=int(4*hidden_size),
+                dropout=0.1,
+                batch_first=True,
+                norm_first=True,
+                adaptive_layer_norm=True,
+                activation=F.silu,
+            ),
+            num_layers=num_hidden_layers,
+            norm=(
+                AdaptiveLayerNorm(
+                    hidden_size, norm=nn.LayerNorm(hidden_size)
+                )
+            )
+        )
+        self.predict_layers = nn.ModuleList(
+            [
+                nn.Linear(hidden_size, target_vocab_size, bias=False)
+                for i in range(num_quantizers-1)
+            ]
+        )
+        self.stage_embedding = nn.ModuleList(
+            [TokenEmbedding(hidden_size, 1) for i in range(num_quantizers)]
+        )
+        self.text_position = SinePositionalEmbedding(
+            hidden_size,
+            dropout=0.1,
+            scale=False,
+            alpha=True,
+        )
+        self.audio_position = SinePositionalEmbedding(
+            hidden_size,
+            dropout=0.1,
+            scale=False,
+            alpha=True,
+        )
+    def _mask_out_acoustic_tokens(self, target_ids, target_quantization_layer, start_time=NUM_PROMPT_TOKENS+1):
+            '''Mask out target_ids after the target_quantization_layer, except for the first 240 tokens.
+            target_ids: [8, B, T], which is padded and added with bos and eos tokens
+            target_quantization_layer: int
+            returns: [8, B, T] masked input_token_ids
+            '''
+            mask = torch.ones_like(target_ids, dtype=torch.long, device=target_ids.device)
+            mask[target_quantization_layer:, :, start_time:] = 0
+            input_token_ids = target_ids * mask
+            input_token_ids += (1-mask)*self.mask_target_id
+            return input_token_ids
+    def forward(
+        self, phone_ids, phone_mask, target_ids, target_mask, input_embeds=None,
+        target_quantization_layer=None,
+    ):
+        '''
+        phone_ids: [B, T]
+        phone_mask: [B, T]
+        target_ids: [8,B,T]
+        '''
+        target_ids = target_ids * target_mask
+        phone_label = torch.ones_like(phone_ids, dtype=torch.long) * -100
+        # get phone embedding
+        phone_embedding = self.phone_embedder(phone_ids) # [B, T, H]
+        phone_embedding = self.text_position(phone_embedding)
+        # randomly select a target to predict
+        # total quant layer is 0 to 7
+        if target_quantization_layer is None:
+            target_quantization_layer = np.random.randint(1, 8)
+        # extract 8-level prompts
+        prompt_tokens = target_ids[:, :, :NUM_PROMPT_TOKENS]
+        prompt_mask = torch.ones_like(prompt_tokens[0])
+        # prompt_label = -100 * prompt_mask
+        prompt_label = prompt_tokens[target_quantization_layer]
+        # get prompt embedding
+        prompt_embedding = self.audio_embeddings[0](prompt_tokens[0]) # [B, T, H]
+        for i in range(1, 8):
+            prompt_embedding += self.audio_embeddings[i](prompt_tokens[i])
+        # get y embedding
+        y_mask = target_mask[..., NUM_PROMPT_TOKENS:]
+        y_tokens = target_ids[:target_quantization_layer, :, NUM_PROMPT_TOKENS:] * y_mask
+        y_label = target_ids[target_quantization_layer, :, NUM_PROMPT_TOKENS:] * y_mask + -100*(1-y_mask)
+        y_embedding = self.audio_embeddings[0](y_tokens[0])
+        for i in range(1, target_quantization_layer):
+            y_embedding += self.audio_embeddings[i](y_tokens[i])
+        # concat y embedding and prmpt embedding
+        y_embedding = torch.concat([prompt_embedding, y_embedding], dim=1)
+        y_embedding = self.audio_position(y_embedding)
+        xy_pos = torch.concat([phone_embedding, y_embedding], dim=1)
+        xy_padding_mask = ~torch.concat([phone_mask, prompt_mask, y_mask], dim=1).to(torch.bool)
+        xy_dec, _ = self.decoder(
+            (xy_pos, self.stage_embedding[target_quantization_layer-1].weight),
+            src_key_padding_mask=xy_padding_mask,
+        )
+        target_label = torch.concat([phone_label, prompt_label, y_label], dim=1)
+        logits = self.predict_layers[target_quantization_layer-1](xy_dec).permute(0, 2, 1)
+        loss = CrossEntropyLoss()(logits, target_label)
+        out = edict(
+            loss=loss,
+            logits=logits,
+        )
+        return out
+        # # prompt eos embedding
+        # prompt_eos_embedding = self.phone_embedder(torch.tensor(self.eos_prompt_id-self.target_vocab_size, device=phone_ids.device).reshape(1).expand(phone_ids.shape[0], -1)) # [B, 1, H]
+        # # input embeddings
+        # input_embeddings = torch.cat([phone_embedding, prompt_embedding, prompt_eos_embedding, target_embedding], dim=1)
+        # input_mask = torch.cat([phone_mask, prompt_mask, torch.ones((phone_mask.shape[0], 1), dtype=torch.long, device=phone_mask.device), target_mask], dim=1) # [B, T]
+        # prediction_target = torch.cat([phone_label, prompt_label, -100*torch.ones((phone_mask.shape[0], 1), dtype=torch.long, device=phone_mask.device), target_labels], dim=1) # [B, T]
+        # out = self.model(
+        #     cond=torch.tensor(target_quantization_layer, device=prediction_target.device, dtype=torch.long),
+        #     input_ids=input_embeddings,
+        #     prediction_target=prediction_target,
+        #     attention_mask=input_mask,
+        #     return_dict=True,
+        # )
+        # return out
+    def add_phone_eos_bos_label(
+        self, phone_ids, phone_mask, phone_eos_id, phone_bos_id, pad_token_id
+    ):
+        # phone_ids: [B, T]
+        # phone_mask: [B, T]
+        phone_ids = phone_ids + self.target_vocab_size * phone_mask
+        phone_ids = phone_ids * phone_mask
+        phone_ids = F.pad(phone_ids, (0, 1), value=0) + phone_eos_id * F.pad(
+            1 - phone_mask, (0, 1), value=1
+        ) # make pad token eos token, add eos token at the end
+        phone_mask = F.pad(phone_mask, (1, 0), value=1) # add eos mask
+        phone_ids = phone_ids * phone_mask + pad_token_id * (1 - phone_mask) # restore pad token ids
+        phone_ids = F.pad(phone_ids, (1, 0), value=phone_bos_id) # add bos token
+        phone_mask = F.pad(phone_mask, (1, 0), value=1) # add bos mask
+        phone_label = -100 * torch.ones_like(phone_ids) # loss for entire phone is not computed (passed to llama)
+        return phone_ids, phone_mask, phone_label
+    @torch.no_grad()
+    def sample_hf(
+        self,
+        phone_ids, # [B, T]
+        prompt_ids, # [8, B, T]
+        first_stage_ids, # [B, T]
+    ):
+        '''
+        phone_ids: [B, T]
+        prompt_ids: [8, B, T]
+        first_stage_ids: [B, T] result from first quant layer. Should be continuation of prompt_ids
+        '''
+        phone_mask = torch.ones_like(phone_ids, dtype=torch.long)
+        assert prompt_ids.shape[-1] >= NUM_PROMPT_TOKENS, "prompt_ids should have at least 240 tokens"
+        prompt_ids = prompt_ids[:, :, :NUM_PROMPT_TOKENS]
+        target_ids = torch.cat([prompt_ids, first_stage_ids.expand(prompt_ids.shape[0],-1,-1)], dim=-1)
+        target_mask = torch.ones_like(target_ids[0], dtype=torch.long)
+        gen_len = first_stage_ids.shape[-1]
+        for qnt_level in range(1, 8):
+            out = self.forward(
+                phone_ids=phone_ids,
+                phone_mask=phone_mask,
+                target_ids=target_ids,
+                target_mask=target_mask,
+                target_quantization_layer=qnt_level,
+            )
+            logits = out.logits
+            gen_tokens = torch.argmax(logits, dim=1)[0, -gen_len:] # [T], generated tokens in this level
+            # overwrite the target_ids with the generated tokens
+            target_ids[qnt_level, :, -gen_len:] = gen_tokens
+        return target_ids[:, :, -gen_len:]
+def test():
+    model = ValleNAR().cuda()
+    model.apply(initialize)
+    phone_ids = torch.LongTensor([1,2,3,4,5]).reshape(1,-1).cuda()
+    phone_mask = torch.LongTensor([1,1,1,1,1]).reshape(1,-1).cuda()
+    target_ids = torch.randint(high=1024, size=(8,1,250), dtype=torch.long).cuda()
+    target_mask = torch.ones(1,250, dtype=torch.long).cuda()
+    optimizer = torch.optim.Adam(model.parameters(), lr=4e-4)
+    for i in range(200):
+        optimizer.zero_grad()
+        out = model(
+            phone_ids=phone_ids,
+            phone_mask=phone_mask,
+            target_ids=target_ids,
+            target_mask=target_mask,
+            target_quantization_layer=1+i%7,
+        )
+        loss = out.loss
+        loss.backward()
+        optimizer.step()
+        print(f"iter={i}, {loss}.")
+    target_ids_short = target_ids[:, :, :240]
+    sampled = model.sample_hf(phone_ids, prompt_ids=target_ids_short, first_stage_ids=target_ids[0, :, 240:])
+    breakpoint()
+    print(target_ids[:,:,-10:])
+    print(sampled)
+    print((sampled == target_ids[:,:,-10:]).all())
+if __name__ == '__main__':
+    test()

modules/__init__.py ADDED Viewed

File without changes

modules/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (146 Bytes). View file

modules/activation_functions/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from .gated_activation_unit import GaU
+from .snake import Snake, SnakeBeta

modules/activation_functions/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (271 Bytes). View file

modules/activation_functions/__pycache__/gated_activation_unit.cpython-39.pyc ADDED Viewed

Binary file (1.75 kB). View file

modules/activation_functions/__pycache__/snake.cpython-39.pyc ADDED Viewed

Binary file (3.69 kB). View file

modules/activation_functions/gated_activation_unit.py ADDED Viewed

	@@ -0,0 +1,61 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+import torch.nn as nn
+from modules.general.utils import Conv1d
+class GaU(nn.Module):
+    r"""Gated Activation Unit (GaU) proposed in `Gated Activation Units for Neural
+    Networks <https://arxiv.org/pdf/1606.05328.pdf>`_.
+    Args:
+        channels: number of input channels.
+        kernel_size: kernel size of the convolution.
+        dilation: dilation rate of the convolution.
+        d_context: dimension of context tensor, None if don't use context.
+    """
+    def __init__(
+        self,
+        channels: int,
+        kernel_size: int = 3,
+        dilation: int = 1,
+        d_context: int = None,
+    ):
+        super().__init__()
+        self.context = d_context
+        self.conv = Conv1d(
+            channels,
+            channels * 2,
+            kernel_size,
+            dilation=dilation,
+            padding=dilation * (kernel_size - 1) // 2,
+        )
+        if self.context:
+            self.context_proj = Conv1d(d_context, channels * 2, 1)
+    def forward(self, x: torch.Tensor, context: torch.Tensor = None):
+        r"""Calculate forward propagation.
+        Args:
+            x: input tensor with shape [B, C, T].
+            context: context tensor with shape [B, ``d_context``, T], default to None.
+        """
+        h = self.conv(x)
+        if self.context:
+            h = h + self.context_proj(context)
+        h1, h2 = h.chunk(2, 1)
+        h = torch.tanh(h1) * torch.sigmoid(h2)
+        return h

modules/activation_functions/snake.py ADDED Viewed

	@@ -0,0 +1,122 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+from torch import nn, pow, sin
+from torch.nn import Parameter
+class Snake(nn.Module):
+    r"""Implementation of a sine-based periodic activation function.
+    Alpha is initialized to 1 by default, higher values means higher frequency.
+    It will be trained along with the rest of your model.
+    Args:
+        in_features: shape of the input
+        alpha: trainable parameter
+    Shape:
+        - Input: (B, C, T)
+        - Output: (B, C, T), same shape as the input
+    References:
+        This activation function is from this paper by Liu Ziyin, Tilman Hartwig,
+        Masahito Ueda: https://arxiv.org/abs/2006.08195
+    Examples:
+        >>> a1 = Snake(256)
+        >>> x = torch.randn(256)
+        >>> x = a1(x)
+    """
+    def __init__(
+        self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False
+    ):
+        super(Snake, self).__init__()
+        self.in_features = in_features
+        # initialize alpha
+        self.alpha_logscale = alpha_logscale
+        if self.alpha_logscale:  # log scale alphas initialized to zeros
+            self.alpha = Parameter(torch.zeros(in_features) * alpha)
+        else:  # linear scale alphas initialized to ones
+            self.alpha = Parameter(torch.ones(in_features) * alpha)
+        self.alpha.requires_grad = alpha_trainable
+        self.no_div_by_zero = 0.000000001
+    def forward(self, x):
+        r"""Forward pass of the function. Applies the function to the input elementwise.
+        Snake ∶= x + 1/a * sin^2 (ax)
+        """
+        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # line up with x to [B, C, T]
+        if self.alpha_logscale:
+            alpha = torch.exp(alpha)
+        x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
+        return x
+class SnakeBeta(nn.Module):
+    r"""A modified Snake function which uses separate parameters for the magnitude
+    of the periodic components. Alpha is initialized to 1 by default,
+    higher values means higher frequency. Beta is initialized to 1 by default,
+    higher values means higher magnitude. Both will be trained along with the
+    rest of your model.
+    Args:
+        in_features: shape of the input
+        alpha: trainable parameter that controls frequency
+        beta: trainable parameter that controls magnitude
+    Shape:
+        - Input: (B, C, T)
+        - Output: (B, C, T), same shape as the input
+    References:
+        This activation function is a modified version based on this paper by Liu Ziyin,
+        Tilman Hartwig, Masahito Ueda: https://arxiv.org/abs/2006.08195
+    Examples:
+        >>> a1 = SnakeBeta(256)
+        >>> x = torch.randn(256)
+        >>> x = a1(x)
+    """
+    def __init__(
+        self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False
+    ):
+        super(SnakeBeta, self).__init__()
+        self.in_features = in_features
+        # initialize alpha
+        self.alpha_logscale = alpha_logscale
+        if self.alpha_logscale:  # log scale alphas initialized to zeros
+            self.alpha = Parameter(torch.zeros(in_features) * alpha)
+            self.beta = Parameter(torch.zeros(in_features) * alpha)
+        else:  # linear scale alphas initialized to ones
+            self.alpha = Parameter(torch.ones(in_features) * alpha)
+            self.beta = Parameter(torch.ones(in_features) * alpha)
+        self.alpha.requires_grad = alpha_trainable
+        self.beta.requires_grad = alpha_trainable
+        self.no_div_by_zero = 0.000000001
+    def forward(self, x):
+        r"""Forward pass of the function. Applies the function to the input elementwise.
+        SnakeBeta ∶= x + 1/b * sin^2 (xa)
+        """
+        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # line up with x to [B, C, T]
+        beta = self.beta.unsqueeze(0).unsqueeze(-1)
+        if self.alpha_logscale:
+            alpha = torch.exp(alpha)
+            beta = torch.exp(beta)
+        x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
+        return x

modules/anti_aliasing/__init__.py ADDED Viewed

	@@ -0,0 +1,8 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from .act import *
+from .filter import *
+from .resample import *

modules/anti_aliasing/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (218 Bytes). View file

modules/anti_aliasing/__pycache__/act.cpython-39.pyc ADDED Viewed

Binary file (1 kB). View file

modules/anti_aliasing/__pycache__/filter.cpython-39.pyc ADDED Viewed

Binary file (2.6 kB). View file

modules/anti_aliasing/__pycache__/resample.cpython-39.pyc ADDED Viewed

Binary file (1.91 kB). View file

modules/anti_aliasing/act.py ADDED Viewed

	@@ -0,0 +1,36 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import torch.nn as nn
+from .resample import *
+# This code is adopted from BigVGAN under the MIT License
+# https://github.com/NVIDIA/BigVGAN
+class Activation1d(nn.Module):
+    def __init__(
+        self,
+        activation,
+        up_ratio: int = 2,
+        down_ratio: int = 2,
+        up_kernel_size: int = 12,
+        down_kernel_size: int = 12,
+    ):
+        super().__init__()
+        self.up_ratio = up_ratio
+        self.down_ratio = down_ratio
+        self.act = activation
+        self.upsample = UpSample1d(up_ratio, up_kernel_size)
+        self.downsample = DownSample1d(down_ratio, down_kernel_size)
+    # x: [B,C,T]
+    def forward(self, x):
+        x = self.upsample(x)
+        x = self.act(x)
+        x = self.downsample(x)
+        return x

modules/anti_aliasing/filter.py ADDED Viewed

	@@ -0,0 +1,99 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+if "sinc" in dir(torch):
+    sinc = torch.sinc
+else:
+    # This code is adopted from adefossez's julius.core.sinc under the MIT License
+    # https://adefossez.github.io/julius/julius/core.html
+    def sinc(x: torch.Tensor):
+        """
+        Implementation of sinc, i.e. sin(pi * x) / (pi * x)
+        __Warning__: Different to julius.sinc, the input is multiplied by `pi`!
+        """
+        return torch.where(
+            x == 0,
+            torch.tensor(1.0, device=x.device, dtype=x.dtype),
+            torch.sin(math.pi * x) / math.pi / x,
+        )
+# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
+# https://adefossez.github.io/julius/julius/lowpass.html
+def kaiser_sinc_filter1d(
+    cutoff, half_width, kernel_size
+):  # return filter [1,1,kernel_size]
+    even = kernel_size % 2 == 0
+    half_size = kernel_size // 2
+    # For kaiser window
+    delta_f = 4 * half_width
+    A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
+    if A > 50.0:
+        beta = 0.1102 * (A - 8.7)
+    elif A >= 21.0:
+        beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0)
+    else:
+        beta = 0.0
+    window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
+    # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
+    if even:
+        time = torch.arange(-half_size, half_size) + 0.5
+    else:
+        time = torch.arange(kernel_size) - half_size
+    if cutoff == 0:
+        filter_ = torch.zeros_like(time)
+    else:
+        filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
+        # Normalize filter to have sum = 1, otherwise we will have a small leakage
+        # of the constant component in the input signal.
+        filter_ /= filter_.sum()
+        filter = filter_.view(1, 1, kernel_size)
+    return filter
+class LowPassFilter1d(nn.Module):
+    def __init__(
+        self,
+        cutoff=0.5,
+        half_width=0.6,
+        stride: int = 1,
+        padding: bool = True,
+        padding_mode: str = "replicate",
+        kernel_size: int = 12,
+    ):
+        # kernel_size should be even number for stylegan3 setup,
+        # in this implementation, odd number is also possible.
+        super().__init__()
+        if cutoff < -0.0:
+            raise ValueError("Minimum cutoff must be larger than zero.")
+        if cutoff > 0.5:
+            raise ValueError("A cutoff above 0.5 does not make sense.")
+        self.kernel_size = kernel_size
+        self.even = kernel_size % 2 == 0
+        self.pad_left = kernel_size // 2 - int(self.even)
+        self.pad_right = kernel_size // 2
+        self.stride = stride
+        self.padding = padding
+        self.padding_mode = padding_mode
+        filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
+        self.register_buffer("filter", filter)
+    # input [B, C, T]
+    def forward(self, x):
+        _, C, _ = x.shape
+        if self.padding:
+            x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)
+        out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
+        return out

modules/anti_aliasing/resample.py ADDED Viewed

	@@ -0,0 +1,65 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#################### Anti-aliasing ####################
+import torch.nn as nn
+from torch.nn import functional as F
+from .filter import *
+# This code is adopted from BigVGAN under the MIT License
+# https://github.com/NVIDIA/BigVGAN
+class UpSample1d(nn.Module):
+    def __init__(self, ratio=2, kernel_size=None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = (
+            int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        )
+        self.stride = ratio
+        self.pad = self.kernel_size // ratio - 1
+        self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
+        self.pad_right = (
+            self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
+        )
+        filter = kaiser_sinc_filter1d(
+            cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size
+        )
+        self.register_buffer("filter", filter)
+    # x: [B, C, T]
+    def forward(self, x):
+        _, C, _ = x.shape
+        x = F.pad(x, (self.pad, self.pad), mode="replicate")
+        x = self.ratio * F.conv_transpose1d(
+            x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C
+        )
+        x = x[..., self.pad_left : -self.pad_right]
+        return x
+class DownSample1d(nn.Module):
+    def __init__(self, ratio=2, kernel_size=None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = (
+            int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        )
+        self.lowpass = LowPassFilter1d(
+            cutoff=0.5 / ratio,
+            half_width=0.6 / ratio,
+            stride=ratio,
+            kernel_size=self.kernel_size,
+        )
+    def forward(self, x):
+        xx = self.lowpass(x)
+        return xx

modules/base/base_module.py ADDED Viewed

	@@ -0,0 +1,75 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+from torch import nn
+from torch.nn import functional as F
+class LayerNorm(nn.Module):
+    def __init__(self, channels, eps=1e-5):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+    def forward(self, x):
+        x = x.transpose(1, -1)
+        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+        return x.transpose(1, -1)
+class ConvReluNorm(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        hidden_channels,
+        out_channels,
+        kernel_size,
+        n_layers,
+        p_dropout,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.hidden_channels = hidden_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+        assert n_layers > 1, "Number of layers should be larger than 0."
+        self.conv_layers = nn.ModuleList()
+        self.norm_layers = nn.ModuleList()
+        self.conv_layers.append(
+            nn.Conv1d(
+                in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
+            )
+        )
+        self.norm_layers.append(LayerNorm(hidden_channels))
+        self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
+        for _ in range(n_layers - 1):
+            self.conv_layers.append(
+                nn.Conv1d(
+                    hidden_channels,
+                    hidden_channels,
+                    kernel_size,
+                    padding=kernel_size // 2,
+                )
+            )
+            self.norm_layers.append(LayerNorm(hidden_channels))
+        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+        self.proj.weight.data.zero_()
+        self.proj.bias.data.zero_()
+    def forward(self, x, x_mask):
+        x_org = x
+        for i in range(self.n_layers):
+            x = self.conv_layers[i](x * x_mask)
+            x = self.norm_layers[i](x)
+            x = self.relu_drop(x)
+        x = x_org + self.proj(x)
+        return x * x_mask

modules/diffusion/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from .bidilconv.bidilated_conv import BiDilConv
+from .unet.unet import UNet

modules/diffusion/bidilconv/bidilated_conv.py ADDED Viewed

	@@ -0,0 +1,102 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+import torch.nn as nn
+from modules.general.utils import Conv1d, zero_module
+from .residual_block import ResidualBlock
+class BiDilConv(nn.Module):
+    r"""Dilated CNN architecture with residual connections, default diffusion decoder.
+    Args:
+        input_channel: The number of input channels.
+        base_channel: The number of base channels.
+        n_res_block: The number of residual blocks.
+        conv_kernel_size: The kernel size of convolutional layers.
+        dilation_cycle_length: The cycle length of dilation.
+        conditioner_size: The size of conditioner.
+    """
+    def __init__(
+        self,
+        input_channel,
+        base_channel,
+        n_res_block,
+        conv_kernel_size,
+        dilation_cycle_length,
+        conditioner_size,
+        output_channel: int = -1,
+    ):
+        super().__init__()
+        self.input_channel = input_channel
+        self.base_channel = base_channel
+        self.n_res_block = n_res_block
+        self.conv_kernel_size = conv_kernel_size
+        self.dilation_cycle_length = dilation_cycle_length
+        self.conditioner_size = conditioner_size
+        self.output_channel = output_channel if output_channel > 0 else input_channel
+        self.input = nn.Sequential(
+            Conv1d(
+                input_channel,
+                base_channel,
+                1,
+            ),
+            nn.ReLU(),
+        )
+        self.residual_blocks = nn.ModuleList(
+            [
+                ResidualBlock(
+                    channels=base_channel,
+                    kernel_size=conv_kernel_size,
+                    dilation=2 ** (i % dilation_cycle_length),
+                    d_context=conditioner_size,
+                )
+                for i in range(n_res_block)
+            ]
+        )
+        self.out_proj = nn.Sequential(
+            Conv1d(
+                base_channel,
+                base_channel,
+                1,
+            ),
+            nn.ReLU(),
+            zero_module(
+                Conv1d(
+                    base_channel,
+                    self.output_channel,
+                    1,
+                ),
+            ),
+        )
+    def forward(self, x, y, context=None):
+        """
+        Args:
+            x: Noisy mel-spectrogram [B x ``n_mel`` x L]
+            y: FILM embeddings with the shape of (B, ``base_channel``)
+            context: Context with the shape of [B x ``d_context`` x L], default to None.
+        """
+        h = self.input(x)
+        skip = None
+        for i in range(self.n_res_block):
+            h, skip_connection = self.residual_blocks[i](h, y, context)
+            skip = skip_connection if skip is None else skip_connection + skip
+        out = skip / math.sqrt(self.n_res_block)
+        out = self.out_proj(out)
+        return out

modules/diffusion/bidilconv/residual_block.py ADDED Viewed

	@@ -0,0 +1,73 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+import torch
+import torch.nn as nn
+from modules.activation_functions import GaU
+from modules.general.utils import Conv1d
+class ResidualBlock(nn.Module):
+    r"""Residual block with dilated convolution, main portion of ``BiDilConv``.
+    Args:
+        channels: The number of channels of input and output.
+        kernel_size: The kernel size of dilated convolution.
+        dilation: The dilation rate of dilated convolution.
+        d_context: The dimension of content encoder output, None if don't use context.
+    """
+    def __init__(
+        self,
+        channels: int = 256,
+        kernel_size: int = 3,
+        dilation: int = 1,
+        d_context: int = None,
+    ):
+        super().__init__()
+        self.context = d_context
+        self.gau = GaU(
+            channels,
+            kernel_size,
+            dilation,
+            d_context,
+        )
+        self.out_proj = Conv1d(
+            channels,
+            channels * 2,
+            1,
+        )
+    def forward(
+        self,
+        x: torch.Tensor,
+        y_emb: torch.Tensor,
+        context: torch.Tensor = None,
+    ):
+        """
+        Args:
+            x: Latent representation inherited from previous residual block
+                with the shape of [B x C x T].
+            y_emb: Embeddings with the shape of [B x C], which will be FILM on the x.
+            context: Context with the shape of [B x ``d_context`` x T], default to None.
+        """
+        h = x + y_emb[..., None]
+        if self.context:
+            h = self.gau(h, context)
+        else:
+            h = self.gau(h)
+        h = self.out_proj(h)
+        res, skip = h.chunk(2, 1)
+        return (res + x) / math.sqrt(2.0), skip

modules/diffusion/karras/karras_diffusion.py ADDED Viewed

	@@ -0,0 +1,977 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Based on: https://github.com/crowsonkb/k-diffusion
+"""
+import random
+import numpy as np
+import torch as th
+import torch.nn as nn
+import torch.nn.functional as F
+# from piq import LPIPS
+from utils.ssim import SSIM
+from modules.diffusion.karras.random_utils import get_generator
+def mean_flat(tensor):
+    """
+    Take the mean over all non-batch dimensions.
+    """
+    return tensor.mean(dim=list(range(1, len(tensor.shape))))
+def append_dims(x, target_dims):
+    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
+    dims_to_append = target_dims - x.ndim
+    if dims_to_append < 0:
+        raise ValueError(
+            f"input has {x.ndim} dims but target_dims is {target_dims}, which is less"
+        )
+    return x[(...,) + (None,) * dims_to_append]
+def append_zero(x):
+    return th.cat([x, x.new_zeros([1])])
+def get_weightings(weight_schedule, snrs, sigma_data):
+    if weight_schedule == "snr":
+        weightings = snrs
+    elif weight_schedule == "snr+1":
+        weightings = snrs + 1
+    elif weight_schedule == "karras":
+        weightings = snrs + 1.0 / sigma_data**2
+    elif weight_schedule == "truncated-snr":
+        weightings = th.clamp(snrs, min=1.0)
+    elif weight_schedule == "uniform":
+        weightings = th.ones_like(snrs)
+    else:
+        raise NotImplementedError()
+    return weightings
+class KarrasDenoiser:
+    """Preconditioning, training loss and consistency loss for a sigma-conditioned model.
+    The wrapped model is always called as ``model(c_in * x_t, rescaled_t, condition)``
+    (see ``denoise``).
+    Args:
+        sigma_data: Data standard deviation used in the scaling coefficients
+            and in the "karras" loss weighting.
+        sigma_max: Largest noise level of the schedule.
+        sigma_min: Smallest noise level of the schedule.
+        rho: Exponent of the noise schedule used in ``consistency_losses``.
+        weight_schedule: Name handed to ``get_weightings``.
+        distillation: If True, ``denoise`` uses the boundary-condition scalings.
+        loss_norm: Loss used by ``consistency_losses``: "l1", "l2" or "ssim".
+    """
+    def __init__(
+        self,
+        sigma_data: float = 0.5,
+        sigma_max=80.0,
+        sigma_min=0.002,
+        rho=7.0,
+        weight_schedule="karras",
+        distillation=False,
+        loss_norm="l2",
+    ):
+        self.sigma_data = sigma_data
+        self.sigma_max = sigma_max
+        self.sigma_min = sigma_min
+        self.weight_schedule = weight_schedule
+        self.distillation = distillation
+        self.loss_norm = loss_norm
+        # if loss_norm == "lpips":
+        #     self.lpips_loss = LPIPS(replace_pooling=True, reduction="none")
+        if loss_norm == "ssim":
+            self.ssim_loss = SSIM()
+        self.rho = rho
+        # Fixed value; nothing else in this class reads it.
+        self.num_timesteps = 40
+    def get_snr(self, sigmas):
+        """Return the signal-to-noise ratio ``sigmas ** -2``."""
+        return sigmas**-2
+    def get_sigmas(self, sigmas):
+        """Return ``sigmas`` unchanged."""
+        return sigmas
+    def get_scalings(self, sigma):
+        """Return the ``(c_skip, c_out, c_in)`` scaling coefficients for ``sigma``."""
+        c_skip = self.sigma_data**2 / (sigma**2 + self.sigma_data**2)
+        c_out = sigma * self.sigma_data / (sigma**2 + self.sigma_data**2) ** 0.5
+        c_in = 1 / (sigma**2 + self.sigma_data**2) ** 0.5
+        return c_skip, c_out, c_in
+    def get_scalings_for_boundary_condition(self, sigma):
+        """Return ``(c_skip, c_out, c_in)`` shifted by ``sigma_min``.
+        At ``sigma == sigma_min`` this gives ``c_skip == 1`` and ``c_out == 0``,
+        so ``denoise`` returns its input unchanged at the smallest noise level.
+        """
+        c_skip = self.sigma_data**2 / (
+            (sigma - self.sigma_min) ** 2 + self.sigma_data**2
+        )
+        c_out = (
+            (sigma - self.sigma_min)
+            * self.sigma_data
+            / (sigma**2 + self.sigma_data**2) ** 0.5
+        )
+        c_in = 1 / (sigma**2 + self.sigma_data**2) ** 0.5
+        return c_skip, c_out, c_in
+    def training_losses(self, model, x_start, sigmas, condition=None, noise=None):
+        """Compute the weighted denoising loss for one batch.
+        Args:
+            model: Network called through ``denoise``.
+            x_start: Clean samples.
+            sigmas: Noise levels; expanded with trailing axes to match
+                ``x_start`` (presumably one per batch element, confirm with caller).
+            condition: Extra conditioning forwarded to the model.
+            noise: Optional noise to use instead of fresh Gaussian noise.
+        Returns:
+            Dict with "mse" and "loss" (the same tensor), each the weighted
+            squared error averaged over non-batch dimensions.
+        """
+        if noise is None:
+            noise = th.randn_like(x_start)
+        terms = {}
+        dims = x_start.ndim
+        x_t = x_start + noise * append_dims(sigmas, dims)
+        model_output, denoised = self.denoise(model, x_t, sigmas, condition)
+        snrs = self.get_snr(sigmas)
+        weights = append_dims(
+            get_weightings(self.weight_schedule, snrs, self.sigma_data), dims
+        )
+        # terms["xs_mse"] = mean_flat((denoised - x_start) ** 2)
+        terms["mse"] = mean_flat(weights * (denoised - x_start) ** 2)
+        # terms["mae"] = mean_flat(weights * th.abs(denoised - x_start))
+        # terms["mse"] = nn.MSELoss(reduction="none")(denoised, x_start)
+        # if "vb" in terms:
+        #     terms["loss"] = terms["mse"] + terms["vb"]
+        # else:
+        terms["loss"] = terms["mse"]
+        return terms
+    def consistency_losses(
+        self,
+        model,
+        x_start,
+        num_scales,
+        # model_kwargs=None,
+        condition=None,
+        target_model=None,
+        teacher_model=None,
+        teacher_diffusion=None,
+        noise=None,
+    ):
+        """Compute the consistency loss between two adjacent noise levels.
+        The online ``model`` denoises ``x_t`` at level ``t``; ``target_model``
+        denoises the ODE-stepped sample ``x_t2`` at the next lower level ``t2``.
+        Args:
+            model: Online network being trained.
+            x_start: Clean samples.
+            num_scales: Number of discretisation points of the noise schedule.
+            condition: Extra conditioning forwarded to every model call.
+            target_model: Network producing the (detached) regression target.
+            teacher_model: Optional teacher used by the ODE solver; when None
+                the clean sample ``x_start`` stands in for the teacher output.
+            teacher_diffusion: Denoiser object used to call ``teacher_model``.
+            noise: Optional noise to use instead of fresh Gaussian noise.
+        Returns:
+            Dict with a single "loss" entry.
+        Raises:
+            NotImplementedError: If ``target_model`` is falsy.
+            ValueError: If ``self.loss_norm`` is not "l1", "l2" or "ssim".
+        """
+        if noise is None:
+            noise = th.randn_like(x_start)
+        dims = x_start.ndim
+        def denoise_fn(x, t):
+            return self.denoise(model, x, t, condition)[1]
+        if target_model:
+            @th.no_grad()
+            def target_denoise_fn(x, t):
+                return self.denoise(target_model, x, t, condition)[1]
+        else:
+            raise NotImplementedError("Must have a target model")
+        if teacher_model:
+            @th.no_grad()
+            def teacher_denoise_fn(x, t):
+                return teacher_diffusion.denoise(teacher_model, x, t, condition)[1]
+        @th.no_grad()
+        def heun_solver(samples, t, next_t, x0):
+            # Second-order step: Euler predictor followed by an averaged-slope corrector.
+            x = samples
+            if teacher_model is None:
+                denoiser = x0
+            else:
+                denoiser = teacher_denoise_fn(x, t)
+            d = (x - denoiser) / append_dims(t, dims)
+            samples = x + d * append_dims(next_t - t, dims)
+            if teacher_model is None:
+                denoiser = x0
+            else:
+                denoiser = teacher_denoise_fn(samples, next_t)
+            next_d = (samples - denoiser) / append_dims(next_t, dims)
+            samples = x + (d + next_d) * append_dims((next_t - t) / 2, dims)
+            return samples
+        @th.no_grad()
+        def euler_solver(samples, t, next_t, x0):
+            # Single first-order step from t to next_t.
+            x = samples
+            if teacher_model is None:
+                denoiser = x0
+            else:
+                denoiser = teacher_denoise_fn(x, t)
+            d = (x - denoiser) / append_dims(t, dims)
+            samples = x + d * append_dims(next_t - t, dims)
+            return samples
+        # One schedule index per sample, drawn from [0, num_scales - 2] so that
+        # index + 1 is still a valid schedule point.
+        indices = th.randint(
+            0, num_scales - 1, (x_start.shape[0],), device=x_start.device
+        )
+        # Interpolate in sigma**(1/rho) space: index 0 maps to sigma_max and
+        # index num_scales - 1 to sigma_min.
+        t = self.sigma_max ** (1 / self.rho) + indices / (num_scales - 1) * (
+            self.sigma_min ** (1 / self.rho) - self.sigma_max ** (1 / self.rho)
+        )
+        t = t**self.rho
+        # t2 is the neighbouring (lower) noise level.
+        t2 = self.sigma_max ** (1 / self.rho) + (indices + 1) / (num_scales - 1) * (
+            self.sigma_min ** (1 / self.rho) - self.sigma_max ** (1 / self.rho)
+        )
+        t2 = t2**self.rho
+        x_t = x_start + noise * append_dims(t, dims)
+        # Save the RNG state before the online forward pass and restore it
+        # before the target forward pass so both start from the same state.
+        # NOTE(review): th.get_rng_state covers the CPU generator only;
+        # confirm whether CUDA randomness is expected to be replayed as well.
+        dropout_state = th.get_rng_state()
+        distiller = denoise_fn(x_t, t)
+        # Without a teacher the Euler step is used; with a teacher, Heun.
+        if teacher_model is None:
+            x_t2 = euler_solver(x_t, t, t2, x_start).detach()
+        else:
+            x_t2 = heun_solver(x_t, t, t2, x_start).detach()
+        th.set_rng_state(dropout_state)
+        distiller_target = target_denoise_fn(x_t2, t2)
+        distiller_target = distiller_target.detach()
+        snrs = self.get_snr(t)
+        weights = get_weightings(self.weight_schedule, snrs, self.sigma_data)
+        if self.loss_norm == "l1":
+            diffs = th.abs(distiller - distiller_target)
+            loss = mean_flat(diffs) * weights
+        elif self.loss_norm == "l2":
+            # This branch does not apply ``weights``; the weighted per-sample
+            # form is kept below as a comment.
+            # diffs = (distiller - distiller_target) ** 2
+            loss = F.mse_loss(distiller, distiller_target)
+            # loss = mean_flat(diffs) * weights
+        elif self.loss_norm == "ssim":
+            loss = self.ssim_loss(distiller, distiller_target) * weights
+        # elif self.loss_norm == "l2-32":
+        #     distiller = F.interpolate(distiller, size=32, mode="bilinear")
+        #     distiller_target = F.interpolate(
+        #         distiller_target,
+        #         size=32,
+        #         mode="bilinear",
+        #     )
+        #     diffs = (distiller - distiller_target) ** 2
+        #     loss = mean_flat(diffs) * weights
+        # elif self.loss_norm == "lpips":
+        #     if x_start.shape[-1] < 256:
+        #         distiller = F.interpolate(distiller, size=224, mode="bilinear")
+        #         distiller_target = F.interpolate(
+        #             distiller_target, size=224, mode="bilinear"
+        #         )
+        #     loss = (
+        #         self.lpips_loss(
+        #             (distiller + 1) / 2.0,
+        #             (distiller_target + 1) / 2.0,
+        #         )
+        #         * weights
+        #     )
+        else:
+            raise ValueError(f"Unknown loss norm {self.loss_norm}")
+        terms = {}
+        terms["loss"] = loss
+        return terms
+    # def progdist_losses(
+    #     self,
+    #     model,
+    #     x_start,
+    #     num_scales,
+    #     model_kwargs=None,
+    #     teacher_model=None,
+    #     teacher_diffusion=None,
+    #     noise=None,
+    # ):
+    #     if model_kwargs is None:
+    #         model_kwargs = {}
+    #     if noise is None:
+    #         noise = th.randn_like(x_start)
+    #     dims = x_start.ndim
+    #     def denoise_fn(x, t):
+    #         return self.denoise(model, x, t, **model_kwargs)[1]
+    #     @th.no_grad()
+    #     def teacher_denoise_fn(x, t):
+    #         return teacher_diffusion.denoise(teacher_model, x, t, **model_kwargs)[1]
+    #     @th.no_grad()
+    #     def euler_solver(samples, t, next_t):
+    #         x = samples
+    #         denoiser = teacher_denoise_fn(x, t)
+    #         d = (x - denoiser) / append_dims(t, dims)
+    #         samples = x + d * append_dims(next_t - t, dims)
+    #         return samples
+    #     @th.no_grad()
+    #     def euler_to_denoiser(x_t, t, x_next_t, next_t):
+    #         denoiser = x_t - append_dims(t, dims) * (x_next_t - x_t) / append_dims(
+    #             next_t - t, dims
+    #         )
+    #         return denoiser
+    #     indices = th.randint(0, num_scales, (x_start.shape[0],), device=x_start.device)
+    #     t = self.sigma_max ** (1 / self.rho) + indices / num_scales * (
+    #         self.sigma_min ** (1 / self.rho) - self.sigma_max ** (1 / self.rho)
+    #     )
+    #     t = t**self.rho
+    #     t2 = self.sigma_max ** (1 / self.rho) + (indices + 0.5) / num_scales * (
+    #         self.sigma_min ** (1 / self.rho) - self.sigma_max ** (1 / self.rho)
+    #     )
+    #     t2 = t2**self.rho
+    #     t3 = self.sigma_max ** (1 / self.rho) + (indices + 1) / num_scales * (
+    #         self.sigma_min ** (1 / self.rho) - self.sigma_max ** (1 / self.rho)
+    #     )
+    #     t3 = t3**self.rho
+    #     x_t = x_start + noise * append_dims(t, dims)
+    #     denoised_x = denoise_fn(x_t, t)
+    #     x_t2 = euler_solver(x_t, t, t2).detach()
+    #     x_t3 = euler_solver(x_t2, t2, t3).detach()
+    #     target_x = euler_to_denoiser(x_t, t, x_t3, t3).detach()
+    #     snrs = self.get_snr(t)
+    #     weights = get_weightings(self.weight_schedule, snrs, self.sigma_data)
+    #     if self.loss_norm == "l1":
+    #         diffs = th.abs(denoised_x - target_x)
+    #         loss = mean_flat(diffs) * weights
+    #     elif self.loss_norm == "l2":
+    #         diffs = (denoised_x - target_x) ** 2
+    #         loss = mean_flat(diffs) * weights
+    #     elif self.loss_norm == "lpips":
+    #         if x_start.shape[-1] < 256:
+    #             denoised_x = F.interpolate(denoised_x, size=224, mode="bilinear")
+    #             target_x = F.interpolate(target_x, size=224, mode="bilinear")
+    #         loss = (
+    #             self.lpips_loss(
+    #                 (denoised_x + 1) / 2.0,
+    #                 (target_x + 1) / 2.0,
+    #             )
+    #             * weights
+    #         )
+    #     else:
+    #         raise ValueError(f"Unknown loss norm {self.loss_norm}")
+    #     terms = {}
+    #     terms["loss"] = loss
+    #     return terms
+    def denoise(self, model, x_t, sigmas, condition):
+        """Run the preconditioned model on a noisy sample.
+        Args:
+            model: Network called as ``model(c_in * x_t, rescaled_t, condition)``.
+            x_t: Noisy sample.
+            sigmas: Noise levels matching ``x_t``.
+            condition: Extra conditioning forwarded to the model.
+        Returns:
+            Tuple ``(model_output, denoised)`` where
+            ``denoised = c_out * model_output + c_skip * x_t``.
+        """
+        if not self.distillation:
+            c_skip, c_out, c_in = [
+                append_dims(x, x_t.ndim) for x in self.get_scalings(sigmas)
+            ]
+        else:
+            c_skip, c_out, c_in = [
+                append_dims(x, x_t.ndim)
+                for x in self.get_scalings_for_boundary_condition(sigmas)
+            ]
+        # Log-scaled noise level used as the model's time input; the tiny
+        # offset keeps the log finite for sigma == 0.
+        rescaled_t = 1000 * 0.25 * th.log(sigmas + 1e-44)
+        # rescaled_t = rescaled_t[:, None]
+        model_output = model(c_in * x_t, rescaled_t, condition)
+        denoised = c_out * model_output + c_skip * x_t
+        return model_output, denoised
+def karras_sample(
+    diffusion,
+    model,
+    shape,
+    steps,
+    clip_denoised=True,
+    progress=True,
+    callback=None,
+    # model_kwargs=None,
+    condition=None,
+    device=None,
+    sigma_min=0.002,
+    sigma_max=80,  # higher for highres?
+    rho=7.0,
+    sampler="heun",
+    s_churn=0.0,
+    s_tmin=0.0,
+    s_tmax=float("inf"),
+    s_noise=1.0,
+    generator=None,
+    ts=None,
+):
+    if generator is None:
+        generator = get_generator("dummy")
+    if sampler == "progdist":
+        sigmas = get_sigmas_karras(steps + 1, sigma_min, sigma_max, rho, device=device)
+    else:
+        sigmas = get_sigmas_karras(steps, sigma_min, sigma_max, rho, device=device)
+    th.manual_seed(42)
+    x_T = generator.randn(*shape, device=device) * sigma_max
+    sigmas = sigmas.unsqueeze(-1)
+    sample_fn = {
+        "heun": sample_heun,
+        "dpm": sample_dpm,
+        "ancestral": sample_euler_ancestral,
+        "onestep": sample_onestep,
+        "progdist": sample_progdist,
+        "euler": sample_euler,
+        "multistep": stochastic_iterative_sampler,
+    }[sampler]
+    if sampler in ["heun", "dpm"]:
+        sampler_args = dict(
+            s_churn=s_churn, s_tmin=s_tmin, s_tmax=s_tmax, s_noise=s_noise
+        )
+    elif sampler == "multistep":
+        sampler_args = dict(
+            ts=ts, t_min=sigma_min, t_max=sigma_max, rho=diffusion.rho, steps=steps
+        )
+    else:
+        sampler_args = {}
+    def denoiser(x_t, sigma):
+        _, denoised = diffusion.denoise(model, x_t, sigma, condition)
+        if clip_denoised:
+            denoised = denoised.clamp(-1, 1)
+        return denoised
+    x_0 = sample_fn(
+        denoiser,
+        x_T,
+        sigmas,
+        generator,
+        progress=progress,
+        callback=callback,
+        **sampler_args,
+    )
+    return x_0.clamp(-1, 1)
+def get_sigmas_karras(n, sigma_min, sigma_max, rho=7.0, device="cpu"):
+    """Constructs the noise schedule of Karras et al. (2022)."""
+    ramp = th.linspace(0, 1, n)
+    min_inv_rho = sigma_min ** (1 / rho)
+    max_inv_rho = sigma_max ** (1 / rho)
+    sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+    return append_zero(sigmas).to(device)
+def to_d(x, sigma, denoised):
+    """Converts a denoiser output to a Karras ODE derivative."""
+    return (x - denoised) / append_dims(sigma, x.ndim)
+def get_ancestral_step(sigma_from, sigma_to):
+    """Calculates the noise level (sigma_down) to step down to and the amount
+    of noise to add (sigma_up) when doing an ancestral sampling step."""
+    sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
+    sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
+    return sigma_down, sigma_up
+@th.no_grad()
+def sample_euler_ancestral(model, x, sigmas, generator, progress=False, callback=None):
+    """Ancestral sampling with Euler method steps."""
+    s_in = x.new_ones([x.shape[0]])
+    indices = range(len(sigmas) - 1)
+    if progress:
+        from tqdm.auto import tqdm
+        indices = tqdm(indices)
+    for i in indices:
+        denoised = model(x, sigmas[i] * s_in)
+        sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1])
+        if callback is not None:
+            callback(
+                {
+                    "x": x,
+                    "i": i,
+                    "sigma": sigmas[i],
+                    "sigma_hat": sigmas[i],
+                    "denoised": denoised,
+                }
+            )
+        d = to_d(x, sigmas[i], denoised)
+        # Euler method
+        dt = sigma_down - sigmas[i]
+        x = x + d * dt
+        x = x + generator.randn_like(x) * sigma_up
+    return x
+@th.no_grad()
+def sample_midpoint_ancestral(model, x, ts, generator, progress=False, callback=None):
+    """Ancestral sampling with midpoint method steps."""
+    s_in = x.new_ones([x.shape[0]])
+    step_size = 1 / len(ts)
+    if progress:
+        from tqdm.auto import tqdm
+        ts = tqdm(ts)
+    for tn in ts:
+        dn = model(x, tn * s_in)
+        dn_2 = model(x + (step_size / 2) * dn, (tn + step_size / 2) * s_in)
+        x = x + step_size * dn_2
+        if callback is not None:
+            callback({"x": x, "tn": tn, "dn": dn, "dn_2": dn_2})
+    return x
+@th.no_grad()
+def sample_heun(
+    denoiser,
+    x,
+    sigmas,
+    generator,
+    progress=False,
+    callback=None,
+    s_churn=0.0,
+    s_tmin=0.0,
+    s_tmax=float("inf"),
+    s_noise=1.0,
+):
+    """Implements Algorithm 2 (Heun steps) from Karras et al. (2022)."""
+    s_in = x.new_ones([x.shape[0]])
+    indices = range(len(sigmas) - 1)
+    if progress:
+        from tqdm.auto import tqdm
+        indices = tqdm(indices)
+    for i in indices:
+        gamma = (
+            min(s_churn / (len(sigmas) - 1), 2**0.5 - 1)
+            if s_tmin <= sigmas[i] <= s_tmax
+            else 0.0
+        )
+        eps = generator.randn_like(x) * s_noise
+        sigma_hat = sigmas[i] * (gamma + 1)
+        if gamma > 0:
+            x = x + eps * (sigma_hat**2 - sigmas[i] ** 2) ** 0.5
+        denoised = denoiser(x, sigma_hat * s_in)
+        d = to_d(x, sigma_hat, denoised)
+        if callback is not None:
+            callback(
+                {
+                    "x": x,
+                    "i": i,
+                    "sigma": sigmas[i],
+                    "sigma_hat": sigma_hat,
+                    "denoised": denoised,
+                }
+            )
+        dt = sigmas[i + 1] - sigma_hat
+        if sigmas[i + 1] == 0:
+            # Euler method
+            x = x + d * dt
+        else:
+            # Heun's method
+            x_2 = x + d * dt
+            denoised_2 = denoiser(x_2, sigmas[i + 1] * s_in)
+            d_2 = to_d(x_2, sigmas[i + 1], denoised_2)
+            d_prime = (d + d_2) / 2
+            x = x + d_prime * dt
+    return x
+@th.no_grad()
+def sample_euler(
+    denoiser,
+    x,
+    sigmas,
+    generator,
+    progress=False,
+    callback=None,
+):
+    """Implements Algorithm 2 (Heun steps) from Karras et al. (2022)."""
+    s_in = x.new_ones([x.shape[0]])
+    indices = range(len(sigmas) - 1)
+    if progress:
+        from tqdm.auto import tqdm
+        indices = tqdm(indices)
+    for i in indices:
+        sigma = sigmas[i]
+        denoised = denoiser(x, sigma * s_in)
+        d = to_d(x, sigma, denoised)
+        if callback is not None:
+            callback(
+                {
+                    "x": x,
+                    "i": i,
+                    "sigma": sigmas[i],
+                    "denoised": denoised,
+                }
+            )
+        dt = sigmas[i + 1] - sigma
+        x = x + d * dt
+    return x
+@th.no_grad()
+def sample_dpm(
+    denoiser,
+    x,
+    sigmas,
+    generator,
+    progress=False,
+    callback=None,
+    s_churn=0.0,
+    s_tmin=0.0,
+    s_tmax=float("inf"),
+    s_noise=1.0,
+):
+    """A sampler inspired by DPM-Solver-2 and Algorithm 2 from Karras et al. (2022)."""
+    s_in = x.new_ones([x.shape[0]])
+    indices = range(len(sigmas) - 1)
+    if progress:
+        from tqdm.auto import tqdm
+        indices = tqdm(indices)
+    for i in indices:
+        gamma = (
+            min(s_churn / (len(sigmas) - 1), 2**0.5 - 1)
+            if s_tmin <= sigmas[i] <= s_tmax
+            else 0.0
+        )
+        eps = generator.randn_like(x) * s_noise
+        sigma_hat = sigmas[i] * (gamma + 1)
+        if gamma > 0:
+            x = x + eps * (sigma_hat**2 - sigmas[i] ** 2) ** 0.5
+        denoised = denoiser(x, sigma_hat * s_in)
+        d = to_d(x, sigma_hat, denoised)
+        if callback is not None:
+            callback(
+                {
+                    "x": x,
+                    "i": i,
+                    "sigma": sigmas[i],
+                    "sigma_hat": sigma_hat,
+                    "denoised": denoised,
+                }
+            )
+        # Midpoint method, where the midpoint is chosen according to a rho=3 Karras schedule
+        sigma_mid = ((sigma_hat ** (1 / 3) + sigmas[i + 1] ** (1 / 3)) / 2) ** 3
+        dt_1 = sigma_mid - sigma_hat
+        dt_2 = sigmas[i + 1] - sigma_hat
+        x_2 = x + d * dt_1
+        denoised_2 = denoiser(x_2, sigma_mid * s_in)
+        d_2 = to_d(x_2, sigma_mid, denoised_2)
+        x = x + d_2 * dt_2
+    return x
+@th.no_grad()
+def sample_onestep(
+    distiller,
+    x,
+    sigmas,
+    generator=None,
+    progress=False,
+    callback=None,
+):
+    """Single-step generation from a distilled model."""
+    s_in = x.new_ones([x.shape[0]])
+    return distiller(x, sigmas[0] * s_in)
+@th.no_grad()
+def stochastic_iterative_sampler(
+    distiller,
+    x,
+    sigmas,
+    generator,
+    ts,
+    progress=False,
+    callback=None,
+    t_min=0.002,
+    t_max=80.0,
+    rho=7.0,
+    steps=40,
+):
+    t_max_rho = t_max ** (1 / rho)
+    t_min_rho = t_min ** (1 / rho)
+    s_in = x.new_ones([x.shape[0]])
+    for i in range(len(ts) - 1):
+        t = (t_max_rho + ts[i] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
+        x0 = distiller(x, t * s_in)
+        next_t = (t_max_rho + ts[i + 1] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
+        next_t = np.clip(next_t, t_min, t_max)
+        x = x0 + generator.randn_like(x) * np.sqrt(next_t**2 - t_min**2)
+    return x
+@th.no_grad()
+def sample_progdist(
+    denoiser,
+    x,
+    sigmas,
+    generator=None,
+    progress=False,
+    callback=None,
+):
+    s_in = x.new_ones([x.shape[0]])
+    sigmas = sigmas[:-1]  # skip the zero sigma
+    indices = range(len(sigmas) - 1)
+    if progress:
+        from tqdm.auto import tqdm
+        indices = tqdm(indices)
+    for i in indices:
+        sigma = sigmas[i]
+        denoised = denoiser(x, sigma * s_in)
+        d = to_d(x, sigma, denoised)
+        if callback is not None:
+            callback(
+                {
+                    "x": x,
+                    "i": i,
+                    "sigma": sigma,
+                    "denoised": denoised,
+                }
+            )
+        dt = sigmas[i + 1] - sigma
+        x = x + d * dt
+    return x
+# @th.no_grad()
+# def iterative_colorization(
+#     distiller,
+#     images,
+#     x,
+#     ts,
+#     t_min=0.002,
+#     t_max=80.0,
+#     rho=7.0,
+#     steps=40,
+#     generator=None,
+# ):
+#     def obtain_orthogonal_matrix():
+#         vector = np.asarray([0.2989, 0.5870, 0.1140])
+#         vector = vector / np.linalg.norm(vector)
+#         matrix = np.eye(3)
+#         matrix[:, 0] = vector
+#         matrix = np.linalg.qr(matrix)[0]
+#         if np.sum(matrix[:, 0]) < 0:
+#             matrix = -matrix
+#         return matrix
+#     Q = th.from_numpy(obtain_orthogonal_matrix()).to(dist_util.dev()).to(th.float32)
+#     mask = th.zeros(*x.shape[1:], device=dist_util.dev())
+#     mask[0, ...] = 1.0
+#     def replacement(x0, x1):
+#         x0 = th.einsum("bchw,cd->bdhw", x0, Q)
+#         x1 = th.einsum("bchw,cd->bdhw", x1, Q)
+#         x_mix = x0 * mask + x1 * (1.0 - mask)
+#         x_mix = th.einsum("bdhw,cd->bchw", x_mix, Q)
+#         return x_mix
+#     t_max_rho = t_max ** (1 / rho)
+#     t_min_rho = t_min ** (1 / rho)
+#     s_in = x.new_ones([x.shape[0]])
+#     images = replacement(images, th.zeros_like(images))
+#     for i in range(len(ts) - 1):
+#         t = (t_max_rho + ts[i] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
+#         x0 = distiller(x, t * s_in)
+#         x0 = th.clamp(x0, -1.0, 1.0)
+#         x0 = replacement(images, x0)
+#         next_t = (t_max_rho + ts[i + 1] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
+#         next_t = np.clip(next_t, t_min, t_max)
+#         x = x0 + generator.randn_like(x) * np.sqrt(next_t**2 - t_min**2)
+#     return x, images
+# @th.no_grad()
+# def iterative_inpainting(
+#     distiller,
+#     images,
+#     x,
+#     ts,
+#     t_min=0.002,
+#     t_max=80.0,
+#     rho=7.0,
+#     steps=40,
+#     generator=None,
+# ):
+#     from PIL import Image, ImageDraw, ImageFont
+#     image_size = x.shape[-1]
+#     # create a blank image with a white background
+#     img = Image.new("RGB", (image_size, image_size), color="white")
+#     # get a drawing context for the image
+#     draw = ImageDraw.Draw(img)
+#     # load a font
+#     font = ImageFont.truetype("arial.ttf", 250)
+#     # draw the letter "S" in black
+#     draw.text((50, 0), "S", font=font, fill=(0, 0, 0))
+#     # convert the image to a numpy array
+#     img_np = np.array(img)
+#     img_np = img_np.transpose(2, 0, 1)
+#     img_th = th.from_numpy(img_np).to(dist_util.dev())
+#     mask = th.zeros(*x.shape, device=dist_util.dev())
+#     mask = mask.reshape(-1, 7, 3, image_size, image_size)
+#     mask[::2, :, img_th > 0.5] = 1.0
+#     mask[1::2, :, img_th < 0.5] = 1.0
+#     mask = mask.reshape(-1, 3, image_size, image_size)
+#     def replacement(x0, x1):
+#         x_mix = x0 * mask + x1 * (1 - mask)
+#         return x_mix
+#     t_max_rho = t_max ** (1 / rho)
+#     t_min_rho = t_min ** (1 / rho)
+#     s_in = x.new_ones([x.shape[0]])
+#     images = replacement(images, -th.ones_like(images))
+#     for i in range(len(ts) - 1):
+#         t = (t_max_rho + ts[i] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
+#         x0 = distiller(x, t * s_in)
+#         x0 = th.clamp(x0, -1.0, 1.0)
+#         x0 = replacement(images, x0)
+#         next_t = (t_max_rho + ts[i + 1] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
+#         next_t = np.clip(next_t, t_min, t_max)
+#         x = x0 + generator.randn_like(x) * np.sqrt(next_t**2 - t_min**2)
+#     return x, images
+# @th.no_grad()
+# def iterative_superres(
+#     distiller,
+#     images,
+#     x,
+#     ts,
+#     t_min=0.002,
+#     t_max=80.0,
+#     rho=7.0,
+#     steps=40,
+#     generator=None,
+# ):
+#     patch_size = 8
+#     def obtain_orthogonal_matrix():
+#         vector = np.asarray([1] * patch_size**2)
+#         vector = vector / np.linalg.norm(vector)
+#         matrix = np.eye(patch_size**2)
+#         matrix[:, 0] = vector
+#         matrix = np.linalg.qr(matrix)[0]
+#         if np.sum(matrix[:, 0]) < 0:
+#             matrix = -matrix
+#         return matrix
+#     Q = th.from_numpy(obtain_orthogonal_matrix()).to(dist_util.dev()).to(th.float32)
+#     image_size = x.shape[-1]
+#     def replacement(x0, x1):
+#         x0_flatten = (
+#             x0.reshape(-1, 3, image_size, image_size)
+#             .reshape(
+#                 -1,
+#                 3,
+#                 image_size // patch_size,
+#                 patch_size,
+#                 image_size // patch_size,
+#                 patch_size,
+#             )
+#             .permute(0, 1, 2, 4, 3, 5)
+#             .reshape(-1, 3, image_size**2 // patch_size**2, patch_size**2)
+#         )
+#         x1_flatten = (
+#             x1.reshape(-1, 3, image_size, image_size)
+#             .reshape(
+#                 -1,
+#                 3,
+#                 image_size // patch_size,
+#                 patch_size,
+#                 image_size // patch_size,
+#                 patch_size,
+#             )
+#             .permute(0, 1, 2, 4, 3, 5)
+#             .reshape(-1, 3, image_size**2 // patch_size**2, patch_size**2)
+#         )
+#         x0 = th.einsum("bcnd,de->bcne", x0_flatten, Q)
+#         x1 = th.einsum("bcnd,de->bcne", x1_flatten, Q)
+#         x_mix = x0.new_zeros(x0.shape)
+#         x_mix[..., 0] = x0[..., 0]
+#         x_mix[..., 1:] = x1[..., 1:]
+#         x_mix = th.einsum("bcne,de->bcnd", x_mix, Q)
+#         x_mix = (
+#             x_mix.reshape(
+#                 -1,
+#                 3,
+#                 image_size // patch_size,
+#                 image_size // patch_size,
+#                 patch_size,
+#                 patch_size,
+#             )
+#             .permute(0, 1, 2, 4, 3, 5)
+#             .reshape(-1, 3, image_size, image_size)
+#         )
+#         return x_mix
+#     def average_image_patches(x):
+#         x_flatten = (
+#             x.reshape(-1, 3, image_size, image_size)
+#             .reshape(
+#                 -1,
+#                 3,
+#                 image_size // patch_size,
+#                 patch_size,
+#                 image_size // patch_size,
+#                 patch_size,
+#             )
+#             .permute(0, 1, 2, 4, 3, 5)
+#             .reshape(-1, 3, image_size**2 // patch_size**2, patch_size**2)
+#         )
+#         x_flatten[..., :] = x_flatten.mean(dim=-1, keepdim=True)
+#         return (
+#             x_flatten.reshape(
+#                 -1,
+#                 3,
+#                 image_size // patch_size,
+#                 image_size // patch_size,
+#                 patch_size,
+#                 patch_size,
+#             )
+#             .permute(0, 1, 2, 4, 3, 5)
+#             .reshape(-1, 3, image_size, image_size)
+#         )
+#     t_max_rho = t_max ** (1 / rho)
+#     t_min_rho = t_min ** (1 / rho)
+#     s_in = x.new_ones([x.shape[0]])
+#     images = average_image_patches(images)
+#     for i in range(len(ts) - 1):
+#         t = (t_max_rho + ts[i] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
+#         x0 = distiller(x, t * s_in)
+#         x0 = th.clamp(x0, -1.0, 1.0)
+#         x0 = replacement(images, x0)
+#         next_t = (t_max_rho + ts[i + 1] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
+#         next_t = np.clip(next_t, t_min, t_max)
+#         x = x0 + generator.randn_like(x) * np.sqrt(next_t**2 - t_min**2)
+#     return x, images

modules/diffusion/karras/random_utils.py ADDED Viewed

	@@ -0,0 +1,177 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import torch as th
+def get_generator(generator, num_samples=0, seed=0):
+    if generator == "dummy":
+        return DummyGenerator()
+    elif generator == "determ":
+        return DeterministicGenerator(num_samples, seed)
+    elif generator == "determ-indiv":
+        return DeterministicIndividualGenerator(num_samples, seed)
+    else:
+        raise NotImplementedError
+class DummyGenerator:
+    def randn(self, *args, **kwargs):
+        return th.randn(*args, **kwargs)
+    def randint(self, *args, **kwargs):
+        return th.randint(*args, **kwargs)
+    def randn_like(self, *args, **kwargs):
+        return th.randn_like(*args, **kwargs)
+class DeterministicGenerator:
+    """
+    RNG to deterministically sample num_samples samples that does not depend on batch_size or mpi_machines
+    Uses a single rng and samples num_samples sized randomness and subsamples the current indices
+    """
+    def __init__(self, num_samples, seed=0):
+        print("Warning: Distributed not initialised, using single rank")
+        self.rank = 0
+        self.world_size = 1
+        self.num_samples = num_samples
+        self.done_samples = 0
+        self.seed = seed
+        self.rng_cpu = th.Generator()
+        if th.cuda.is_available():
+            self.rng_cuda = th.Generator(dist_util.dev())
+        self.set_seed(seed)
+    def get_global_size_and_indices(self, size):
+        global_size = (self.num_samples, *size[1:])
+        indices = th.arange(
+            self.done_samples + self.rank,
+            self.done_samples + self.world_size * int(size[0]),
+            self.world_size,
+        )
+        indices = th.clamp(indices, 0, self.num_samples - 1)
+        assert (
+            len(indices) == size[0]
+        ), f"rank={self.rank}, ws={self.world_size}, l={len(indices)}, bs={size[0]}"
+        return global_size, indices
+    def get_generator(self, device):
+        return self.rng_cpu if th.device(device).type == "cpu" else self.rng_cuda
+    def randn(self, *size, dtype=th.float, device="cpu"):
+        global_size, indices = self.get_global_size_and_indices(size)
+        generator = self.get_generator(device)
+        return th.randn(*global_size, generator=generator, dtype=dtype, device=device)[
+            indices
+        ]
+    def randint(self, low, high, size, dtype=th.long, device="cpu"):
+        global_size, indices = self.get_global_size_and_indices(size)
+        generator = self.get_generator(device)
+        return th.randint(
+            low, high, generator=generator, size=global_size, dtype=dtype, device=device
+        )[indices]
+    def randn_like(self, tensor):
+        size, dtype, device = tensor.size(), tensor.dtype, tensor.device
+        return self.randn(*size, dtype=dtype, device=device)
+    def set_done_samples(self, done_samples):
+        self.done_samples = done_samples
+        self.set_seed(self.seed)
+    def get_seed(self):
+        return self.seed
+    def set_seed(self, seed):
+        self.rng_cpu.manual_seed(seed)
+        if th.cuda.is_available():
+            self.rng_cuda.manual_seed(seed)
+class DeterministicIndividualGenerator:
+    """
+    RNG to deterministically sample num_samples samples that does not depend on batch_size or mpi_machines
+    Uses a separate rng for each sample to reduce memoery usage
+    """
+    def __init__(self, num_samples, seed=0):
+        print("Warning: Distributed not initialised, using single rank")
+        self.rank = 0
+        self.world_size = 1
+        self.num_samples = num_samples
+        self.done_samples = 0
+        self.seed = seed
+        self.rng_cpu = [th.Generator() for _ in range(num_samples)]
+        if th.cuda.is_available():
+            self.rng_cuda = [th.Generator(dist_util.dev()) for _ in range(num_samples)]
+        self.set_seed(seed)
+    def get_size_and_indices(self, size):
+        indices = th.arange(
+            self.done_samples + self.rank,
+            self.done_samples + self.world_size * int(size[0]),
+            self.world_size,
+        )
+        indices = th.clamp(indices, 0, self.num_samples - 1)
+        assert (
+            len(indices) == size[0]
+        ), f"rank={self.rank}, ws={self.world_size}, l={len(indices)}, bs={size[0]}"
+        return (1, *size[1:]), indices
+    def get_generator(self, device):
+        return self.rng_cpu if th.device(device).type == "cpu" else self.rng_cuda
+    def randn(self, *size, dtype=th.float, device="cpu"):
+        size, indices = self.get_size_and_indices(size)
+        generator = self.get_generator(device)
+        return th.cat(
+            [
+                th.randn(*size, generator=generator[i], dtype=dtype, device=device)
+                for i in indices
+            ],
+            dim=0,
+        )
+    def randint(self, low, high, size, dtype=th.long, device="cpu"):
+        size, indices = self.get_size_and_indices(size)
+        generator = self.get_generator(device)
+        return th.cat(
+            [
+                th.randint(
+                    low,
+                    high,
+                    generator=generator[i],
+                    size=size,
+                    dtype=dtype,
+                    device=device,
+                )
+                for i in indices
+            ],
+            dim=0,
+        )
+    def randn_like(self, tensor):
+        size, dtype, device = tensor.size(), tensor.dtype, tensor.device
+        return self.randn(*size, dtype=dtype, device=device)
+    def set_done_samples(self, done_samples):
+        self.done_samples = done_samples
+    def get_seed(self):
+        return self.seed
+    def set_seed(self, seed):
+        [
+            rng_cpu.manual_seed(i + self.num_samples * seed)
+            for i, rng_cpu in enumerate(self.rng_cpu)
+        ]
+        if th.cuda.is_available():
+            [
+                rng_cuda.manual_seed(i + self.num_samples * seed)
+                for i, rng_cuda in enumerate(self.rng_cuda)
+            ]

modules/diffusion/karras/sample.py ADDED Viewed

	@@ -0,0 +1,185 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from abc import ABC, abstractmethod
+import numpy as np
+import torch as th
+from scipy.stats import norm
+import torch.distributed as dist
+def create_named_schedule_sampler(name, diffusion):
+    """
+    Create a ScheduleSampler from a library of pre-defined samplers.
+    :param name: the name of the sampler.
+    :param diffusion: the diffusion object to sample for.
+    """
+    if name == "uniform":
+        return UniformSampler(diffusion)
+    elif name == "loss-second-moment":
+        return LossSecondMomentResampler(diffusion)
+    elif name == "lognormal":
+        return LogNormalSampler()
+    else:
+        raise NotImplementedError(f"unknown schedule sampler: {name}")
+class ScheduleSampler(ABC):
+    """
+    A distribution over timesteps in the diffusion process, intended to reduce
+    variance of the objective.
+    By default, samplers perform unbiased importance sampling, in which the
+    objective's mean is unchanged.
+    However, subclasses may override sample() to change how the resampled
+    terms are reweighted, allowing for actual changes in the objective.
+    """
+    @abstractmethod
+    def weights(self):
+        """
+        Get a numpy array of weights, one per diffusion step.
+        The weights needn't be normalized, but must be positive.
+        """
+    def sample(self, batch_size, device):
+        """
+        Importance-sample timesteps for a batch.
+        :param batch_size: the number of timesteps.
+        :param device: the torch device to save to.
+        :return: a tuple (timesteps, weights):
+                 - timesteps: a tensor of timestep indices.
+                 - weights: a tensor of weights to scale the resulting losses.
+        """
+        w = self.weights()
+        p = w / np.sum(w)
+        indices_np = np.random.choice(len(p), size=(batch_size,), p=p)
+        indices = th.from_numpy(indices_np).long().to(device)
+        weights_np = 1 / (len(p) * p[indices_np])
+        weights = th.from_numpy(weights_np).float().to(device)
+        return indices, weights
+class UniformSampler(ScheduleSampler):
+    def __init__(self, diffusion):
+        self.diffusion = diffusion
+        self._weights = np.ones([diffusion.num_timesteps])
+    def weights(self):
+        return self._weights
+class LossAwareSampler(ScheduleSampler):
+    def update_with_local_losses(self, local_ts, local_losses):
+        """
+        Update the reweighting using losses from a model.
+        Call this method from each rank with a batch of timesteps and the
+        corresponding losses for each of those timesteps.
+        This method will perform synchronization to make sure all of the ranks
+        maintain the exact same reweighting.
+        :param local_ts: an integer Tensor of timesteps.
+        :param local_losses: a 1D Tensor of losses.
+        """
+        batch_sizes = [
+            th.tensor([0], dtype=th.int32, device=local_ts.device)
+            for _ in range(dist.get_world_size())
+        ]
+        dist.all_gather(
+            batch_sizes,
+            th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device),
+        )
+        # Pad all_gather batches to be the maximum batch size.
+        batch_sizes = [x.item() for x in batch_sizes]
+        max_bs = max(batch_sizes)
+        timestep_batches = [th.zeros(max_bs).to(local_ts) for bs in batch_sizes]
+        loss_batches = [th.zeros(max_bs).to(local_losses) for bs in batch_sizes]
+        dist.all_gather(timestep_batches, local_ts)
+        dist.all_gather(loss_batches, local_losses)
+        timesteps = [
+            x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs]
+        ]
+        losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]]
+        self.update_with_all_losses(timesteps, losses)
+    @abstractmethod
+    def update_with_all_losses(self, ts, losses):
+        """
+        Update the reweighting using losses from a model.
+        Sub-classes should override this method to update the reweighting
+        using losses from the model.
+        This method directly updates the reweighting without synchronizing
+        between workers. It is called by update_with_local_losses from all
+        ranks with identical arguments. Thus, it should have deterministic
+        behavior to maintain state across workers.
+        :param ts: a list of int timesteps.
+        :param losses: a list of float losses, one per timestep.
+        """
+class LossSecondMomentResampler(LossAwareSampler):
+    def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
+        self.diffusion = diffusion
+        self.history_per_term = history_per_term
+        self.uniform_prob = uniform_prob
+        self._loss_history = np.zeros(
+            [diffusion.num_timesteps, history_per_term], dtype=np.float64
+        )
+        self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int)
+    def weights(self):
+        if not self._warmed_up():
+            return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
+        weights = np.sqrt(np.mean(self._loss_history**2, axis=-1))
+        weights /= np.sum(weights)
+        weights *= 1 - self.uniform_prob
+        weights += self.uniform_prob / len(weights)
+        return weights
+    def update_with_all_losses(self, ts, losses):
+        for t, loss in zip(ts, losses):
+            if self._loss_counts[t] == self.history_per_term:
+                # Shift out the oldest loss term.
+                self._loss_history[t, :-1] = self._loss_history[t, 1:]
+                self._loss_history[t, -1] = loss
+            else:
+                self._loss_history[t, self._loss_counts[t]] = loss
+                self._loss_counts[t] += 1
+    def _warmed_up(self):
+        return (self._loss_counts == self.history_per_term).all()
+class LogNormalSampler:
+    def __init__(self, p_mean=-1.2, p_std=1.2, even=False):
+        self.p_mean = p_mean
+        self.p_std = p_std
+        self.even = even
+        if self.even:
+            self.inv_cdf = lambda x: norm.ppf(x, loc=p_mean, scale=p_std)
+            self.rank, self.size = dist.get_rank(), dist.get_world_size()
+    def sample(self, bs, device):
+        if self.even:
+            # buckets = [1/G]
+            start_i, end_i = self.rank * bs, (self.rank + 1) * bs
+            global_batch_size = self.size * bs
+            locs = (th.arange(start_i, end_i) + th.rand(bs)) / global_batch_size
+            log_sigmas = th.tensor(self.inv_cdf(locs), dtype=th.float32, device=device)
+        else:
+            log_sigmas = self.p_mean + self.p_std * th.randn(bs, device=device)
+        sigmas = th.exp(log_sigmas)
+        weights = th.ones_like(sigmas)
+        return sigmas, weights

modules/diffusion/unet/attention.py ADDED Viewed

	@@ -0,0 +1,241 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from modules.general.utils import Conv1d, normalization, zero_module
+from .basic import UNetBlock
+class AttentionBlock(UNetBlock):
+    r"""A spatial transformer encoder block that allows spatial positions to attend
+    to each other. Reference from `latent diffusion repo
+    <https://github.com/Stability-AI/generative-models/blob/main/sgm/modules/attention.py#L531>`_.
+    Args:
+        channels: Number of channels in the input.
+        num_head_channels: Number of channels per attention head. When this is not ``-1``,
+            the head count is derived as ``channels // num_head_channels`` and ``num_heads`` is ignored.
+        num_heads: Number of attention heads. Only used when ``num_head_channels`` is ``-1``.
+        encoder_channels: Number of channels in the encoder output for cross-attention.
+            If ``None``, then self-attention is performed.
+        use_self_attention: Whether to use self-attention before cross-attention, only applicable if encoder_channels is set.
+        dims: Number of spatial dimensions, i.e. 1 for temporal signals, 2 for images.
+        h_dim: The dimension of the height, would be applied if ``dims`` is 2.
+        encoder_hdim: The dimension of the height of the encoder output, would be applied if ``dims`` is 2.
+        p_dropout: Dropout probability.
+    Raises:
+        ValueError: If ``dims`` is neither 1 nor 2.
+    """
+    def __init__(
+        self,
+        channels: int,
+        num_head_channels: int = 32,
+        num_heads: int = -1,
+        encoder_channels: int = None,
+        use_self_attention: bool = False,
+        dims: int = 1,
+        h_dim: int = 100,
+        encoder_hdim: int = 384,
+        p_dropout: float = 0.0,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.p_dropout = p_dropout
+        self.dims = dims
+        if dims == 1:
+            self.channels = channels
+        elif dims == 2:
+            # We consider the channel as product of channel and height, i.e. C x H
+            # This is because we want to apply attention on the audio signal, which is 1D
+            self.channels = channels * h_dim
+        else:
+            raise ValueError(f"invalid number of dimensions: {dims}")
+        # Resolve the head layout: -1 for num_head_channels means "derive the
+        # per-head width from num_heads"; otherwise derive the head count.
+        if num_head_channels == -1:
+            assert (
+                self.channels % num_heads == 0
+            ), f"q,k,v channels {self.channels} is not divisible by num_heads {num_heads}"
+            self.num_heads = num_heads
+            self.num_head_channels = self.channels // num_heads
+        else:
+            assert (
+                self.channels % num_head_channels == 0
+            ), f"q,k,v channels {self.channels} is not divisible by num_head_channels {num_head_channels}"
+            self.num_heads = self.channels // num_head_channels
+            self.num_head_channels = num_head_channels
+        if encoder_channels is not None:
+            # Cross-attention mode, optionally preceded by a self-attention layer.
+            self.use_self_attention = use_self_attention
+            if dims == 1:
+                self.encoder_channels = encoder_channels
+            elif dims == 2:
+                # Same channel-times-height folding as for the input, using encoder_hdim.
+                self.encoder_channels = encoder_channels * encoder_hdim
+            else:
+                raise ValueError(f"invalid number of dimensions: {dims}")
+            if use_self_attention:
+                self.self_attention = BasicAttentionBlock(
+                    self.channels,
+                    self.num_head_channels,
+                    self.num_heads,
+                    p_dropout=self.p_dropout,
+                )
+            self.cross_attention = BasicAttentionBlock(
+                self.channels,
+                self.num_head_channels,
+                self.num_heads,
+                self.encoder_channels,
+                p_dropout=self.p_dropout,
+            )
+        else:
+            # Self-attention only; ``use_self_attention`` is not stored in this mode.
+            self.encoder_channels = None
+            self.self_attention = BasicAttentionBlock(
+                self.channels,
+                self.num_head_channels,
+                self.num_heads,
+                p_dropout=self.p_dropout,
+            )
+    def forward(self, x: torch.Tensor, encoder_output: torch.Tensor = None):
+        r"""
+        Args:
+            x: input tensor with shape [B x ``channels`` x ...]
+            encoder_output: feature tensor with shape [B x ``encoder_channels`` x ...]. Must be ``None``
+                when the block was built without ``encoder_channels`` and must be given otherwise.
+        Returns:
+            output tensor with the same shape as ``x``
+        """
+        shape = x.size()
+        # Flatten everything after the batch dimension to [B, self.channels, -1].
+        x = x.reshape(shape[0], self.channels, -1).contiguous()
+        if self.encoder_channels is None:
+            assert (
+                encoder_output is None
+            ), "encoder_output must be None for self-attention."
+            h = self.self_attention(x)
+        else:
+            assert (
+                encoder_output is not None
+            ), "encoder_output must be given for cross-attention."
+            encoder_output = encoder_output.reshape(
+                shape[0], self.encoder_channels, -1
+            ).contiguous()
+            if self.use_self_attention:
+                x = self.self_attention(x)
+            h = self.cross_attention(x, encoder_output)
+        # Restore the caller's original layout.
+        return h.reshape(*shape).contiguous()
+class BasicAttentionBlock(nn.Module):
+    def __init__(
+        self,
+        channels: int,
+        num_head_channels: int = 32,
+        num_heads: int = -1,
+        context_channels: int = None,
+        p_dropout: float = 0.0,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.p_dropout = p_dropout
+        self.context_channels = context_channels
+        if num_head_channels == -1:
+            assert (
+                self.channels % num_heads == 0
+            ), f"q,k,v channels {self.channels} is not divisible by num_heads {num_heads}"
+            self.num_heads = num_heads
+            self.num_head_channels = self.channels // num_heads
+        else:
+            assert (
+                self.channels % num_head_channels == 0
+            ), f"q,k,v channels {self.channels} is not divisible by num_head_channels {num_head_channels}"
+            self.num_heads = self.channels // num_head_channels
+            self.num_head_channels = num_head_channels
+        if context_channels is not None:
+            self.to_q = nn.Sequential(
+                normalization(self.channels),
+                Conv1d(self.channels, self.channels, 1),
+            )
+            self.to_kv = Conv1d(context_channels, 2 * self.channels, 1)
+        else:
+            self.to_qkv = nn.Sequential(
+                normalization(self.channels),
+                Conv1d(self.channels, 3 * self.channels, 1),
+            )
+        self.linear = Conv1d(self.channels, self.channels)
+        self.proj_out = nn.Sequential(
+            normalization(self.channels),
+            Conv1d(self.channels, self.channels, 1),
+            nn.GELU(),
+            nn.Dropout(p=self.p_dropout),
+            zero_module(Conv1d(self.channels, self.channels, 1)),
+        )
+    def forward(self, q: torch.Tensor, kv: torch.Tensor = None):
+        r"""
+        Args:
+            q: input tensor with shape [B, ``channels``, L]
+            kv: feature tensor with shape [B, ``context_channels``, T], if ``None``, then self-attention is performed.
+        Returns:
+            output tensor with shape [B, ``channels``, L]
+        """
+        N, C, L = q.size()
+        if self.context_channels is not None:
+            assert kv is not None, "kv must be given for cross-attention."
+            q = (
+                self.to_q(q)
+                .reshape(self.num_heads, self.num_head_channels, -1)
+                .transpose(-1, -2)
+                .contiguous()
+            )
+            kv = (
+                self.to_kv(kv)
+                .reshape(2, self.num_heads, self.num_head_channels, -1)
+                .transpose(-1, -2)
+                .chunk(2)
+            )
+            k, v = (
+                kv[0].squeeze(0).contiguous(),
+                kv[1].squeeze(0).contiguous(),
+            )
+        else:
+            qkv = (
+                self.to_qkv(q)
+                .reshape(3, self.num_heads, self.num_head_channels, -1)
+                .transpose(-1, -2)
+                .chunk(3)
+            )
+            q, k, v = (
+                qkv[0].squeeze(0).contiguous(),
+                qkv[1].squeeze(0).contiguous(),
+                qkv[2].squeeze(0).contiguous(),
+            )
+        h = F.scaled_dot_product_attention(q, k, v, dropout_p=self.p_dropout).transpose(
+            -1, -2
+        )
+        h = h.reshape(N, -1, L).contiguous()
+        h = self.linear(h)
+        x = q + h
+        h = self.proj_out(x)
+        return x + h

modules/diffusion/unet/basic.py ADDED Viewed

	@@ -0,0 +1,15 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import torch.nn as nn
+from abc import abstractmethod
+class UNetBlock(nn.Module):
+    r"""Any module where forward() takes timestep embeddings as a second argument."""
+    @abstractmethod
+    def forward(self, x, emb):
+        r"""Apply the module to `x` given `emb` timestep embeddings."""

modules/diffusion/unet/resblock.py ADDED Viewed

	@@ -0,0 +1,178 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .basic import UNetBlock
+from modules.general.utils import (
+    append_dims,
+    ConvNd,
+    normalization,
+    zero_module,
+)
+class ResBlock(UNetBlock):
+    r"""A residual block that can optionally change the number of channels.
+    Args:
+        channels: the number of input channels.
+        emb_channels: the number of timestep embedding channels.
+        dropout: the rate of dropout.
+        out_channels: if specified, the number of out channels.
+        use_conv: if True and out_channels is specified, use a spatial
+            convolution instead of a smaller 1x1 convolution to change the
+            channels in the skip connection.
+        dims: determines if the signal is 1D, 2D, or 3D.
+        up: if True, use this block for upsampling.
+        down: if True, use this block for downsampling.
+    """
+    def __init__(
+        self,
+        channels,
+        emb_channels,
+        dropout: float = 0.0,
+        out_channels=None,
+        use_conv=False,
+        use_scale_shift_norm=False,
+        dims=2,
+        up=False,
+        down=False,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.emb_channels = emb_channels
+        self.dropout = dropout
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.use_scale_shift_norm = use_scale_shift_norm
+        self.in_layers = nn.Sequential(
+            normalization(channels),
+            nn.SiLU(),
+            ConvNd(dims, channels, self.out_channels, 3, padding=1),
+        )
+        self.updown = up or down
+        if up:
+            self.h_upd = Upsample(channels, False, dims)
+            self.x_upd = Upsample(channels, False, dims)
+        elif down:
+            self.h_upd = Downsample(channels, False, dims)
+            self.x_upd = Downsample(channels, False, dims)
+        else:
+            self.h_upd = self.x_upd = nn.Identity()
+        self.emb_layers = nn.Sequential(
+            nn.SiLU(),
+            ConvNd(
+                dims,
+                emb_channels,
+                2 * self.out_channels if use_scale_shift_norm else self.out_channels,
+                1,
+            ),
+        )
+        self.out_layers = nn.Sequential(
+            normalization(self.out_channels),
+            nn.SiLU(),
+            nn.Dropout(p=dropout),
+            zero_module(
+                ConvNd(dims, self.out_channels, self.out_channels, 3, padding=1)
+            ),
+        )
+        if self.out_channels == channels:
+            self.skip_connection = nn.Identity()
+        elif use_conv:
+            self.skip_connection = ConvNd(
+                dims, channels, self.out_channels, 3, padding=1
+            )
+        else:
+            self.skip_connection = ConvNd(dims, channels, self.out_channels, 1)
+    def forward(self, x, emb):
+        """
+        Apply the block to a Tensor, conditioned on a timestep embedding.
+            x: an [N x C x ...] Tensor of features.
+            emb: an [N x emb_channels x ...] Tensor of timestep embeddings.
+        :return: an [N x C x ...] Tensor of outputs.
+        """
+        if self.updown:
+            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
+            h = in_rest(x)
+            h = self.h_upd(h)
+            x = self.x_upd(x)
+            h = in_conv(h)
+        else:
+            h = self.in_layers(x)
+        emb_out = self.emb_layers(emb)
+        emb_out = append_dims(emb_out, h.dim())
+        if self.use_scale_shift_norm:
+            out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
+            scale, shift = torch.chunk(emb_out, 2, dim=1)
+            h = out_norm(h) * (1 + scale) + shift
+            h = out_rest(h)
+        else:
+            h = h + emb_out
+            h = self.out_layers(h)
+        return self.skip_connection(x) + h
+class Upsample(nn.Module):
+    r"""An upsampling layer with an optional convolution.
+    Args:
+        channels: channels in the inputs and outputs.
+        dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
+            upsampling occurs in the inner-two dimensions.
+        out_channels: if specified, the number of out channels.
+    """
+    def __init__(self, channels, dims=2, out_channels=None):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.dims = dims
+        self.conv = ConvNd(dims, self.channels, self.out_channels, 3, padding=1)
+    def forward(self, x):
+        assert x.shape[1] == self.channels
+        if self.dims == 3:
+            x = F.interpolate(
+                x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest"
+            )
+        else:
+            x = F.interpolate(x, scale_factor=2, mode="nearest")
+        x = self.conv(x)
+        return x
+class Downsample(nn.Module):
+    r"""A downsampling layer with an optional convolution.
+    Args:
+        channels: channels in the inputs and outputs.
+        dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
+            downsampling occurs in the inner-two dimensions.
+        out_channels: if specified, the number of output channels.
+    """
+    def __init__(self, channels, dims=2, out_channels=None):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.dims = dims
+        stride = 2 if dims != 3 else (1, 2, 2)
+        self.op = ConvNd(
+            dims, self.channels, self.out_channels, 3, stride=stride, padding=1
+        )
+    def forward(self, x):
+        assert x.shape[1] == self.channels
+        return self.op(x)

modules/diffusion/unet/unet.py ADDED Viewed

	@@ -0,0 +1,310 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+import torch.nn as nn
+from modules.encoder.position_encoder import PositionEncoder
+from modules.general.utils import append_dims, ConvNd, normalization, zero_module
+from .attention import AttentionBlock
+from .resblock import Downsample, ResBlock, Upsample
+class UNet(nn.Module):
+    r"""The full UNet model with attention and timestep embedding.
+    Args:
+        dims: determines if the signal is 1D (temporal), 2D(spatial).
+        in_channels: channels in the input Tensor.
+        model_channels: base channel count for the model.
+        out_channels: channels in the output Tensor.
+        num_res_blocks: number of residual blocks per downsample.
+        channel_mult: channel multiplier for each level of the UNet.
+        num_attn_blocks: number of attention blocks at place.
+        attention_resolutions: a collection of downsample rates at which attention will
+            take place. May be a set, list, or tuple. For example, if this contains 4,
+            then at 4x downsampling, attention will be used.
+        num_heads: the number of attention heads in each attention layer.
+        num_head_channels: if specified, ignore num_heads and instead use a fixed
+            channel width per attention head.
+        d_context: if specified, use for cross-attention channel project.
+        p_dropout: the dropout probability.
+        use_self_attention: Apply self attention before cross attention.
+        num_classes: if specified (as an int), then this model will be class-conditional
+            with ``num_classes`` classes.
+        use_extra_film: if specified, use an extra FiLM-like conditioning mechanism.
+        d_emb: if specified, use for FiLM-like conditioning.
+        use_scale_shift_norm: use a FiLM-like conditioning mechanism.
+        resblock_updown: use residual blocks for up/downsampling.
+    """
+    def __init__(
+        self,
+        dims: int = 1,
+        in_channels: int = 100,
+        model_channels: int = 128,
+        out_channels: int = 100,
+        h_dim: int = 128,
+        num_res_blocks: int = 1,
+        channel_mult: tuple = (1, 2, 4),
+        num_attn_blocks: int = 1,
+        attention_resolutions: tuple = (1, 2, 4),
+        num_heads: int = 1,
+        num_head_channels: int = -1,
+        d_context: int = None,
+        context_hdim: int = 128,
+        p_dropout: float = 0.0,
+        num_classes: int = -1,
+        use_extra_film: str = None,
+        d_emb: int = None,
+        use_scale_shift_norm: bool = True,
+        resblock_updown: bool = False,
+    ):
+        super().__init__()
+        self.dims = dims
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.out_channels = out_channels
+        self.num_res_blocks = num_res_blocks
+        self.channel_mult = channel_mult
+        self.num_attn_blocks = num_attn_blocks
+        self.attention_resolutions = attention_resolutions
+        self.num_heads = num_heads
+        self.num_head_channels = num_head_channels
+        self.d_context = d_context
+        self.p_dropout = p_dropout
+        self.num_classes = num_classes
+        self.use_extra_film = use_extra_film
+        self.d_emb = d_emb
+        self.use_scale_shift_norm = use_scale_shift_norm
+        self.resblock_updown = resblock_updown
+        time_embed_dim = model_channels * 4
+        self.pos_enc = PositionEncoder(model_channels, time_embed_dim)
+        assert (
+            num_classes == -1 or use_extra_film is None
+        ), "You cannot set both num_classes and use_extra_film."
+        if self.num_classes > 0:
+            # TODO: if used for singer, norm should be 1, correct?
+            self.label_emb = nn.Embedding(num_classes, time_embed_dim, max_norm=1.0)
+        elif use_extra_film is not None:
+            assert (
+                d_emb is not None
+            ), "d_emb must be specified if use_extra_film is not None"
+            assert use_extra_film in [
+                "add",
+                "concat",
+            ], f"use_extra_film only supported by add or concat. Your input is {use_extra_film}"
+            self.use_extra_film = use_extra_film
+            self.film_emb = ConvNd(dims, d_emb, time_embed_dim, 1)
+            if use_extra_film == "concat":
+                time_embed_dim *= 2
+        # Input blocks
+        ch = input_ch = int(channel_mult[0] * model_channels)
+        self.input_blocks = nn.ModuleList(
+            [UNetSequential(ConvNd(dims, in_channels, ch, 3, padding=1))]
+        )
+        self._feature_size = ch
+        input_block_chans = [ch]
+        ds = 1
+        for level, mult in enumerate(channel_mult):
+            for _ in range(num_res_blocks):
+                layers = [
+                    ResBlock(
+                        ch,
+                        time_embed_dim,
+                        p_dropout,
+                        out_channels=int(mult * model_channels),
+                        dims=dims,
+                        use_scale_shift_norm=use_scale_shift_norm,
+                    )
+                ]
+                ch = int(mult * model_channels)
+                if ds in attention_resolutions:
+                    for _ in range(num_attn_blocks):
+                        layers.append(
+                            AttentionBlock(
+                                ch,
+                                num_heads=num_heads,
+                                num_head_channels=num_head_channels,
+                                encoder_channels=d_context,
+                                dims=dims,
+                                h_dim=h_dim // (level + 1),
+                                encoder_hdim=context_hdim,
+                                p_dropout=p_dropout,
+                            )
+                        )
+                self.input_blocks.append(UNetSequential(*layers))
+                self._feature_size += ch
+                input_block_chans.append(ch)
+            if level != len(channel_mult) - 1:
+                out_ch = ch
+                self.input_blocks.append(
+                    UNetSequential(
+                        ResBlock(
+                            ch,
+                            time_embed_dim,
+                            p_dropout,
+                            out_channels=out_ch,
+                            dims=dims,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            down=True,
+                        )
+                        if resblock_updown
+                        else Downsample(ch, dims=dims, out_channels=out_ch)
+                    )
+                )
+                ch = out_ch
+                input_block_chans.append(ch)
+                ds *= 2
+                self._feature_size += ch
+        # Middle blocks
+        self.middle_block = UNetSequential(
+            ResBlock(
+                ch,
+                time_embed_dim,
+                p_dropout,
+                dims=dims,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+            AttentionBlock(
+                ch,
+                num_heads=num_heads,
+                num_head_channels=num_head_channels,
+                encoder_channels=d_context,
+                dims=dims,
+                h_dim=h_dim // (level + 1),
+                encoder_hdim=context_hdim,
+                p_dropout=p_dropout,
+            ),
+            ResBlock(
+                ch,
+                time_embed_dim,
+                p_dropout,
+                dims=dims,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+        )
+        self._feature_size += ch
+        # Output blocks
+        self.output_blocks = nn.ModuleList([])
+        for level, mult in tuple(enumerate(channel_mult))[::-1]:
+            for i in range(num_res_blocks + 1):
+                ich = input_block_chans.pop()
+                layers = [
+                    ResBlock(
+                        ch + ich,
+                        time_embed_dim,
+                        p_dropout,
+                        out_channels=int(model_channels * mult),
+                        dims=dims,
+                        use_scale_shift_norm=use_scale_shift_norm,
+                    )
+                ]
+                ch = int(model_channels * mult)
+                if ds in attention_resolutions:
+                    for _ in range(num_attn_blocks):
+                        layers.append(
+                            AttentionBlock(
+                                ch,
+                                num_heads=num_heads,
+                                num_head_channels=num_head_channels,
+                                encoder_channels=d_context,
+                                dims=dims,
+                                h_dim=h_dim // (level + 1),
+                                encoder_hdim=context_hdim,
+                                p_dropout=p_dropout,
+                            )
+                        )
+                if level and i == num_res_blocks:
+                    out_ch = ch
+                    layers.append(
+                        ResBlock(
+                            ch,
+                            time_embed_dim,
+                            p_dropout,
+                            out_channels=out_ch,
+                            dims=dims,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            up=True,
+                        )
+                        if resblock_updown
+                        else Upsample(ch, dims=dims, out_channels=out_ch)
+                    )
+                    ds //= 2
+                self.output_blocks.append(UNetSequential(*layers))
+                self._feature_size += ch
+        # Final proj out
+        self.out = nn.Sequential(
+            normalization(ch),
+            nn.SiLU(),
+            zero_module(ConvNd(dims, input_ch, out_channels, 3, padding=1)),
+        )
+    def forward(self, x, timesteps=None, context=None, y=None, **kwargs):
+        r"""Apply the model to an input batch.
+        Args:
+            x: an [N x C x ...] Tensor of inputs.
+            timesteps: a 1-D batch of timesteps, i.e. [N].
+            context: conditioning Tensor with shape of [N x ``d_context`` x ...] plugged
+            in via cross attention.
+            y: an [N] Tensor of labels, if **class-conditional**.
+            an [N x ``d_emb`` x ...] Tensor if **film-embed conditional**.
+        Returns:
+            an [N x C x ...] Tensor of outputs.
+        """
+        assert (y is None) or (
+            (y is not None)
+            and ((self.num_classes > 0) or (self.use_extra_film is not None))
+        ), f"y must be specified if num_classes or use_extra_film is not None. \nGot num_classes: {self.num_classes}\t\nuse_extra_film: {self.use_extra_film}\t\n"
+        hs = []
+        emb = self.pos_enc(timesteps)
+        emb = append_dims(emb, x.dim())
+        if self.num_classes > 0:
+            assert y.size() == (x.size(0),)
+            emb = emb + self.label_emb(y)
+        elif self.use_extra_film is not None:
+            assert y.size() == (x.size(0), self.d_emb, *x.size()[2:])
+            y = self.film_emb(y)
+            if self.use_extra_film == "add":
+                emb = emb + y
+            elif self.use_extra_film == "concat":
+                emb = torch.cat([emb, y], dim=1)
+        h = x
+        for module in self.input_blocks:
+            h = module(h, emb, context)
+            hs.append(h)
+        h = self.middle_block(h, emb, context)
+        for module in self.output_blocks:
+            h = torch.cat([h, hs.pop()], dim=1)
+            h = module(h, emb, context)
+        return self.out(h)
+class UNetSequential(nn.Sequential):
+    r"""A sequential module that passes embeddings to the children that support it."""
+    def forward(self, x, emb=None, context=None):
+        for layer in self:
+            if isinstance(layer, ResBlock):
+                x = layer(x, emb)
+            elif isinstance(layer, AttentionBlock):
+                x = layer(x, context)
+            else:
+                x = layer(x)
+        return x

modules/distributions/__init__.py ADDED Viewed

File without changes

modules/distributions/distributions.py ADDED Viewed

	@@ -0,0 +1,107 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+import numpy as np
+class AbstractDistribution:
+    def sample(self):
+        raise NotImplementedError()
+    def mode(self):
+        raise NotImplementedError()
+class DiracDistribution(AbstractDistribution):
+    def __init__(self, value):
+        self.value = value
+    def sample(self):
+        return self.value
+    def mode(self):
+        return self.value
+class DiagonalGaussianDistribution(object):
+    def __init__(self, parameters, deterministic=False):
+        self.parameters = parameters
+        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
+        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
+        self.deterministic = deterministic
+        self.std = torch.exp(0.5 * self.logvar)
+        self.var = torch.exp(self.logvar)
+        if self.deterministic:
+            self.var = self.std = torch.zeros_like(self.mean).to(
+                device=self.parameters.device
+            )
+    def sample(self):
+        x = self.mean + self.std * torch.randn(self.mean.shape).to(
+            device=self.parameters.device
+        )
+        return x
+    def kl(self, other=None):
+        if self.deterministic:
+            return torch.Tensor([0.0])
+        else:
+            if other is None:
+                return 0.5 * torch.sum(
+                    torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
+                    dim=[1, 2, 3],
+                )
+            else:
+                return 0.5 * torch.sum(
+                    torch.pow(self.mean - other.mean, 2) / other.var
+                    + self.var / other.var
+                    - 1.0
+                    - self.logvar
+                    + other.logvar,
+                    dim=[1, 2, 3],
+                )
+    def nll(self, sample, dims=[1, 2, 3]):
+        if self.deterministic:
+            return torch.Tensor([0.0])
+        logtwopi = np.log(2.0 * np.pi)
+        return 0.5 * torch.sum(
+            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
+            dim=dims,
+        )
+    def mode(self):
+        return self.mean
+def normal_kl(mean1, logvar1, mean2, logvar2):
+    """
+    source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12
+    Compute the KL divergence between two gaussians.
+    Shapes are automatically broadcasted, so batches can be compared to
+    scalars, among other use cases.
+    """
+    tensor = None
+    for obj in (mean1, logvar1, mean2, logvar2):
+        if isinstance(obj, torch.Tensor):
+            tensor = obj
+            break
+    assert tensor is not None, "at least one argument must be a Tensor"
+    # Force variances to be Tensors. Broadcasting helps convert scalars to
+    # Tensors, but it does not work for torch.exp().
+    logvar1, logvar2 = [
+        x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor)
+        for x in (logvar1, logvar2)
+    ]
+    return 0.5 * (
+        -1.0
+        + logvar2
+        - logvar1
+        + torch.exp(logvar1 - logvar2)
+        + ((mean1 - mean2) ** 2) * torch.exp(-logvar2)
+    )

modules/duration_predictor/__init__.py ADDED Viewed

File without changes

modules/duration_predictor/standard_duration_predictor.py ADDED Viewed

	@@ -0,0 +1,53 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+# This code is modified from https://github.com/jaywalnut310/vits/blob/main/models.py
+import torch
+from torch import nn
+from modules.base.base_module import LayerNorm
+class DurationPredictor(nn.Module):
+    def __init__(
+        self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.gin_channels = gin_channels
+        self.drop = nn.Dropout(p_dropout)
+        self.conv_1 = nn.Conv1d(
+            in_channels, filter_channels, kernel_size, padding=kernel_size // 2
+        )
+        self.norm_1 = LayerNorm(filter_channels)
+        self.conv_2 = nn.Conv1d(
+            filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
+        )
+        self.norm_2 = LayerNorm(filter_channels)
+        self.proj = nn.Conv1d(filter_channels, 1, 1)
+        if gin_channels != 0:
+            self.cond = nn.Conv1d(gin_channels, in_channels, 1)
+    def forward(self, x, x_mask, g=None):
+        x = torch.detach(x)
+        if g is not None:
+            g = torch.detach(g)
+            x = x + self.cond(g)
+        x = self.conv_1(x * x_mask)
+        x = torch.relu(x)
+        x = self.norm_1(x)
+        x = self.drop(x)
+        x = self.conv_2(x * x_mask)
+        x = torch.relu(x)
+        x = self.norm_2(x)
+        x = self.drop(x)
+        x = self.proj(x * x_mask)
+        return x * x_mask

modules/duration_predictor/stochastic_duration_predictor.py ADDED Viewed

	@@ -0,0 +1,120 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+# This code is modified from https://github.com/jaywalnut310/vits/blob/main/models.py
+import torch
+from torch import nn
+from torch.nn import functional as F
+import math
+from modules.flow.modules import *
+class StochasticDurationPredictor(nn.Module):
+    """Flow-based duration predictor.
+    With ``reverse=False`` the forward pass takes durations ``w`` and returns
+    ``nll + logq``; with ``reverse=True`` it draws noise, runs the flows in
+    reverse and returns the first output channel as ``logw``.
+    NOTE(review): Log, ElementwiseAffine, ConvFlow, Flip and DDSConv are
+    presumably provided by the star import from ``modules.flow.modules`` -
+    confirm.
+    """
+    def __init__(
+        self,
+        in_channels,
+        filter_channels,
+        kernel_size,
+        p_dropout,
+        n_flows=4,
+        gin_channels=0,
+    ):
+        super().__init__()
+        # The ``filter_channels`` argument is overridden: the module always uses
+        # ``in_channels`` as its filter width.
+        filter_channels = in_channels
+        self.in_channels = in_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+        self.log_flow = Log()
+        # Main flow stack: one ElementwiseAffine, then n_flows x (ConvFlow, Flip).
+        self.flows = nn.ModuleList()
+        self.flows.append(ElementwiseAffine(2))
+        for i in range(n_flows):
+            self.flows.append(ConvFlow(2, filter_channels, kernel_size, n_layers=3))
+            self.flows.append(Flip())
+        # Layers that embed ``w`` as conditioning for the posterior flows.
+        self.post_pre = nn.Conv1d(1, filter_channels, 1)
+        self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
+        self.post_convs = DDSConv(
+            filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
+        )
+        # Posterior flow stack; its depth is fixed at 4 and does not follow n_flows.
+        self.post_flows = nn.ModuleList()
+        self.post_flows.append(ElementwiseAffine(2))
+        for i in range(4):
+            self.post_flows.append(
+                ConvFlow(2, filter_channels, kernel_size, n_layers=3)
+            )
+            self.post_flows.append(Flip())
+        # Layers that process the input ``x`` into the flows' conditioning.
+        self.pre = nn.Conv1d(in_channels, filter_channels, 1)
+        self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
+        self.convs = DDSConv(
+            filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
+        )
+        # The conditioning projection only exists when gin_channels is non-zero.
+        if gin_channels != 0:
+            self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
+    def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
+        """Return ``nll + logq`` (``reverse=False``) or ``logw`` (``reverse=True``).
+        Args:
+            x: input features; detached before use.
+            x_mask: mask multiplied onto the intermediate tensors.
+            w: durations, required when ``reverse`` is False.
+            g: optional conditioning, detached and added through ``self.cond``.
+            reverse: selects the sampling path instead of the likelihood path.
+            noise_scale: factor applied to the noise drawn in the sampling path.
+        """
+        # Detach so no gradient flows back into the producer of ``x``.
+        x = torch.detach(x)
+        x = self.pre(x)
+        if g is not None:
+            g = torch.detach(g)
+            x = x + self.cond(g)
+        x = self.convs(x, x_mask)
+        x = self.proj(x) * x_mask
+        if not reverse:
+            flows = self.flows
+            assert w is not None
+            logdet_tot_q = 0
+            # Embed the durations; ``x + h_w`` conditions the posterior flows.
+            h_w = self.post_pre(w)
+            h_w = self.post_convs(h_w, x_mask)
+            h_w = self.post_proj(h_w) * x_mask
+            # Two-channel masked Gaussian noise pushed through the posterior flows.
+            e_q = (
+                torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype)
+                * x_mask
+            )
+            z_q = e_q
+            for flow in self.post_flows:
+                z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
+                logdet_tot_q += logdet_q
+            z_u, z1 = torch.split(z_q, [1, 1], 1)
+            # u = sigmoid(z_u) is subtracted from w to form z0.
+            u = torch.sigmoid(z_u) * x_mask
+            z0 = (w - u) * x_mask
+            # log sigmoid(z) + log sigmoid(-z) is the log-derivative of the sigmoid.
+            logdet_tot_q += torch.sum(
+                (F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2]
+            )
+            # Gaussian log-density of the noise minus the accumulated log-determinants.
+            logq = (
+                torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2])
+                - logdet_tot_q
+            )
+            logdet_tot = 0
+            z0, logdet = self.log_flow(z0, x_mask)
+            logdet_tot += logdet
+            z = torch.cat([z0, z1], 1)
+            for flow in flows:
+                z, logdet = flow(z, x_mask, g=x, reverse=reverse)
+                logdet_tot = logdet_tot + logdet
+            # Gaussian negative log-density of z minus the accumulated log-determinants.
+            nll = (
+                torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2])
+                - logdet_tot
+            )
+            return nll + logq
+        else:
+            flows = list(reversed(self.flows))
+            # Drop the second-to-last entry of the reversed list, i.e. the first
+            # ConvFlow that was appended in __init__; the ElementwiseAffine stays.
+            flows = flows[:-2] + [flows[-1]]
+            z = (
+                torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype)
+                * noise_scale
+            )
+            for flow in flows:
+                z = flow(z, x_mask, g=x, reverse=reverse)
+            # Only the first of the two channels is returned.
+            z0, z1 = torch.split(z, [1, 1], 1)
+            logw = z0
+            return logw

modules/encoder/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .token_encoder import TokenEmbedding

modules/encoder/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (206 Bytes). View file

modules/encoder/__pycache__/token_encoder.cpython-39.pyc ADDED Viewed

Binary file (1.08 kB). View file

modules/encoder/condition_encoder.py ADDED Viewed

	@@ -0,0 +1,251 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import numpy as np
+import torch
+import torch.nn as nn
+from torchaudio.models import Conformer
+from models.svc.transformer.transformer import PositionalEncoding
+from utils.f0 import f0_to_coarse
+class ContentEncoder(nn.Module):
+    def __init__(self, cfg, input_dim, output_dim):
+        super().__init__()
+        self.cfg = cfg
+        assert input_dim != 0
+        self.nn = nn.Linear(input_dim, output_dim)
+        # Introduce conformer or not
+        if (
+            "use_conformer_for_content_features" in cfg
+            and cfg.use_conformer_for_content_features
+        ):
+            self.pos_encoder = PositionalEncoding(input_dim)
+            self.conformer = Conformer(
+                input_dim=input_dim,
+                num_heads=2,
+                ffn_dim=256,
+                num_layers=6,
+                depthwise_conv_kernel_size=3,
+            )
+        else:
+            self.conformer = None
+    def forward(self, x, length=None):
+        # x: (N, seq_len, input_dim) -> (N, seq_len, output_dim)
+        if self.conformer:
+            x = self.pos_encoder(x)
+            x, _ = self.conformer(x, length)
+        return self.nn(x)
+class MelodyEncoder(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        self.cfg = cfg
+        self.input_dim = self.cfg.input_melody_dim
+        self.output_dim = self.cfg.output_melody_dim
+        self.n_bins = self.cfg.n_bins_melody
+        self.pitch_min = self.cfg.pitch_min
+        self.pitch_max = self.cfg.pitch_max
+        if self.input_dim != 0:
+            if self.n_bins == 0:
+                # Not use quantization
+                self.nn = nn.Linear(self.input_dim, self.output_dim)
+            else:
+                self.f0_min = cfg.f0_min
+                self.f0_max = cfg.f0_max
+                self.nn = nn.Embedding(
+                    num_embeddings=self.n_bins,
+                    embedding_dim=self.output_dim,
+                    padding_idx=None,
+                )
+                self.uv_embedding = nn.Embedding(2, self.output_dim)
+                # self.conformer = Conformer(
+                #     input_dim=self.output_dim,
+                #     num_heads=4,
+                #     ffn_dim=128,
+                #     num_layers=4,
+                #     depthwise_conv_kernel_size=3,
+                # )
+    def forward(self, x, uv=None, length=None):
+        # x: (N, frame_len)
+        # print(x.shape)
+        if self.n_bins == 0:
+            x = x.unsqueeze(-1)
+        else:
+            x = f0_to_coarse(x, self.n_bins, self.f0_min, self.f0_max)
+            x = self.nn(x)
+            if uv is not None:
+                uv = self.uv_embedding(uv)
+                x = x + uv
+            # x, _ = self.conformer(x, length)
+        return x
+class LoudnessEncoder(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        self.cfg = cfg
+        self.input_dim = self.cfg.input_loudness_dim
+        self.output_dim = self.cfg.output_loudness_dim
+        self.n_bins = self.cfg.n_bins_loudness
+        if self.input_dim != 0:
+            if self.n_bins == 0:
+                # Not use quantization
+                self.nn = nn.Linear(self.input_dim, self.output_dim)
+            else:
+                # TODO: set trivially now
+                self.loudness_min = 1e-30
+                self.loudness_max = 1.5
+                if cfg.use_log_loudness:
+                    self.energy_bins = nn.Parameter(
+                        torch.exp(
+                            torch.linspace(
+                                np.log(self.loudness_min),
+                                np.log(self.loudness_max),
+                                self.n_bins - 1,
+                            )
+                        ),
+                        requires_grad=False,
+                    )
+                self.nn = nn.Embedding(
+                    num_embeddings=self.n_bins,
+                    embedding_dim=self.output_dim,
+                    padding_idx=None,
+                )
+    def forward(self, x):
+        # x: (N, frame_len)
+        if self.n_bins == 0:
+            x = x.unsqueeze(-1)
+        else:
+            x = torch.bucketize(x, self.energy_bins)
+        return self.nn(x)
+class SingerEncoder(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        self.cfg = cfg
+        self.input_dim = 1
+        self.output_dim = self.cfg.output_singer_dim
+        self.nn = nn.Embedding(
+            num_embeddings=cfg.singer_table_size,
+            embedding_dim=self.output_dim,
+            padding_idx=None,
+        )
+    def forward(self, x):
+        # x: (N, 1) -> (N, 1, output_dim)
+        return self.nn(x)
+class ConditionEncoder(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        self.cfg = cfg
+        self.merge_mode = cfg.merge_mode
+        if cfg.use_whisper:
+            self.whisper_encoder = ContentEncoder(
+                self.cfg, self.cfg.whisper_dim, self.cfg.content_encoder_dim
+            )
+        if cfg.use_contentvec:
+            self.contentvec_encoder = ContentEncoder(
+                self.cfg, self.cfg.contentvec_dim, self.cfg.content_encoder_dim
+            )
+        if cfg.use_mert:
+            self.mert_encoder = ContentEncoder(
+                self.cfg, self.cfg.mert_dim, self.cfg.content_encoder_dim
+            )
+        if cfg.use_wenet:
+            self.wenet_encoder = ContentEncoder(
+                self.cfg, self.cfg.wenet_dim, self.cfg.content_encoder_dim
+            )
+        self.melody_encoder = MelodyEncoder(self.cfg)
+        self.loudness_encoder = LoudnessEncoder(self.cfg)
+        if cfg.use_spkid:
+            self.singer_encoder = SingerEncoder(self.cfg)
+    def forward(self, x):
+        outputs = []
+        if "frame_pitch" in x.keys():
+            if "frame_uv" not in x.keys():
+                x["frame_uv"] = None
+            pitch_enc_out = self.melody_encoder(
+                x["frame_pitch"], uv=x["frame_uv"], length=x["target_len"]
+            )
+            outputs.append(pitch_enc_out)
+        if "frame_energy" in x.keys():
+            loudness_enc_out = self.loudness_encoder(x["frame_energy"])
+            outputs.append(loudness_enc_out)
+        if "whisper_feat" in x.keys():
+            # whisper_feat: [b, T, 1024]
+            whiser_enc_out = self.whisper_encoder(
+                x["whisper_feat"], length=x["target_len"]
+            )
+            outputs.append(whiser_enc_out)
+            seq_len = whiser_enc_out.shape[1]
+        if "contentvec_feat" in x.keys():
+            contentvec_enc_out = self.contentvec_encoder(
+                x["contentvec_feat"], length=x["target_len"]
+            )
+            outputs.append(contentvec_enc_out)
+            seq_len = contentvec_enc_out.shape[1]
+        if "mert_feat" in x.keys():
+            mert_enc_out = self.mert_encoder(x["mert_feat"], length=x["target_len"])
+            outputs.append(mert_enc_out)
+            seq_len = mert_enc_out.shape[1]
+        if "wenet_feat" in x.keys():
+            wenet_enc_out = self.wenet_encoder(x["wenet_feat"], length=x["target_len"])
+            outputs.append(wenet_enc_out)
+            seq_len = wenet_enc_out.shape[1]
+        if "spk_id" in x.keys():
+            speaker_enc_out = self.singer_encoder(x["spk_id"])  # [b, 1, 384]
+            assert (
+                "whisper_feat" in x.keys()
+                or "contentvec_feat" in x.keys()
+                or "mert_feat" in x.keys()
+                or "wenet_feat" in x.keys()
+            )
+            singer_info = speaker_enc_out.expand(-1, seq_len, -1)
+            outputs.append(singer_info)
+        encoder_output = None
+        if self.merge_mode == "concat":
+            encoder_output = torch.cat(outputs, dim=-1)
+        if self.merge_mode == "add":
+            # (#modules, N, seq_len, output_dim)
+            outputs = torch.cat([out[None, :, :, :] for out in outputs], dim=0)
+            # (N, seq_len, output_dim)
+            encoder_output = torch.sum(outputs, dim=0)
+        return encoder_output

modules/encoder/conv_encoder.py ADDED Viewed

	@@ -0,0 +1,103 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.utils import spectral_norm
+from modules.generic.conv import Conv1d
+class ConvEncoder(nn.Module):
+    def __init__(self, in_channels, z_channels, spk_channels, num_dilation_layer=10):
+        super(ConvEncoder, self).__init__()
+        self.in_channels = in_channels
+        self.z_channels = z_channels
+        self.spk_channels = spk_channels
+        self.pre_process = Conv1d(in_channels, 512, kernel_size=3)
+        self.dilated_conv_layers = nn.ModuleList()
+        for i in range(num_dilation_layer):
+            dilation = 2**i
+            self.dilated_conv_layers.append(
+                DilatedConvBlock(512, 512, z_channels, spk_channels, dilation)
+            )
+    def forward(self, inputs, z, s):
+        inputs = inputs.transpose(1, 2)
+        outputs = self.pre_process(inputs)
+        print(inputs.shape)
+        for layer in self.dilated_conv_layers:
+            outputs = layer(outputs, z, s)
+        encoder_outputs = outputs.transpose(1, 2)
+        return encoder_outputs
+class DilatedConvBlock(nn.Module):
+    """A stack of dilated convolutions interspersed
+    with batch normalisation and ReLU activations"""
+    def __init__(self, in_channels, out_channels, z_channels, s_channels, dilation):
+        super(DilatedConvBlock, self).__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.z_channels = z_channels
+        self.s_channels = s_channels
+        self.conv1d = Conv1d(
+            in_channels, out_channels, kernel_size=3, dilation=dilation
+        )
+        self.batch_layer = BatchNorm1dLayer(out_channels, s_channels, z_channels)
+    def forward(self, inputs, z, s):
+        outputs = self.conv1d(inputs)
+        outputs = self.batch_layer(outputs, z, s)
+        return F.relu(outputs)
+class BatchNorm1dLayer(nn.Module):
+    """The latents z and speaker embedding s modulate the scale and
+    shift parameters of the batch normalisation layers"""
+    def __init__(self, num_features, s_channels=128, z_channels=128):
+        super().__init__()
+        self.num_features = num_features
+        self.s_channels = s_channels
+        self.z_channels = z_channels
+        # NOTE: the attribute name is misspelled ("nrom"), but it is part of the
+        # state-dict keys for the running statistics, so it is kept as-is.
+        self.batch_nrom = nn.BatchNorm1d(num_features, affine=False)
+        # Scale is predicted from the latent z.
+        # NOTE(review): spectral_norm appears to recompute ``weight`` from
+        # ``weight_orig`` on forward, so the in-place initialisations of
+        # ``.weight`` below may be overwritten - confirm.
+        self.scale_layer = spectral_norm(nn.Linear(z_channels, num_features))
+        self.scale_layer.weight.data.normal_(1, 0.02)  # Initialise scale weights at N(1, 0.02)
+        self.scale_layer.bias.data.zero_()  # Initialise bias at 0
+        # Shift is predicted from the speaker embedding s.
+        self.shift_layer = spectral_norm(nn.Linear(s_channels, num_features))
+        self.shift_layer.weight.data.normal_(1, 0.02)  # Initialise shift weights at N(1, 0.02)
+        self.shift_layer.bias.data.zero_()  # Initialise bias at 0
+    def forward(self, inputs, z, s):
+        # Normalise without learned affine parameters, then apply the
+        # conditional scale/shift, reshaped to (-1, num_features, 1) so they
+        # broadcast over the last axis of ``outputs``.
+        outputs = self.batch_nrom(inputs)
+        scale = self.scale_layer(z)
+        scale = scale.view(-1, self.num_features, 1)
+        shift = self.shift_layer(s)
+        shift = shift.view(-1, self.num_features, 1)
+        outputs = scale * outputs + shift
+        return outputs
+if __name__ == "__main__":
+    model = ConvEncoder(256, 64, 64)
+    encoder_inputs = torch.randn(2, 256, 10)
+    z = torch.randn(2, 64)
+    speaker = torch.randn(1, 64)
+    outputs, duration = model(encoder_inputs, z, speaker)
+    print(outputs.shape, duration.shape)

modules/encoder/position_encoder.py ADDED Viewed

	@@ -0,0 +1,85 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+import torch
+import torch.nn as nn
+from modules.general.utils import Linear
+class PositionEncoder(nn.Module):
+    r"""Encoder of positional embedding, generates PE and then
+    feed into 2 full-connected layers with ``SiLU``.
+    Args:
+        d_raw_emb: The dimension of raw embedding vectors.
+        d_out: The dimension of output embedding vectors, default to ``d_raw_emb``.
+        d_mlp: The dimension of hidden layer in MLP, default to ``d_raw_emb`` * 4.
+        activation_function: The activation function used in MLP, default to ``SiLU``.
+        n_layer: The number of layers in MLP, default to 2.
+        max_period: controls the minimum frequency of the embeddings.
+    """
+    def __init__(
+        self,
+        d_raw_emb: int = 128,
+        d_out: int = None,
+        d_mlp: int = None,
+        activation_function: str = "SiLU",
+        n_layer: int = 2,
+        max_period: int = 10000,
+    ):
+        super().__init__()
+        self.d_raw_emb = d_raw_emb
+        self.d_out = d_raw_emb if d_out is None else d_out
+        self.d_mlp = d_raw_emb * 4 if d_mlp is None else d_mlp
+        self.n_layer = n_layer
+        self.max_period = max_period
+        if activation_function.lower() == "silu":
+            self.activation_function = "SiLU"
+        elif activation_function.lower() == "relu":
+            self.activation_function = "ReLU"
+        elif activation_function.lower() == "gelu":
+            self.activation_function = "GELU"
+        else:
+            raise ValueError("activation_function must be one of SiLU, ReLU, GELU")
+        self.activation_function = activation_function
+        tmp = [Linear(self.d_raw_emb, self.d_mlp), getattr(nn, activation_function)()]
+        for _ in range(self.n_layer - 1):
+            tmp.append(Linear(self.d_mlp, self.d_mlp))
+            tmp.append(getattr(nn, activation_function)())
+        tmp.append(Linear(self.d_mlp, self.d_out))
+        self.out = nn.Sequential(*tmp)
+    def forward(self, steps: torch.Tensor) -> torch.Tensor:
+        r"""Create and return sinusoidal timestep embeddings directly.
+        Args:
+            steps: a 1D Tensor of N indices, one per batch element.
+                These may be fractional.
+        Returns:
+            an [N x ``d_out``] Tensor of positional embeddings.
+        """
+        half = self.d_raw_emb // 2
+        freqs = torch.exp(
+            -math.log(self.max_period)
+            / half
+            * torch.arange(half, dtype=torch.float32, device=steps.device)
+        )
+        args = steps[:, None].float() * freqs[None]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if self.d_raw_emb % 2:
+            embedding = torch.cat(
+                [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
+            )
+        return self.out(embedding)

modules/encoder/token_encoder.py ADDED Viewed

	@@ -0,0 +1,25 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+# This code is modified from https://github.com/lifeiteng/vall-e
+import torch
+import torch.nn as nn
+class TokenEmbedding(nn.Module):
+    def __init__(self, dim_model: int, vocab_size: int, dropout: float = 0.0):
+        super().__init__()
+        self.dropout = nn.Dropout(p=dropout)
+        self.word_embeddings = nn.Embedding(vocab_size, dim_model)
+    @property
+    def weight(self) -> torch.Tensor:
+        return self.word_embeddings.weight
+    def forward(self, x: torch.Tensor):
+        x = self.word_embeddings(x)
+        x = self.dropout(x)
+        return x

modules/flow/modules.py ADDED Viewed

	@@ -0,0 +1,457 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+# This code is modified from https://github.com/jaywalnut310/vits/
+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.nn import Conv1d
+from torch.nn.utils import weight_norm, remove_weight_norm
+from utils.util import *
+from modules.transformer.transforms import (
+    piecewise_rational_quadratic_transform,
+)
+from modules.base.base_module import LayerNorm
+# Negative slope shared by the leaky-ReLU activations in ResBlock1 and ResBlock2.
+LRELU_SLOPE = 0.1
+class DDSConv(nn.Module):
+    """
+    Dialted and Depth-Separable Convolution
+    """
+    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
+        super().__init__()
+        self.channels = channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+        self.drop = nn.Dropout(p_dropout)
+        self.convs_sep = nn.ModuleList()
+        self.convs_1x1 = nn.ModuleList()
+        self.norms_1 = nn.ModuleList()
+        self.norms_2 = nn.ModuleList()
+        for i in range(n_layers):
+            dilation = kernel_size**i
+            padding = (kernel_size * dilation - dilation) // 2
+            self.convs_sep.append(
+                nn.Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    groups=channels,
+                    dilation=dilation,
+                    padding=padding,
+                )
+            )
+            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
+            self.norms_1.append(LayerNorm(channels))
+            self.norms_2.append(LayerNorm(channels))
+    def forward(self, x, x_mask, g=None):
+        if g is not None:
+            x = x + g
+        for i in range(self.n_layers):
+            y = self.convs_sep[i](x * x_mask)
+            y = self.norms_1[i](y)
+            y = F.gelu(y)
+            y = self.convs_1x1[i](y)
+            y = self.norms_2[i](y)
+            y = F.gelu(y)
+            y = self.drop(y)
+            x = x + y
+        return x * x_mask
+class WN(torch.nn.Module):
+    def __init__(
+        self,
+        hidden_channels,
+        kernel_size,
+        dilation_rate,
+        n_layers,
+        gin_channels=0,
+        p_dropout=0,
+    ):
+        super(WN, self).__init__()
+        assert kernel_size % 2 == 1
+        self.hidden_channels = hidden_channels
+        self.kernel_size = (kernel_size,)
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+        self.p_dropout = p_dropout
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.drop = nn.Dropout(p_dropout)
+        if gin_channels != 0:
+            cond_layer = torch.nn.Conv1d(
+                gin_channels, 2 * hidden_channels * n_layers, 1
+            )
+            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
+        for i in range(n_layers):
+            dilation = dilation_rate**i
+            padding = int((kernel_size * dilation - dilation) / 2)
+            in_layer = torch.nn.Conv1d(
+                hidden_channels,
+                2 * hidden_channels,
+                kernel_size,
+                dilation=dilation,
+                padding=padding,
+            )
+            in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
+            self.in_layers.append(in_layer)
+            # last one is not necessary
+            if i < n_layers - 1:
+                res_skip_channels = 2 * hidden_channels
+            else:
+                res_skip_channels = hidden_channels
+            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
+            self.res_skip_layers.append(res_skip_layer)
+    def forward(self, x, x_mask, g=None, **kwargs):
+        output = torch.zeros_like(x)
+        n_channels_tensor = torch.IntTensor([self.hidden_channels])
+        if g is not None:
+            g = self.cond_layer(g)
+        for i in range(self.n_layers):
+            x_in = self.in_layers[i](x)
+            if g is not None:
+                cond_offset = i * 2 * self.hidden_channels
+                g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
+            else:
+                g_l = torch.zeros_like(x_in)
+            acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
+            acts = self.drop(acts)
+            res_skip_acts = self.res_skip_layers[i](acts)
+            if i < self.n_layers - 1:
+                res_acts = res_skip_acts[:, : self.hidden_channels, :]
+                x = (x + res_acts) * x_mask
+                output = output + res_skip_acts[:, self.hidden_channels :, :]
+            else:
+                output = output + res_skip_acts
+        return output * x_mask
+    def remove_weight_norm(self):
+        if self.gin_channels != 0:
+            torch.nn.utils.remove_weight_norm(self.cond_layer)
+        for l in self.in_layers:
+            torch.nn.utils.remove_weight_norm(l)
+        for l in self.res_skip_layers:
+            torch.nn.utils.remove_weight_norm(l)
+class ResBlock1(torch.nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super(ResBlock1, self).__init__()
+        self.convs1 = nn.ModuleList(
+            [
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[0],
+                        padding=get_padding(kernel_size, dilation[0]),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[1],
+                        padding=get_padding(kernel_size, dilation[1]),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[2],
+                        padding=get_padding(kernel_size, dilation[2]),
+                    )
+                ),
+            ]
+        )
+        self.convs1.apply(init_weights)
+        self.convs2 = nn.ModuleList(
+            [
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=1,
+                        padding=get_padding(kernel_size, 1),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=1,
+                        padding=get_padding(kernel_size, 1),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=1,
+                        padding=get_padding(kernel_size, 1),
+                    )
+                ),
+            ]
+        )
+        self.convs2.apply(init_weights)
+    def forward(self, x, x_mask=None):
+        for c1, c2 in zip(self.convs1, self.convs2):
+            xt = F.leaky_relu(x, LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c1(xt)
+            xt = F.leaky_relu(xt, LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c2(xt)
+            x = xt + x
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+    def remove_weight_norm(self):
+        for l in self.convs1:
+            remove_weight_norm(l)
+        for l in self.convs2:
+            remove_weight_norm(l)
+class ResBlock2(torch.nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
+        super(ResBlock2, self).__init__()
+        self.convs = nn.ModuleList(
+            [
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[0],
+                        padding=get_padding(kernel_size, dilation[0]),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[1],
+                        padding=get_padding(kernel_size, dilation[1]),
+                    )
+                ),
+            ]
+        )
+        self.convs.apply(init_weights)
+    def forward(self, x, x_mask=None):
+        for c in self.convs:
+            xt = F.leaky_relu(x, LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c(xt)
+            x = xt + x
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+    def remove_weight_norm(self):
+        for l in self.convs:
+            remove_weight_norm(l)
+class Log(nn.Module):
+    def forward(self, x, x_mask, reverse=False, **kwargs):
+        if not reverse:
+            y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
+            logdet = torch.sum(-y, [1, 2])
+            return y, logdet
+        else:
+            x = torch.exp(x) * x_mask
+            return x
+class Flip(nn.Module):
+    def forward(self, x, *args, reverse=False, **kwargs):
+        x = torch.flip(x, [1])
+        if not reverse:
+            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+            return x, logdet
+        else:
+            return x
+class ElementwiseAffine(nn.Module):
+    def __init__(self, channels):
+        super().__init__()
+        self.channels = channels
+        self.m = nn.Parameter(torch.zeros(channels, 1))
+        self.logs = nn.Parameter(torch.zeros(channels, 1))
+    def forward(self, x, x_mask, reverse=False, **kwargs):
+        if not reverse:
+            y = self.m + torch.exp(self.logs) * x
+            y = y * x_mask
+            logdet = torch.sum(self.logs * x_mask, [1, 2])
+            return y, logdet
+        else:
+            x = (x - self.m) * torch.exp(-self.logs) * x_mask
+            return x
+class ResidualCouplingLayer(nn.Module):
+    def __init__(
+        self,
+        channels,
+        hidden_channels,
+        kernel_size,
+        dilation_rate,
+        n_layers,
+        p_dropout=0,
+        gin_channels=0,
+        mean_only=False,
+    ):
+        assert channels % 2 == 0, "channels should be divisible by 2"
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.half_channels = channels // 2
+        self.mean_only = mean_only
+        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+        self.enc = WN(
+            hidden_channels,
+            kernel_size,
+            dilation_rate,
+            n_layers,
+            p_dropout=p_dropout,
+            gin_channels=gin_channels,
+        )
+        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+        self.post.weight.data.zero_()
+        self.post.bias.data.zero_()
+    def forward(self, x, x_mask, g=None, reverse=False):
+        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+        h = self.pre(x0) * x_mask
+        h = self.enc(h, x_mask, g=g)
+        stats = self.post(h) * x_mask
+        if not self.mean_only:
+            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
+        else:
+            m = stats
+            logs = torch.zeros_like(m)
+        if not reverse:
+            x1 = m + x1 * torch.exp(logs) * x_mask
+            x = torch.cat([x0, x1], 1)
+            logdet = torch.sum(logs, [1, 2])
+            return x, logdet
+        else:
+            x1 = (x1 - m) * torch.exp(-logs) * x_mask
+            x = torch.cat([x0, x1], 1)
+            return x
+class ConvFlow(nn.Module):
+    """Coupling flow whose transform is a piecewise rational-quadratic spline.
+    The first half of the channels goes through a 1x1 conv, a ``DDSConv``
+    stack and a zero-initialised projection; the result supplies the spline
+    parameters applied to the second half.
+    """
+    def __init__(
+        self,
+        in_channels,
+        filter_channels,
+        kernel_size,
+        n_layers,
+        num_bins=10,
+        tail_bound=5.0,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.num_bins = num_bins
+        self.tail_bound = tail_bound
+        self.half_channels = in_channels // 2
+        self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
+        self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
+        # Per transformed channel: num_bins widths + num_bins heights +
+        # (num_bins - 1) derivatives = num_bins * 3 - 1 spline parameters.
+        self.proj = nn.Conv1d(
+            filter_channels, self.half_channels * (num_bins * 3 - 1), 1
+        )
+        # Zero-initialise the projection so all spline parameters start at 0.
+        self.proj.weight.data.zero_()
+        self.proj.bias.data.zero_()
+    def forward(self, x, x_mask, g=None, reverse=False):
+        # Returns (x, logdet) when reverse is False, otherwise only x.
+        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+        h = self.pre(x0)
+        h = self.convs(h, x_mask, g=g)
+        h = self.proj(h) * x_mask
+        b, c, t = x0.shape
+        # Move the per-channel spline parameters to the last axis.
+        h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2)  # [b, cx?, t] -> [b, c, t, ?]
+        # Widths and heights are scaled by 1 / sqrt(filter_channels);
+        # the derivatives are used unscaled.
+        unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
+        unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
+            self.filter_channels
+        )
+        unnormalized_derivatives = h[..., 2 * self.num_bins :]
+        x1, logabsdet = piecewise_rational_quadratic_transform(
+            x1,
+            unnormalized_widths,
+            unnormalized_heights,
+            unnormalized_derivatives,
+            inverse=reverse,
+            tails="linear",
+            tail_bound=self.tail_bound,
+        )
+        x = torch.cat([x0, x1], 1) * x_mask
+        # The log-determinant is computed in both directions but only
+        # returned on the forward pass.
+        logdet = torch.sum(logabsdet * x_mask, [1, 2])
+        if not reverse:
+            return x, logdet
+        else:
+            return x

modules/general/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+from .input_strategies import PromptedFeatures, PromptedPrecomputedFeatures
+from .scaling import BalancedDoubleSwish
+from .utils import Transpose

modules/general/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (339 Bytes). View file

modules/general/__pycache__/input_strategies.cpython-39.pyc ADDED Viewed

Binary file (5.64 kB). View file

modules/general/__pycache__/scaling.cpython-39.pyc ADDED Viewed

Binary file (39.7 kB). View file

modules/general/__pycache__/utils.cpython-39.pyc ADDED Viewed

Binary file (3.54 kB). View file

modules/general/input_strategies.py ADDED Viewed

	@@ -0,0 +1,130 @@

+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+# This code is modified from
+# https://github.com/lifeiteng/vall-e/blob/9c69096d603ce13174fb5cb025f185e2e9b36ac7/valle/data/input_strategies.py
+import random
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
+from typing import Tuple, Type
+from lhotse import CutSet
+from lhotse.dataset.collation import collate_features
+from lhotse.dataset.input_strategies import (
+    ExecutorType,
+    PrecomputedFeatures,
+    _get_executor,
+)
+from lhotse.utils import fastcopy
class PromptedFeatures:
    """Bundle a prompt object with its matching feature object.

    The wrapper forwards a few tensor-like operations (``to``, ``sum``,
    ``ndim``) so the pair can be handled like a single feature tensor.

    Args:
        prompts: the prompt member of the pair.
        features: the feature member of the pair.
    """

    def __init__(self, prompts, features):
        self.prompts = prompts
        self.features = features

    def to(self, device):
        """Return a new pair with both members moved to ``device``."""
        moved_prompts = self.prompts.to(device)
        moved_features = self.features.to(device)
        return PromptedFeatures(moved_prompts, moved_features)

    def sum(self):
        """Return ``features.sum()``; the prompts do not contribute."""
        return self.features.sum()

    @property
    def ndim(self):
        """Number of dimensions of ``features``."""
        return self.features.ndim

    @property
    def data(self):
        """The ``(prompts, features)`` tuple."""
        return self.prompts, self.features
class PromptedPrecomputedFeatures(PrecomputedFeatures):
    """Precomputed-feature input strategy that also samples a prompt per cut.

    At construction time every cut id is mapped to a list of "neighbor" cuts
    (adjacent utterances, see the ``_process_*`` helpers).  When called on a
    batch, one neighbor is drawn at random for each cut, truncated, and
    collated as that cut's prompt.

    Args:
        dataset: dataset name, case-insensitive; ``"libritts"`` or
            ``"ljspeech"``.
        cuts: the cut set used to build the neighbor table.
        num_workers: forwarded to ``PrecomputedFeatures``.
        executor_type: forwarded to ``PrecomputedFeatures``.

    Raises:
        ValueError: if ``dataset`` is not one of the supported names.
    """

    def __init__(
        self,
        dataset: str,
        cuts: CutSet,
        num_workers: int = 0,
        executor_type: Type[ExecutorType] = ThreadPoolExecutor,
    ) -> None:
        super().__init__(num_workers, executor_type)
        self.utt2neighbors = self._create_utt2neighbors(dataset, cuts)

    def __call__(self, cuts: CutSet) -> Tuple[PromptedFeatures, PromptedFeatures]:
        """Collate features and prompts for ``cuts``.

        Returns:
            A pair ``(PromptedFeatures(prompts, features),
            PromptedFeatures(prompts_lens, features_lens))``.
        """
        features, features_lens = self._collate_features(cuts)
        prompts, prompts_lens = self._collate_prompts(cuts)
        return PromptedFeatures(prompts, features), PromptedFeatures(
            prompts_lens, features_lens
        )

    def _create_utt2neighbors(self, dataset, cuts):
        """Build the ``cut id -> [neighbor cuts]`` table for ``dataset``."""
        utt2neighbors = defaultdict(list)
        utt2cut = {cut.id: cut for cut in cuts}
        dataset_name = dataset.lower()
        if dataset_name == "libritts":
            self._process_libritts_dataset(utt2neighbors, utt2cut, cuts)
        elif dataset_name == "ljspeech":
            self._process_ljspeech_dataset(utt2neighbors, utt2cut, cuts)
        else:
            raise ValueError("Unsupported dataset")
        return utt2neighbors

    def _process_libritts_dataset(self, utt2neighbors, utt2cut, cuts):
        """Neighbors are the previous/next utterance ids of the same speaker
        (speaker taken from the first supervision, ids in sorted order)."""
        speaker2utts = defaultdict(list)
        for cut in cuts:
            speaker2utts[cut.supervisions[0].speaker].append(cut.id)

        for uttids in speaker2utts.values():
            sorted_uttids = sorted(uttids)
            if len(sorted_uttids) == 1:
                # A lone utterance serves as its own prompt.
                only = sorted_uttids[0]
                utt2neighbors[only].append(utt2cut[only])
                continue

            # The first utterance has no predecessor, so its successor is
            # used in that slot; every utterance therefore has a "previous".
            utt2prevutt = dict(
                zip(sorted_uttids, [sorted_uttids[1]] + sorted_uttids[:-1])
            )
            utt2postutt = dict(zip(sorted_uttids[:-1], sorted_uttids[1:]))

            for utt in sorted_uttids:
                if utt in utt2prevutt:
                    utt2neighbors[utt].append(utt2cut[utt2prevutt[utt]])
                if utt in utt2postutt:
                    utt2neighbors[utt].append(utt2cut[utt2postutt[utt]])

    def _process_ljspeech_dataset(self, utt2neighbors, utt2cut, cuts):
        """Neighbors are the adjacent cuts (in iteration order) whose id shares
        the first five characters with the utterance id."""
        uttids = [cut.id for cut in cuts]
        if not uttids:
            # Nothing to index; avoids indexing uttids[1] below.
            return
        if len(uttids) == 1:
            utt2neighbors[uttids[0]].append(utt2cut[uttids[0]])
            return

        utt2prevutt = dict(zip(uttids, [uttids[1]] + uttids[:-1]))
        utt2postutt = dict(zip(uttids[:-1], uttids[1:]))

        for utt in uttids:
            neighbors = utt2neighbors[utt]
            for other in (utt2prevutt.get(utt), utt2postutt.get(utt)):
                if other and utt[:5] == other[:5]:
                    neighbors.append(utt2cut[other])
            if not neighbors:
                # No adjacent utterance shares the prefix: fall back to the
                # utterance itself (same policy as the single-utterance case)
                # so that prompt sampling never sees an empty candidate list.
                neighbors.append(utt2cut[utt])

    def _collate_features(self, cuts):
        """Collate the precomputed features of ``cuts``."""
        return collate_features(
            cuts,
            executor=_get_executor(self.num_workers, executor_type=self._executor_type),
        )

    def _collate_prompts(self, cuts):
        """Sample one neighbor per cut, truncate them to a common duration
        (at most 3.0 seconds, random offset) and collate their features."""
        prompts_cuts = []
        for k, cut in enumerate(cuts):
            # `.get` avoids inserting keys into the defaultdict; an unknown or
            # neighbor-less cut is prompted with itself instead of crashing
            # in random.choice on an empty list.
            candidates = self.utt2neighbors.get(cut.id) or [cut]
            prompts_cut = random.choice(candidates)
            prompts_cuts.append(fastcopy(prompts_cut, id=f"{cut.id}-{k}"))

        mini_duration = min([cut.duration for cut in prompts_cuts] + [3.0])
        prompts_cuts = CutSet(
            cuts={k: cut for k, cut in enumerate(prompts_cuts)}
        ).truncate(max_duration=mini_duration, offset_type="random", preserve_id=False)

        return collate_features(
            prompts_cuts,
            executor=_get_executor(self.num_workers, executor_type=self._executor_type),
        )

modules/general/scaling.py ADDED Viewed

	@@ -0,0 +1,1349 @@

+# This module is modified from https://github.com/Plachtaa/VALL-E-X/blob/3faaf8ccadb154d63b38070caf518ce9309ea0f4/modules/scaling.py
+import logging
+import random
+import math
+from typing import Optional, Tuple, Union
+import torch
+import torch.nn as nn
+from torch import Tensor
class Transpose(nn.Identity):
    """Swap dimensions 1 and 2 of the input: (N, T, D) -> (N, D, T)."""

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return torch.transpose(input, 1, 2)
class ActivationBalancerFunction(torch.autograd.Function):
    """Identity in the forward pass; perturbs the gradient in the backward pass.

    The backward pass returns ``x_grad - |x_grad| * factor`` where, per
    channel, ``factor = sign_factor + scale_factor * ((x > 0) - 0.5)``
    (the ``sign_factor`` term is dropped when it was passed as ``None``).
    """

    @staticmethod
    def forward(
        ctx,
        x: Tensor,
        scale_factor: Tensor,
        sign_factor: Optional[Tensor],
        channel_dim: int,
    ) -> Tensor:
        # Store the channel axis as a non-negative index.
        ctx.channel_dim = channel_dim + x.ndim if channel_dim < 0 else channel_dim
        to_save = [x > 0, scale_factor]
        if sign_factor is not None:
            to_save.append(sign_factor)
        ctx.save_for_backward(*to_save)
        return x

    @staticmethod
    def backward(ctx, x_grad: Tensor) -> Tuple[Tensor, None, None, None]:
        saved = ctx.saved_tensors
        positive_mask, scale_factor = saved[0], saved[1]
        sign_factor = saved[2] if len(saved) == 3 else None

        # Append one singleton axis for every dimension that follows the
        # channel axis, so the per-channel factors broadcast against x_grad.
        num_trailing = max(x_grad.ndim - 1 - ctx.channel_dim, 0)
        expand = (...,) + (None,) * num_trailing

        factor = scale_factor[expand] * (positive_mask.to(x_grad.dtype) - 0.5)
        if sign_factor is not None:
            factor = sign_factor[expand] + factor

        neg_delta_grad = x_grad.abs() * factor
        return x_grad - neg_delta_grad, None, None, None
+def _compute_scale_factor(
+    x: Tensor,
+    channel_dim: int,
+    min_abs: float,
+    max_abs: float,
+    gain_factor: float,
+    max_factor: float,
+) -> Tensor:
+    if channel_dim < 0:
+        channel_dim += x.ndim
+    sum_dims = [d for d in range(x.ndim) if d != channel_dim]
+    x_abs_mean = torch.mean(x.abs(), dim=sum_dims).to(torch.float32)
+    if min_abs == 0.0:
+        below_threshold = 0.0
+    else:
+        # below_threshold is 0 if x_abs_mean > min_abs, can be at most max_factor if
+        # x_abs)_mean , min_abs.
+        below_threshold = ((min_abs - x_abs_mean) * (gain_factor / min_abs)).clamp(
+            min=0, max=max_factor
+        )
+    above_threshold = ((x_abs_mean - max_abs) * (gain_factor / max_abs)).clamp(
+        min=0, max=max_factor
+    )
+    return below_threshold - above_threshold
+def _compute_sign_factor(
+    x: Tensor,
+    channel_dim: int,
+    min_positive: float,
+    max_positive: float,
+    gain_factor: float,
+    max_factor: float,
+) -> Tensor:
+    if channel_dim < 0:
+        channel_dim += x.ndim
+    sum_dims = [d for d in range(x.ndim) if d != channel_dim]
+    proportion_positive = torch.mean((x > 0).to(torch.float32), dim=sum_dims)
+    if min_positive == 0.0:
+        factor1 = 0.0
+    else:
+        # 0 if proportion_positive >= min_positive, else can be
+        # as large as max_factor.
+        factor1 = (
+            (min_positive - proportion_positive) * (gain_factor / min_positive)
+        ).clamp_(min=0, max=max_factor)
+    if max_positive == 1.0:
+        factor2 = 0.0
+    else:
+        # 0 if self.proportion_positive <= max_positive, else can be
+        # as large as -max_factor.
+        factor2 = (
+            (proportion_positive - max_positive) * (gain_factor / (1.0 - max_positive))
+        ).clamp_(min=0, max=max_factor)
+    sign_factor = factor1 - factor2
+    # require min_positive != 0 or max_positive != 1:
+    assert not isinstance(sign_factor, float)
+    return sign_factor
class ActivationScaleBalancerFunction(torch.autograd.Function):
    """Identity in the forward pass; perturbs the gradient in the backward pass.

    Both ``sign_factor`` and ``scale_factor`` are required tensors here.  The
    backward pass returns ``x_grad - |x_grad| * factor`` with, per channel,
    ``factor = sign_factor + scale_factor * ((x > 0) - 0.5)``.
    """

    @staticmethod
    def forward(
        ctx,
        x: Tensor,
        sign_factor: Tensor,
        scale_factor: Tensor,
        channel_dim: int,
    ) -> Tensor:
        # Store the channel axis as a non-negative index.
        ctx.channel_dim = channel_dim + x.ndim if channel_dim < 0 else channel_dim
        ctx.save_for_backward(x > 0, sign_factor, scale_factor)
        return x

    @staticmethod
    def backward(ctx, x_grad: Tensor) -> Tuple[Tensor, None, None, None]:
        positive_mask, sign_factor, scale_factor = ctx.saved_tensors

        # One trailing singleton axis per dimension after the channel axis,
        # so the per-channel factors broadcast against x_grad.
        num_trailing = max(x_grad.ndim - 1 - ctx.channel_dim, 0)
        expand = (...,) + (None,) * num_trailing

        factor = sign_factor[expand] + scale_factor[expand] * (
            positive_mask.to(x_grad.dtype) - 0.5
        )
        neg_delta_grad = x_grad.abs() * factor
        return x_grad - neg_delta_grad, None, None, None
class RandomClampFunction(torch.autograd.Function):
    """Clamp each element to ``[min, max]`` with probability ``prob``.

    Forward: elements selected by a random mask take their clamped value, the
    others are passed through.  With ``reflect != 0`` the output becomes
    ``ans * (1 + reflect) - x * reflect``.

    Backward: the gradient is passed only where the (pre-reflect) output
    equals the input; the same ``reflect`` combination is then applied.
    """

    @staticmethod
    def forward(
        ctx,
        x: Tensor,
        min: Optional[float],
        max: Optional[float],
        prob: float,
        reflect: float,
    ) -> Tensor:
        clamped = x.clamp(min=min, max=max)
        use_clamped = torch.rand_like(x) < prob
        out = torch.where(use_clamped, clamped, x)
        if x.requires_grad:
            # Mask of elements that were left unchanged by the clamp.
            ctx.save_for_backward(out == x)
            ctx.reflect = reflect
        if reflect == 0.0:
            return out
        return out * (1.0 + reflect) - (x * reflect)

    @staticmethod
    def backward(ctx, ans_grad: Tensor) -> Tuple[Tensor, None, None, None, None]:
        (unchanged,) = ctx.saved_tensors
        reflect = ctx.reflect
        x_grad = ans_grad * unchanged.to(ans_grad.dtype)
        if reflect != 0.0:
            x_grad = x_grad * (1.0 + reflect) - (ans_grad * reflect)
        return x_grad, None, None, None, None
def random_clamp(
    x: Tensor,
    min: Optional[float] = None,
    max: Optional[float] = None,
    prob: float = 0.5,
    reflect: float = 0.0,
):
    """Functional wrapper around ``RandomClampFunction``.

    Args:
        x: input tensor.
        min: lower clamp bound, or None.
        max: upper clamp bound, or None.
        prob: per-element probability of applying the clamp.
        reflect: reflection coefficient passed to ``RandomClampFunction``.
    """
    clamp_args = (x, min, max, prob, reflect)
    return RandomClampFunction.apply(*clamp_args)
def random_cast_to_half(x: Tensor, min_abs: float = 5.0e-06) -> Tensor:
    """Cast ``x`` to float16, randomizing values smaller than ``min_abs``.

    A float16 input is returned unchanged.  Otherwise, elements with
    ``|x| < min_abs`` are replaced by ``sign(x) * min_abs`` with probability
    ``|x| / min_abs`` and by 0 otherwise, which keeps their expectation; all
    other elements are cast directly.
    """
    if x.dtype == torch.float16:
        return x
    magnitude = x.abs()
    # True with probability |x| / min_abs for the small elements.
    rounds_up = torch.rand_like(x) * min_abs < magnitude
    stochastic = min_abs * x.sign() * rounds_up
    result = torch.where(magnitude < min_abs, stochastic, x)
    return result.to(torch.float16)
class RandomGradFunction(torch.autograd.Function):
    """Identity in the forward pass; randomizes tiny float16 gradients.

    In the backward pass a float16 gradient is up-cast to float32 and sent
    through ``random_cast_to_half`` with the stored ``min_abs``; gradients of
    any other dtype are returned untouched.
    """

    @staticmethod
    def forward(ctx, x: Tensor, min_abs: float) -> Tensor:
        ctx.min_abs = min_abs
        return x

    @staticmethod
    def backward(ctx, ans_grad: Tensor) -> Tuple[Tensor, None]:
        if ans_grad.dtype != torch.float16:
            return ans_grad, None
        randomized = random_cast_to_half(
            ans_grad.to(torch.float32), min_abs=ctx.min_abs
        )
        return randomized, None
class RandomGrad(torch.nn.Module):
    """Module wrapper around ``RandomGradFunction``.

    The forward pass is the identity.  In training mode (and outside of
    TorchScript scripting/tracing) the input is routed through
    ``RandomGradFunction`` so that its gradient is post-processed in backward.

    Args:
        min_abs: threshold forwarded to ``RandomGradFunction``.
    """

    def __init__(self, min_abs: float = 5.0e-06):
        super().__init__()
        self.min_abs = min_abs

    def forward(self, x: Tensor):
        # The jit checks stay inline in the condition on purpose.
        if torch.jit.is_scripting() or not self.training or torch.jit.is_tracing():
            return x
        else:
            return RandomGradFunction.apply(x, self.min_abs)
class SoftmaxFunction(torch.autograd.Function):
    """Softmax with a hand-written backward pass computed in float32.

    Forward: ``x.softmax(dim)``; when autocast is enabled the result is cast
    to float16 before being saved and returned.

    Backward: with autocast disabled, the saved output and the incoming
    gradient are up-cast to float32 and the softmax Jacobian-vector product
    ``g * y - y * sum(g * y, dim)`` is returned.
    """

    @staticmethod
    def forward(ctx, x: Tensor, dim: int):
        probs = x.softmax(dim=dim)
        # Under autocast the softmax of a float16 input comes back as float32
        # (presumably because the op is run in float32), so cast it back.
        if torch.is_autocast_enabled():
            probs = probs.to(torch.float16)
        ctx.save_for_backward(probs)
        ctx.x_dtype = x.dtype
        ctx.dim = dim
        return probs

    @staticmethod
    def backward(ctx, ans_grad: Tensor):
        (probs,) = ctx.saved_tensors
        with torch.cuda.amp.autocast(enabled=False):
            grad32 = ans_grad.to(torch.float32)
            probs32 = probs.to(torch.float32)
            weighted = grad32 * probs32
            x_grad = weighted - probs32 * weighted.sum(dim=ctx.dim, keepdim=True)
            return x_grad, None
def softmax(x: Tensor, dim: int):
    """Softmax over ``dim``.

    Uses the plain ``Tensor.softmax`` under TorchScript scripting/tracing and
    ``SoftmaxFunction`` (custom backward) otherwise.
    """
    if torch.jit.is_scripting() or torch.jit.is_tracing():
        return x.softmax(dim)

    return SoftmaxFunction.apply(x, dim)
class MaxEigLimiterFunction(torch.autograd.Function):
    """Identity in the forward pass; adds an auxiliary gradient in backward.

    The auxiliary gradient is the derivative, w.r.t. the input, of the
    proportion of variance explained by ``coeffs * direction``.  It is
    rescaled so that its norm is ``grad_scale`` times the norm of the
    incoming gradient, and then added to that gradient.
    """

    @staticmethod
    def forward(
        ctx,
        x: Tensor,
        coeffs: Tensor,
        direction: Tensor,
        channel_dim: int,
        grad_scale: float,
    ) -> Tensor:
        ctx.channel_dim = channel_dim
        ctx.grad_scale = grad_scale
        ctx.save_for_backward(x.detach(), coeffs.detach(), direction.detach())
        return x

    @staticmethod
    def backward(ctx, x_grad, *args):
        with torch.enable_grad():
            x_orig, coeffs, new_direction = ctx.saved_tensors
            x_orig.requires_grad = True
            num_channels = x_orig.shape[ctx.channel_dim]
            # Flatten to (frames, channels) with the channel axis last.
            frames = x_orig.transpose(ctx.channel_dim, -1).reshape(-1, num_channels)
            new_direction.requires_grad = False
            centered = frames - frames.mean(dim=0)
            total_var = (centered**2).mean()
            residual = centered - coeffs * new_direction
            residual_var = (residual**2).mean()
            # Proportion of the variance accounted for by the given
            # direction; this is the quantity to be minimized.
            variance_proportion = (total_var - residual_var) / (total_var + 1.0e-20)
            variance_proportion.backward()

        aux_grad = x_orig.grad
        # Rescale the auxiliary gradient relative to the incoming one.
        x_extra_grad = (
            aux_grad * ctx.grad_scale * x_grad.norm() / (aux_grad.norm() + 1.0e-20)
        )
        return x_grad + x_extra_grad.detach(), None, None, None, None
class BasicNorm(torch.nn.Module):
    """A simpler, parameter-light replacement for LayerNorm.

    The output is ``x * (mean(x**2, channel_dim) + exp(eps)) ** -0.5`` where
    ``eps`` is stored in log space.  Making the "epsilon" large and
    (optionally) learnable gives the network an explicit ballast term, instead
    of it having to dedicate one feature dimension to a large constant in
    order to control the output magnitude.

    Args:
        num_channels: the number of channels, e.g. 512.
        channel_dim: the axis/dimension corresponding to the channel,
            interpreted as an offset from the input's ndim if negative.
            This is NOT the num_channels; it should typically be one of
            {-2, -1, 0, 1, 2, 3}.
        eps: the initial "epsilon" added as ballast in
            ``scale = ((input_vec**2).mean() + epsilon)**-0.5``.
            It is deliberately large; the name is kept to indicate the
            connection with conventional LayerNorm.
        learn_eps: if true, epsilon is a learnable parameter; if false, it is
            a buffer kept at its initial value.
        eps_min: lower bound (in log space) used by the random clamp in
            training mode.
        eps_max: upper bound (in log space) used by the random clamp in
            training mode.
    """

    def __init__(
        self,
        num_channels: int,
        channel_dim: int = -1,  # CAUTION: see documentation.
        eps: float = 0.25,
        learn_eps: bool = True,
        eps_min: float = -3.0,
        eps_max: float = 3.0,
    ) -> None:
        super().__init__()
        self.num_channels = num_channels
        self.channel_dim = channel_dim
        log_eps = torch.tensor(eps).log().detach()
        if learn_eps:
            self.eps = nn.Parameter(log_eps)
        else:
            self.register_buffer("eps", log_eps)
        self.eps_min = eps_min
        self.eps_max = eps_max

    def forward(self, x: Tensor) -> Tensor:
        assert x.shape[self.channel_dim] == self.num_channels
        log_eps = self.eps
        if self.training and random.random() < 0.25:
            # With probability 0.25, in training mode, clamp eps between the
            # min and max.  Values outside the allowed range then see a noisy
            # effect, which encourages the parameter to stay inside the range.
            log_eps = log_eps.clamp(min=self.eps_min, max=self.eps_max)
        mean_square = torch.mean(x**2, dim=self.channel_dim, keepdim=True)
        scales = (mean_square + log_eps.exp()) ** -0.5
        return x * scales
def ScaledLinear(*args, initial_scale: float = 1.0, **kwargs) -> nn.Linear:
    """Create an ``nn.Linear`` whose initial parameters are rescaled.

    Args:
        *args, **kwargs: forwarded unchanged to ``nn.Linear``
            (e.g. in_features, out_features, bias=False).
        initial_scale: the default-initialized weight is multiplied by this
            value, and the bias (if present) is re-drawn uniformly from
            ``[-0.1 * initial_scale, 0.1 * initial_scale]``.

    Returns:
        The constructed ``nn.Linear``.
    """
    layer = nn.Linear(*args, **kwargs)
    with torch.no_grad():
        layer.weight.mul_(initial_scale)
        if layer.bias is not None:
            bound = 0.1 * initial_scale
            nn.init.uniform_(layer.bias, -bound, bound)
    return layer
def ScaledConv1d(
    *args,
    initial_scale: float = 1.0,
    kernel_size: int = 3,
    padding: str = "same",
    **kwargs,
) -> nn.Conv1d:
    """Create an ``nn.Conv1d`` whose initial parameters are rescaled.

    Args:
        *args, **kwargs: forwarded unchanged to ``nn.Conv1d``
            (e.g. in_channels, out_channels, bias=False).
        initial_scale: the default-initialized weight is multiplied by this
            value, and the bias (if present) is re-drawn uniformly from
            ``[-0.1 * initial_scale, 0.1 * initial_scale]``.
        kernel_size: convolution kernel size.
        padding: padding mode passed to ``nn.Conv1d``.

    Returns:
        The constructed ``nn.Conv1d``.
    """
    conv = nn.Conv1d(*args, kernel_size=kernel_size, padding=padding, **kwargs)
    with torch.no_grad():
        conv.weight.mul_(initial_scale)
        if conv.bias is not None:
            bound = 0.1 * initial_scale
            nn.init.uniform_(conv.bias, -bound, bound)
    return conv
def TransposeScaledConv1d(
    *args,
    initial_scale: float = 1.0,
    kernel_size: int = 3,
    padding: str = "same",
    **kwargs,
) -> nn.Sequential:
    """Build the two-stage module ``Transpose -> ScaledConv1d``.

    All arguments are forwarded to ``ScaledConv1d``.
    """
    conv = ScaledConv1d(
        *args,
        initial_scale=initial_scale,
        kernel_size=kernel_size,
        padding=padding,
        **kwargs,
    )
    return nn.Sequential(Transpose(), conv)
def ScaledConv1dTranspose(
    *args,
    initial_scale: float = 1.0,
    kernel_size: int = 3,
    padding: str = "same",
    **kwargs,
) -> nn.Sequential:
    """Build the two-stage module ``ScaledConv1d -> Transpose``.

    All arguments are forwarded to ``ScaledConv1d``.
    """
    conv = ScaledConv1d(
        *args,
        initial_scale=initial_scale,
        kernel_size=kernel_size,
        padding=padding,
        **kwargs,
    )
    return nn.Sequential(conv, Transpose())
def TransposeConv1d(
    *args, kernel_size: int = 3, padding: str = "same", **kwargs
) -> nn.Sequential:
    """Build the two-stage module ``Transpose -> nn.Conv1d``.

    All arguments are forwarded to ``nn.Conv1d``.
    """
    conv = nn.Conv1d(*args, kernel_size=kernel_size, padding=padding, **kwargs)
    return nn.Sequential(Transpose(), conv)
def Conv1dTranspose(
    *args, kernel_size: int = 3, padding: str = "same", **kwargs
) -> nn.Sequential:
    """Build the two-stage module ``nn.Conv1d -> Transpose``.

    All arguments are forwarded to ``nn.Conv1d``.
    """
    conv = nn.Conv1d(*args, kernel_size=kernel_size, padding=padding, **kwargs)
    return nn.Sequential(conv, Transpose())
class SRLinear(nn.Linear):
    """Linear layer with a sigma-reparameterized weight.

    See https://arxiv.org/abs/2303.06296
    (Stabilizing Transformer Training by Preventing Attention Entropy Collapse).

    The effective weight is ``(sigma / spectral_estimate) * weight`` where
    ``sigma`` is a learnable scalar initialized to 1 and the spectral estimate
    comes from one power-iteration step per call (see ``get_sigma``).
    """

    def __init__(self, in_features, out_features, bias=True, **kwargs):
        super().__init__(in_features, out_features, bias=bias, **kwargs)
        # Running power-iteration vector, kept as a buffer.
        initial_u = nn.functional.normalize(torch.randn(in_features), dim=0)
        self.register_buffer("u", initial_u)
        with torch.no_grad():
            sigma = self.get_sigma()
        self.register_buffer("spectral_norm", sigma)
        self.sigma = nn.Parameter(torch.ones(1))

    def get_sigma(self):
        """Run one power-iteration step and return ``v^T W u``.

        The buffer ``u`` is updated in place on every call.  The vectors are
        computed without gradient; the final product is not, so gradients can
        reach ``weight`` through the returned value.
        """
        weight = self.weight
        with torch.no_grad():
            v = nn.functional.normalize(weight.mv(self.u), dim=0)
            u = nn.functional.normalize(weight.T.mv(v), dim=0)
            self.u.data.copy_(u)
        return torch.einsum("c,cd,d->", v, weight, u)

    def get_weight(self):
        """Return the reparameterized weight; in training mode also refresh
        the ``spectral_norm`` buffer with the current estimate."""
        estimate = self.get_sigma()
        if self.training:
            self.spectral_norm.data.copy_(estimate)
        return (self.sigma / estimate) * self.weight

    def forward(self, x):
        return nn.functional.linear(x, self.get_weight(), self.bias)
class SRConv1d(SRLinear):
    """1-D convolution built on the reparameterized weight of ``SRLinear``.

    The kernel is stored as a 2-D linear weight of shape
    ``(out_features, in_features * kernel_size)`` and viewed as
    ``(out_features, in_features, kernel_size)`` at call time.

    Args:
        in_features: number of input channels.
        out_features: number of output channels.
        kernel_size: convolution kernel size.
        stride: convolution stride.
        padding: padding passed to ``nn.functional.conv1d``.
        bias: whether to use a bias term.
        **kwargs: forwarded to ``SRLinear``.
    """

    def __init__(
        self,
        in_features,
        out_features,
        kernel_size,
        stride: int = 1,
        padding: str = "same",
        bias: bool = True,
        **kwargs,
    ):
        flat_in_features = in_features * kernel_size
        super().__init__(flat_in_features, out_features, bias=bias, **kwargs)
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

    def forward(self, x):
        # self.in_features holds channels * kernel_size; recover the channels.
        channels_in = self.in_features // self.kernel_size
        kernel = self.get_weight().view(
            self.out_features, channels_in, self.kernel_size
        )
        return nn.functional.conv1d(
            x, kernel, bias=self.bias, stride=self.stride, padding=self.padding
        )
def TransposeSRConv1d(
    *args, kernel_size: int = 3, padding: str = "same", **kwargs
) -> nn.Sequential:
    """Build the two-stage module ``Transpose -> SRConv1d``.

    All arguments are forwarded to ``SRConv1d``.
    """
    conv = SRConv1d(*args, kernel_size=kernel_size, padding=padding, **kwargs)
    return nn.Sequential(Transpose(), conv)
def SRConv1dTranspose(
    *args, kernel_size: int = 3, padding: str = "same", **kwargs
) -> nn.Sequential:
    """Build the two-stage module ``SRConv1d -> Transpose``.

    All arguments are forwarded to ``SRConv1d``.
    """
    conv = SRConv1d(*args, kernel_size=kernel_size, padding=padding, **kwargs)
    return nn.Sequential(conv, Transpose())
class ActivationBalancer(torch.nn.Module):
    """Identity module that nudges activation statistics through the gradient.

    The forward output always equals the input.  On a random subset of
    training-time calls, the backpropagated derivatives are modified (via
    ``ActivationBalancerFunction``) to encourage, for each channel:

    * the proportion of positive values to stay within
      ``[min_positive, max_positive]``;
    * the mean absolute value to stay within ``[min_abs, max_abs]``.

    Args:
        num_channels: the number of channels.
        channel_dim: the dimension/axis corresponding to the channel, e.g.
            -1, 0, 1, 2; interpreted as an offset from x.ndim if negative.
        min_positive: the minimum, per channel, of the proportion of the time
            that (x > 0), below which we start to modify the derivatives.
        max_positive: the maximum, per channel, of the proportion of the time
            that (x > 0), above which we start to modify the derivatives.
        max_factor: the maximum factor by which we modify the derivatives for
            either the sign constraint or the magnitude constraint;
            e.g. with max_factor=0.02, the derivatives would be multiplied by
            values in the range [0.98..1.02].
        sign_gain_factor: determines the 'gain' with which we increase the
            change in gradient once the constraints on min_positive and
            max_positive are violated.
        scale_gain_factor: determines the 'gain' with which we increase the
            change in gradient once the constraints on min_abs and max_abs
            are violated.
        min_abs: the minimum per-channel mean absolute value we allow before
            we start to modify the derivatives to prevent this.
        max_abs: the maximum per-channel mean absolute value we allow before
            we start to modify the derivatives to prevent this.
        min_prob: the minimum probability with which we modify the gradients
            on each forward().  This is done randomly to prevent all layers
            from doing it at the same time.  Early in training a higher
            probability is used; it decays to this value.
    """

    def __init__(
        self,
        num_channels: int,
        channel_dim: int,
        min_positive: float = 0.05,
        max_positive: float = 0.95,
        max_factor: float = 0.04,
        sign_gain_factor: float = 0.01,
        scale_gain_factor: float = 0.02,
        min_abs: float = 0.2,
        max_abs: float = 100.0,
        min_prob: float = 0.1,
    ):
        super(ActivationBalancer, self).__init__()
        self.num_channels = num_channels
        self.channel_dim = channel_dim
        self.min_positive = min_positive
        self.max_positive = max_positive
        self.max_factor = max_factor
        self.min_abs = min_abs
        self.max_abs = max_abs
        self.min_prob = min_prob
        self.sign_gain_factor = sign_gain_factor
        self.scale_gain_factor = scale_gain_factor

        # cpu_count measures how many times forward() has been called.  It is
        # occasionally synced to the `count` buffer, which exists so that the
        # value is stored and restored together with the model's state.
        self.cpu_count = 0
        self.register_buffer("count", torch.tensor(0, dtype=torch.int64))

    def forward(self, x: Tensor) -> Tensor:
        if torch.jit.is_scripting() or not x.requires_grad or torch.jit.is_tracing():
            return _no_op(x)

        count = self.cpu_count
        self.cpu_count += 1

        if random.random() < 0.01:
            # Occasionally sync self.cpu_count with self.count.
            # count affects the decay of 'prob'.  Don't do this on every
            # iteration, because syncing with the GPU is slow.
            self.cpu_count = max(self.cpu_count, self.count.item())
            self.count.fill_(self.cpu_count)

        # The probability of doing some work decreases exponentially from 0.5
        # until it hits a floor at min_prob (== 0.1 by default).
        prob = max(self.min_prob, 0.5 ** (1 + (count / 4000.0)))

        if random.random() < prob:
            # The gains are divided by `prob` so the modification applied per
            # active call grows as active calls become rarer.
            if self.min_positive != 0.0 or self.max_positive != 1.0:
                sign_factor = _compute_sign_factor(
                    x,
                    self.channel_dim,
                    self.min_positive,
                    self.max_positive,
                    gain_factor=self.sign_gain_factor / prob,
                    max_factor=self.max_factor,
                )
            else:
                # No sign constraint configured.
                sign_factor = None

            scale_factor = _compute_scale_factor(
                x.detach(),
                self.channel_dim,
                min_abs=self.min_abs,
                max_abs=self.max_abs,
                gain_factor=self.scale_gain_factor / prob,
                max_factor=self.max_factor,
            )
            return ActivationBalancerFunction.apply(
                x,
                scale_factor,
                sign_factor,
                self.channel_dim,
            )
        else:
            return _no_op(x)
def penalize_abs_values_gt(x: Tensor, limit: float, penalty: float) -> Tensor:
    """Return ``x`` with an auxiliary loss attached for entries beyond ``limit``.

    The auxiliary term is ``penalty * sign(x) * x`` on the elements whose
    absolute value exceeds ``limit`` and zero elsewhere; it is attached to
    ``x`` through ``with_loss``.  E.g. if limit == 10.0, any value of x over
    10 in absolute value contributes to the penalty.

    Caution: the size of this penalty is affected by the grad scaling used in
    automatic mixed precision training.  For the purpose it serves here
    (disallowing really implausible score values from reaching softmax) that
    should not matter much.

    The returned tensor must be used downstream, or the penalty is ineffective.
    """
    exceeds_limit = (x.abs() - limit) > 0
    # Memory-efficient formulation: the numerical value of aux_loss is larger
    # than penalty * (x.abs() - limit).relu() by a term that does not depend
    # on x, so both have the same derivative, while autograd has less to
    # cache for this form.
    signed_mask = (x.sign() * exceeds_limit).to(torch.int8)
    aux_loss = penalty * (signed_mask * x)
    # Note: aux_loss is not summed here; presumably with_loss() treats it as
    # if it had been summed (with_loss is defined outside this view).
    return with_loss(x, aux_loss)
+def _diag(x: Tensor):  # like .diag(), but works for tensors with 3 dims.
+    if x.ndim == 2:
+        return x.diag()
+    else:
+        (batch, dim, dim) = x.shape
+        x = x.reshape(batch, dim * dim)
+        x = x[:, :: dim + 1]
+        assert x.shape == (batch, dim)
+        return x
def _whitening_metric(x: Tensor, num_groups: int):
    """Compute the "whitening metric" of ``x``.

    The metric is 1.0 if all eigenvalues of the centered feature covariance
    are equal, both within each group's covariance matrix and between groups,
    and greater than 1.0 otherwise.

    Args:
        x: a Tensor of shape (*, num_channels); must not be float16.
        num_groups: the number of groups of channels, a number >= 1 that
            divides num_channels.

    Returns:
        A scalar Tensor that is 1.0 if the data is "perfectly white" and
        greater than 1.0 otherwise.
    """
    assert x.dtype != torch.float16
    frames = x.reshape(-1, x.shape[-1])
    num_frames, num_channels = frames.shape
    assert num_channels % num_groups == 0
    channels_per_group = num_channels // num_groups

    # grouped: (num_groups, num_frames, channels_per_group)
    grouped = frames.reshape(num_frames, num_groups, channels_per_group).transpose(
        0, 1
    )
    # Subtract the mean so the centered, not uncentered, covariance is used;
    # this keeps the resulting gradient from trying to move the mean around.
    centered = grouped - grouped.mean(dim=1, keepdim=True)

    # covar: (num_groups, channels_per_group, channels_per_group)
    covar = torch.matmul(centered.transpose(1, 2), centered)
    covar_mean_diag = _diag(covar).mean()
    # Equal to the mean diagonal of covar @ covar, i.e.
    # _diag(torch.matmul(covar, covar)).mean(), computed without the matmul.
    covarsq_mean_diag = (covar**2).sum() / (num_groups * channels_per_group)
    # >= 1.0; the larger it is, the less "white" the data was.
    return covarsq_mean_diag / (covar_mean_diag**2 + 1.0e-20)
+class WhiteningPenaltyFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        x: Tensor,
+        num_groups: int,
+        whitening_limit: float,
+        grad_scale: float,
+    ) -> Tensor:
+        ctx.save_for_backward(x)
+        ctx.num_groups = num_groups
+        ctx.whitening_limit = whitening_limit
+        ctx.grad_scale = grad_scale
+        return x
+    @staticmethod
+    def backward(ctx, x_grad: Tensor):
+        (x_orig,) = ctx.saved_tensors
+        with torch.enable_grad():
+            with torch.cuda.amp.autocast(enabled=False):
+                x_detached = x_orig.to(torch.float32).detach()
+                x_detached.requires_grad = True
+                metric = _whitening_metric(x_detached, ctx.num_groups)
+                if random.random() < 0.005 or __name__ == "__main__":
+                    logging.info(
+                        f"Whitening: num_groups={ctx.num_groups}, num_channels={x_orig.shape[-1]}, "
+                        f"metric={metric.item():.2f} vs. limit={ctx.whitening_limit}"
+                    )
+                (metric - ctx.whitening_limit).relu().backward()
+                penalty_grad = x_detached.grad
+                scale = ctx.grad_scale * (
+                    x_grad.to(torch.float32).norm() / (penalty_grad.norm() + 1.0e-20)
+                )
+                penalty_grad = penalty_grad * scale
+        return x_grad + penalty_grad.to(x_grad.dtype), None, None, None
+class Whiten(nn.Module):
+    def __init__(
+        self,
+        num_groups: int,
+        whitening_limit: float,
+        prob: Union[float, Tuple[float, float]],
+        grad_scale: float,
+    ):
+        """
+        Args:
+          num_groups: the number of groups to divide the channel dim into before
+            whitening.  We will attempt to make the feature covariance
+            within each group, after mean subtraction, as "white" as possible,
+            while having the same trace across all groups.
+         whitening_limit: a value greater than 1.0, that dictates how much
+           freedom we have to violate the constraints.  1.0 would mean perfectly
+           white, with exactly the same trace across groups; larger values
+           give more freedom.  E.g. 2.0.
+         prob: the probability with which we apply the gradient modification
+           (also affects the grad scale).  May be supplied as a float,
+           or as a pair (min_prob, max_prob)
+          grad_scale: determines the scale on the gradient term from this object,
+            relative to the rest of the gradient on the attention weights.
+            E.g. 0.02 (you may want to use smaller values than this if prob is large)
+        """
+        super(Whiten, self).__init__()
+        assert num_groups >= 1
+        assert whitening_limit >= 1
+        assert grad_scale >= 0
+        self.num_groups = num_groups
+        self.whitening_limit = whitening_limit
+        if isinstance(prob, float):
+            assert 0 < prob <= 1
+            self.prob = prob
+        else:
+            (self.min_prob, self.max_prob) = prob
+            assert 0 < self.min_prob < self.max_prob <= 1
+            self.prob = self.max_prob
+        self.grad_scale = grad_scale
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        In the forward pass, this function just returns the input unmodified.
+        In the backward pass, it will modify the gradients to ensure that the
+        distribution in each group has close to (lambda times I) as the covariance
+        after mean subtraction, with the same lambda across groups.
+        For whitening_limit > 1, there will be more freedom to violate this
+        constraint.
+        Args:
+           x: the input of shape (*, num_channels)
+        Returns:
+            x, unmodified.   You should make sure
+        you use the returned value, or the graph will be freed
+        and nothing will happen in backprop.
+        """
+        if not x.requires_grad or random.random() > self.prob or self.grad_scale == 0:
+            return _no_op(x)
+        else:
+            if hasattr(self, "min_prob") and random.random() < 0.25:
+                # occasionally switch between min_prob and max_prob, based on whether
+                # we are above or below the threshold.
+                if (
+                    _whitening_metric(x.to(torch.float32), self.num_groups)
+                    > self.whitening_limit
+                ):
+                    # there would be a change to the grad.
+                    self.prob = self.max_prob
+                else:
+                    self.prob = self.min_prob
+            return WhiteningPenaltyFunction.apply(
+                x, self.num_groups, self.whitening_limit, self.grad_scale
+            )
+class WithLoss(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x: Tensor, y: Tensor):
+        ctx.y_shape = y.shape
+        return x
+    @staticmethod
+    def backward(ctx, ans_grad: Tensor):
+        return ans_grad, torch.ones(
+            ctx.y_shape, dtype=ans_grad.dtype, device=ans_grad.device
+        )
+def with_loss(x, y):
+    """
+    Return x, routed through WithLoss so that y receives a gradient of ones in
+    backprop.  When scripting or tracing with torch.jit, x is returned
+    directly and y is not used.
+    """
+    if torch.jit.is_scripting() or torch.jit.is_tracing():
+        return x
+    # returns x but adds y.sum() to the loss function.
+    return WithLoss.apply(x, y)
+def _no_op(x: Tensor) -> Tensor:
+    """
+    Return x unchanged in value.  Outside of torch.jit scripting/tracing the
+    result is produced by a trivial chunk() call rather than returning the
+    input object itself.
+    """
+    if torch.jit.is_scripting() or torch.jit.is_tracing():
+        return x
+    else:
+        # a no-op function that will have a node in the autograd graph,
+        # to avoid certain bugs relating to backward hooks
+        # (chunk(1, ...) yields a single chunk, which is taken with [0].)
+        return x.chunk(1, dim=-1)[0]
+class Identity(torch.nn.Module):
+    def __init__(self):
+        super(Identity, self).__init__()
+    def forward(self, x):
+        return _no_op(x)
+class MaxEig(torch.nn.Module):
+    """
+    Modifies the backpropped derivatives of a function to try to discourage
+    that any given direction in activation space accounts for more than
+    a specified proportion of the covariance (e.g. 0.2).
+    Args:
+           num_channels: the number of channels
+           channel_dim: the dimension/axis corresponding to the channel, e.g.
+               -1, 0, 1, 2; will be interpreted as an offset from x.ndim if negative.
+           max_var_per_eig:  the maximum proportion of the variance of the
+               features/channels, after mean subtraction, that can come from
+               any given eigenvalue.
+           min_prob: the minimum probability with which we apply this during any invocation
+               of forward(), assuming last time we applied the constraint it was
+               not active; supplied for speed.
+           scale: determines the scale with which we modify the gradients, relative
+               to the existing / unmodified gradients
+    """
+    def __init__(
+        self,
+        num_channels: int,
+        channel_dim: int,
+        max_var_per_eig: float = 0.2,
+        min_prob: float = 0.01,
+        scale: float = 0.01,
+    ):
+        super(MaxEig, self).__init__()
+        self.num_channels = num_channels
+        self.channel_dim = channel_dim
+        self.scale = scale
+        assert max_var_per_eig == 0.0 or max_var_per_eig > 1.0 / num_channels
+        self.max_var_per_eig = max_var_per_eig
+        # we figure out the dominant direction using the power method: starting with
+        # a random vector, keep multiplying by the covariance and renormalizing.
+        with torch.no_grad():
+            # arbitrary.. would use randn() but want to leave the rest of the model's
+            # random parameters unchanged for comparison
+            direction = torch.arange(num_channels).to(torch.float)
+            direction = direction / direction.norm()
+            self.register_buffer("max_eig_direction", direction)
+        self.min_prob = min_prob
+        # cur_prob is the current probability we'll use to apply the ActivationBalancer.
+        # We'll regress this towards prob, each tiem we try to apply it and it is not
+        # active.
+        self.cur_prob = 1.0
+    def forward(self, x: Tensor) -> Tensor:
+        if (
+            torch.jit.is_scripting()
+            or self.max_var_per_eig <= 0
+            or random.random() > self.cur_prob
+            or torch.jit.is_tracing()
+        ):
+            return _no_op(x)
+        with torch.cuda.amp.autocast(enabled=False):
+            eps = 1.0e-20
+            orig_x = x
+            x = x.to(torch.float32)
+            with torch.no_grad():
+                x = x.transpose(self.channel_dim, -1).reshape(-1, self.num_channels)
+                x = x - x.mean(dim=0)
+                new_direction, coeffs = self._find_direction_coeffs(
+                    x, self.max_eig_direction
+                )
+                x_var = (x**2).mean()
+                x_residual = x - coeffs * new_direction
+                x_residual_var = (x_residual**2).mean()
+                # `variance_proportion` is the proportion of the variance accounted for
+                # by the top eigen-direction.
+                variance_proportion = (x_var - x_residual_var) / (x_var + 1.0e-20)
+                # ensure new direction is nonzero even if x == 0, by including `direction`.
+                self._set_direction(0.1 * self.max_eig_direction + new_direction)
+            if random.random() < 0.01 or __name__ == "__main__":
+                logging.info(
+                    f"variance_proportion = {variance_proportion.item()}, shape={tuple(orig_x.shape)}, cur_prob={self.cur_prob}"
+                )
+            if variance_proportion >= self.max_var_per_eig:
+                # The constraint is active.  Note, we should quite rarely
+                # reach here, only near the beginning of training if we are
+                # starting to diverge, should this constraint be active.
+                cur_prob = self.cur_prob
+                self.cur_prob = 1.0  # next time, do the update with probability 1.0.
+                return MaxEigLimiterFunction.apply(
+                    orig_x, coeffs, new_direction, self.channel_dim, self.scale
+                )
+            else:
+                # let self.cur_prob exponentially approach self.min_prob, as
+                # long as the constraint is inactive.
+                self.cur_prob = 0.75 * self.cur_prob + 0.25 * self.min_prob
+                return orig_x
+    def _set_direction(self, direction: Tensor):
+        """
+        Sets self.max_eig_direction to a normalized version of `direction`
+        """
+        direction = direction.detach()
+        direction = direction / direction.norm()
+        direction_sum = direction.sum().item()
+        if direction_sum - direction_sum == 0:  # no inf/nan
+            self.max_eig_direction[:] = direction
+        else:
+            logging.info(
+                f"Warning: sum of direction in MaxEig is {direction_sum}, "
+                "num_channels={self.num_channels}, channel_dim={self.channel_dim}"
+            )
+    def _find_direction_coeffs(
+        self, x: Tensor, prev_direction: Tensor
+    ) -> Tuple[Tensor, Tensor, Tensor]:
+        """
+            Figure out (an approximation to) the proportion of the variance of a set of
+            feature vectors that can be attributed to the top eigen-direction.
+            Args:
+             x: a Tensor of shape (num_frames, num_channels), with num_frames > 1.
+          prev_direction:  a Tensor of shape (num_channels,), that is our previous estimate
+                   of the top eigen-direction, or a random direction if this is the first
+                   iteration.  Does not have to be normalized, but should be nonzero.
+        Returns: (cur_direction, coeffs), where:
+             cur_direction: a Tensor of shape (num_channels,) that is the current
+                estimate of the top eigen-direction.
+             coeffs: a Tensor of shape (num_frames, 1) that minimizes, or
+                approximately minimizes, (x - coeffs * cur_direction).norm()
+        """
+        (num_frames, num_channels) = x.shape
+        assert num_channels > 1 and num_frames > 1
+        assert prev_direction.shape == (num_channels,)
+        # `coeffs` are the coefficients of `prev_direction` in x.
+        # actually represent the coeffs up to a constant positive factor.
+        coeffs = (x * prev_direction).sum(dim=1, keepdim=True) + 1.0e-10
+        cur_direction = (x * coeffs).sum(dim=0) / ((coeffs**2).sum() + 1.0e-20)
+        return cur_direction, coeffs
+class DoubleSwishFunction(torch.autograd.Function):
+    """
+      double_swish(x) = x * torch.sigmoid(x-1)
+    This is a definition, originally motivated by its close numerical
+    similarity to swish(swish(x)), where swish(x) =  x * sigmoid(x).
+    Memory-efficient derivative computation:
+     double_swish(x) = x * s, where s(x) = torch.sigmoid(x-1)
+     double_swish'(x) = d/dx double_swish(x) =  x * s'(x) + x' * s(x) = x * s'(x) + s(x).
+     Now, s'(x) = s(x) * (1-s(x)).
+     double_swish'(x) =  x * s'(x) + s(x).
+                      =  x * s(x) * (1-s(x)) + s(x).
+                     = double_swish(x) * (1-s(x)) + s(x)
+     ... so we just need to remember s(x) but not x itself.
+    """
+    @staticmethod
+    def forward(ctx, x: Tensor) -> Tensor:
+        requires_grad = x.requires_grad
+        x_dtype = x.dtype
+        if x.dtype == torch.float16:
+            x = x.to(torch.float32)
+        s = torch.sigmoid(x - 1.0)
+        y = x * s
+        if requires_grad:
+            deriv = y * (1 - s) + s
+            # notes on derivative of x * sigmoid(x - 1):
+            # https://www.wolframalpha.com/input?i=d%2Fdx+%28x+*+sigmoid%28x-1%29%29
+            # min \simeq -0.043638.  Take floor as -0.043637 so it's a lower bund
+            # max \simeq 1.1990.   Take ceil to be 1.2 so it's an upper bound.
+            # the combination of "+ torch.rand_like(deriv)" and casting to torch.uint8 (which
+            # floors), should be expectation-preserving.
+            floor = -0.043637
+            ceil = 1.2
+            d_scaled = (deriv - floor) * (255.0 / (ceil - floor)) + torch.rand_like(
+                deriv
+            )
+            if __name__ == "__main__":
+                # for self-testing only.
+                assert d_scaled.min() >= 0.0
+                assert d_scaled.max() < 256.0
+            d_int = d_scaled.to(torch.uint8)
+            ctx.save_for_backward(d_int)
+        if x.dtype == torch.float16 or torch.is_autocast_enabled():
+            y = y.to(torch.float16)
+        return y
+    @staticmethod
+    def backward(ctx, y_grad: Tensor) -> Tensor:
+        (d,) = ctx.saved_tensors
+        # the same constants as used in forward pass.
+        floor = -0.043637
+        ceil = 1.2
+        d = d * ((ceil - floor) / 255.0) + floor
+        return y_grad * d
+class DoubleSwish(torch.nn.Module):
+    # Module wrapper for the double-swish activation, x * sigmoid(x - 1).
+    def forward(self, x: Tensor) -> Tensor:
+        """Return double-swish activation function which is an approximation to Swish(Swish(x)),
+        that we approximate closely with x * sigmoid(x-1).
+        """
+        # When scripting or tracing, evaluate the expression directly instead
+        # of going through the custom autograd Function.
+        if torch.jit.is_scripting() or torch.jit.is_tracing():
+            return x * torch.sigmoid(x - 1.0)
+        return DoubleSwishFunction.apply(x)
+def BalancedDoubleSwish(
+    d_model, channel_dim=-1, max_abs=10.0, min_prob=0.25
+) -> nn.Sequential:
+    """
+    ActivationBalancer -> DoubleSwish
+    """
+    balancer = ActivationBalancer(
+        d_model, channel_dim=channel_dim, max_abs=max_abs, min_prob=min_prob
+    )
+    return nn.Sequential(
+        balancer,
+        DoubleSwish(),
+    )
+def _test_max_eig():
+    for proportion in [0.1, 0.5, 10.0]:
+        logging.info(f"proportion = {proportion}")
+        x = torch.randn(100, 128)
+        direction = torch.randn(128)
+        coeffs = torch.randn(100, 1)
+        x += proportion * direction * coeffs
+        x.requires_grad = True
+        num_channels = 128
+        m = MaxEig(
+            num_channels, 1, 0.5, scale=0.1  # channel_dim  # max_var_per_eig
+        )  # grad_scale
+        for _ in range(4):
+            y = m(x)
+        y_grad = torch.randn_like(x)
+        y.backward(gradient=y_grad)
+        if proportion < 0.2:
+            assert torch.allclose(x.grad, y_grad, atol=1.0e-02)
+        elif proportion > 1.0:
+            assert not torch.allclose(x.grad, y_grad)
+def _test_whiten():
+    for proportion in [0.1, 0.5, 10.0]:
+        logging.info(f"_test_whiten(): proportion = {proportion}")
+        x = torch.randn(100, 128)
+        direction = torch.randn(128)
+        coeffs = torch.randn(100, 1)
+        x += proportion * direction * coeffs
+        x.requires_grad = True
+        num_channels = 128
+        m = Whiten(
+            1, 5.0, prob=1.0, grad_scale=0.1  # num_groups  # whitening_limit,
+        )  # grad_scale
+        for _ in range(4):
+            y = m(x)
+        y_grad = torch.randn_like(x)
+        y.backward(gradient=y_grad)
+        if proportion < 0.2:
+            assert torch.allclose(x.grad, y_grad)
+        elif proportion > 1.0:
+            assert not torch.allclose(x.grad, y_grad)
+def _test_activation_balancer_sign():
+    probs = torch.arange(0, 1, 0.01)
+    N = 1000
+    x = 1.0 * ((2.0 * (torch.rand(probs.numel(), N) < probs.unsqueeze(-1))) - 1.0)
+    x = x.detach()
+    x.requires_grad = True
+    m = ActivationBalancer(
+        probs.numel(),
+        channel_dim=0,
+        min_positive=0.05,
+        max_positive=0.95,
+        max_factor=0.2,
+        min_abs=0.0,
+    )
+    y_grad = torch.sign(torch.randn(probs.numel(), N))
+    y = m(x)
+    y.backward(gradient=y_grad)
+    print("_test_activation_balancer_sign: x = ", x)
+    print("_test_activation_balancer_sign: y grad = ", y_grad)
+    print("_test_activation_balancer_sign: x grad = ", x.grad)
+def _test_activation_balancer_magnitude():
+    magnitudes = torch.arange(0, 1, 0.01)
+    N = 1000
+    x = torch.sign(torch.randn(magnitudes.numel(), N)) * magnitudes.unsqueeze(-1)
+    x = x.detach()
+    x.requires_grad = True
+    m = ActivationBalancer(
+        magnitudes.numel(),
+        channel_dim=0,
+        min_positive=0.0,
+        max_positive=1.0,
+        max_factor=0.2,
+        min_abs=0.2,
+        max_abs=0.8,
+        min_prob=1.0,
+    )
+    y_grad = torch.sign(torch.randn(magnitudes.numel(), N))
+    y = m(x)
+    y.backward(gradient=y_grad)
+    print("_test_activation_balancer_magnitude: x = ", x)
+    print("_test_activation_balancer_magnitude: y grad = ", y_grad)
+    print("_test_activation_balancer_magnitude: x grad = ", x.grad)
+def _test_basic_norm():
+    num_channels = 128
+    m = BasicNorm(num_channels=num_channels, channel_dim=1)
+    x = torch.randn(500, num_channels)
+    y = m(x)
+    assert y.shape == x.shape
+    x_rms = (x**2).mean().sqrt()
+    y_rms = (y**2).mean().sqrt()
+    print("x rms = ", x_rms)
+    print("y rms = ", y_rms)
+    assert y_rms < x_rms
+    assert y_rms > 0.5 * x_rms
+def _test_double_swish_deriv():
+    x = torch.randn(10, 12, dtype=torch.double) * 3.0
+    x.requires_grad = True
+    m = DoubleSwish()
+    tol = (1.2 - (-0.043637)) / 255.0
+    torch.autograd.gradcheck(m, x, atol=tol)
+    # for self-test.
+    x = torch.randn(1000, 1000, dtype=torch.double) * 3.0
+    x.requires_grad = True
+    y = m(x)
+def _test_softmax():
+    a = torch.randn(2, 10, dtype=torch.float64)
+    b = a.clone()
+    a.requires_grad = True
+    b.requires_grad = True
+    a.softmax(dim=1)[:, 0].sum().backward()
+    print("a grad = ", a.grad)
+    softmax(b, dim=1)[:, 0].sum().backward()
+    print("b grad = ", b.grad)
+    assert torch.allclose(a.grad, b.grad)
+if __name__ == "__main__":
+    logging.getLogger().setLevel(logging.INFO)
+    torch.set_num_threads(1)
+    torch.set_num_interop_threads(1)
+    _test_softmax()
+    _test_whiten()
+    _test_max_eig()
+    _test_activation_balancer_sign()
+    _test_activation_balancer_magnitude()
+    _test_basic_norm()
+    _test_double_swish_deriv()