xue wang committed
Commit 20231c4 · verified · 1 parent: 7138a59

Upload 4 files

Files changed (4)
  1. config.json +90 -0
  2. model.py +1713 -0
  3. model.safetensors +3 -0
  4. model_config.py +100 -0
config.json ADDED
@@ -0,0 +1,90 @@
+ {
+
+   "architectures": [
+     "YingLong"
+   ],
+
+   "auto_map": {
+     "AutoConfig": "model_config.YingLongConfig",
+     "AutoModelForCausalLM": "model.GPT"
+   },
+   "_mlp_class": "LLaMAMLP",
+   "_norm_class": "FusedRMSNorm",
+   "average_2": false,
+   "betas": [
+     0.9,
+     0.95
+   ],
+   "bias": false,
+   "block_size": 8224,
+   "block_size_val": 2080,
+   "condense_ratio": 1,
+   "decay_lr": true,
+   "discount": true,
+   "eval_iters": 100,
+   "eval_step_interval": 1000,
+   "forecasting_patch": 1,
+   "global_batch_size": 512,
+   "grad_clip": 1.0,
+   "group": "70m-test_all",
+   "haar_loss_match": false,
+   "haar_trans": true,
+   "haar_trans_inv": true,
+   "haar_trans_norm": "backward",
+   "half_diff": false,
+   "imputation": false,
+   "inner_norm": false,
+   "inter_control": false,
+   "intermediate_size": 2048,
+   "is_diff": false,
+   "is_smape": false,
+   "learning_rate": 0.0005,
+   "log_step_interval": 10,
+   "max_step": 100000,
+   "mean_replace": false,
+   "micro_batch_size": 128,
+   "micro_batch_size_val": 512,
+   "min_lr": 1e-05,
+   "mix_train": false,
+   "multi_loss": false,
+   "n_cot": 1,
+   "n_embd": 512,
+   "n_head": 16,
+   "n_layer": 8,
+   "n_query_groups": 4,
+   "name": "50m-unet",
+   "new_arch": false,
+   "new_tokenizer": false,
+   "norm_eps": 1e-05,
+   "num_of_devices": 4,
+   "num_of_nodes": 1,
+   "org": "Ali-Could",
+   "ou": false,
+   "ou_mean": false,
+   "ou_prod": false,
+   "padded_vocab_size": "None",
+   "padding_multiple": 1,
+   "parallel_residual": false,
+   "patch_size": 32,
+   "pid": false,
+   "quantitle": true,
+   "rollback_win": 256,
+   "rolling_patch": 6,
+   "rope_base": 10000,
+   "rotary_percentage": 1.0,
+   "save_step_interval": 10000,
+   "scaling": true,
+   "seed0": 3407,
+   "shared_attention_norm": false,
+   "stats_encoding": false,
+   "stats_encoding_new": false,
+   "sum_divided": false,
+   "triple_diff": false,
+   "triple_diff_new": false,
+   "unet": true,
+   "vocab_size": 1,
+   "vq": false,
+   "warmup_steps": 2000,
+   "weight_decay": 0.1,
+   "yj_trans": false
+ }
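
Note: "auto_map" above routes AutoConfig to model_config.YingLongConfig and AutoModelForCausalLM to model.GPT, so the checkpoint has to be loaded with trust_remote_code=True, and the custom code additionally imports flash-attn, xformers and the flash-attn fused kernels. A minimal loading sketch (the repository id is a placeholder, not stated anywhere in this commit):

    from transformers import AutoConfig, AutoModelForCausalLM

    repo_id = "<namespace>/<this-repo>"  # placeholder; substitute the actual Hub repo id
    config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)           # -> model_config.YingLongConfig
    model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)  # -> model.GPT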
model.py ADDED
@@ -0,0 +1,1713 @@
1
+ """Full definition of a GPT NeoX Language Model, all of it in this single file.
2
+
3
+ Based on the nanoGPT implementation: https://github.com/karpathy/nanoGPT and
4
+ https://github.com/EleutherAI/gpt-neox/tree/main/megatron/model.
5
+ """
6
+ import math, random
7
+ import numpy as np
8
+ from typing import Any, List, Optional, Tuple
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ from lightning_utilities.core.imports import RequirementCache
13
+ from typing_extensions import Self
14
+ from flash_attn import flash_attn_func
15
+ # from lit_gpt.config import Config
16
+ from xformers.ops import SwiGLU
17
+
18
+ import torch.nn.functional as F
19
+ # from .fused_rotary_embedding import apply_rotary_emb_func
20
+ RoPECache = Tuple[torch.Tensor, torch.Tensor]
21
+ KVCache = Tuple[torch.Tensor, torch.Tensor]
22
+ PretokenCache = torch.Tensor
23
+ # Tuple[torch.Tensor, torch.Tensor]
24
+ FlashAttention2Available = RequirementCache("flash-attn>=2.0.0.post1")
25
+ from einops import rearrange
26
+ from transformers import PreTrainedModel, Cache, DynamicCache
27
+
28
+ from huggingface_hub import PyTorchModelHubMixin
29
+ from .model_config import YingLongConfig
30
+
31
+ # from torch.distributions import Normal, LowRankMultivariateNormal, kl_divergence,MultivariateNormal
32
+
33
+ class quantitleLoss(torch.nn.Module):
34
+ def __init__(self,
35
+ qSize = 99,
36
+ patch_size = 16,
37
+ *args,**kwargs) -> None:
38
+
39
+ super().__init__()
40
+ self.qSize = qSize
41
+ self.patch_size = patch_size
42
+
43
+
44
+ q = np.array([i+1 for i in range(self.qSize)])
45
+ q = q / (self.qSize + 1)
46
+ q = q.reshape((1,1,-1))
47
+
48
+ q_variance = q*(1-q)
49
+
50
+ self.register_buffer('q', torch.tensor(q))
51
+ self.register_buffer('q_variance', torch.tensor(q_variance))
52
+
53
+
54
+ def forward(self, input: torch.Tensor, target: torch.Tensor,rel_loss = False) -> torch.Tensor:
55
+
56
+
57
+
58
+ target = target.unsqueeze(-1)
59
+ input = input[:,:target.shape[1],:,:]
60
+
61
+
62
+ posPart = input - target
63
+ negPart = -posPart
64
+
65
+ raw_loss = torch.maximum(self.q * negPart, (1-self.q) * posPart)
66
+
67
+ target_absmean = torch.mean(target.abs(),dim = (1,2),keepdims = True)
68
+ raw_loss = raw_loss / torch.sqrt(self.q_variance) / (target_absmean + 1e-4)
69
+
70
+ return torch.mean(raw_loss)
71
+
72
+
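
The quantitleLoss class above is a pinball (quantile) loss evaluated on a grid of qSize levels q = 1/(qSize+1), ..., qSize/(qSize+1), rescaled by sqrt(q(1-q)) and by the mean absolute target. A minimal usage sketch with hypothetical shapes, assuming predictions carry a trailing quantile axis:

    loss_fn = quantitleLoss(qSize=99, patch_size=32)
    pred = torch.randn(2, 8, 32, 99)   # (batch, patches, patch_size, quantile levels)
    target = torch.randn(2, 8, 32)     # (batch, patches, patch_size)
    loss = loss_fn(pred, target)       # scalar tensor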
73
+ def haarMatrix_unnormalized(n):
74
+ # Allow only size n of power 2
75
+ n = 2**np.ceil(np.log2(n))
76
+ if n > 2:
77
+ h = haarMatrix(n / 2)
78
+ else:
79
+ return np.array([[1, 1], [1, -1]])
80
+
81
+ # calculate upper haar part
82
+ h_n = np.kron(h, [1, 1])
83
+ # calculate lower haar part
84
+ # if normalized:
85
+ # h_i = np.sqrt(n/2)*np.kron(np.eye(len(h)), [1, -1])
86
+ # else:
87
+ h_i = np.kron(np.eye(len(h)), [1, -1])
88
+ # combine parts
89
+ h = np.vstack((h_n, h_i))
90
+ return h
91
+
92
+
93
+ def haarMatrix(n,normalized = 'ortho'):
94
+ h = haarMatrix_unnormalized(n)
95
+ scaler = np.diag(1/np.sqrt(np.diag(h @ h.transpose())))
96
+ if normalized == 'ortho':
97
+ return scaler @ h
98
+ elif normalized == 'forward':
99
+ return scaler @ h/ np.sqrt(n)
100
+
101
+ else:
102
+ return scaler @ h * np.sqrt(n)
103
+ # else:
104
+ # scaler = 1
105
+
106
+
107
+
108
+
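
For n a power of two, haarMatrix(n) with the default normalized='ortho' has orthonormal rows, so the transform is inverted by its transpose; the 'forward'/'backward' options add an extra 1/sqrt(n) or sqrt(n) scale that GPT.forward compensates for after the inverse transform. A quick sanity-check sketch (np is numpy, imported at the top of this file):

    H = haarMatrix(8)
    assert np.allclose(H @ H.T, np.eye(8))   # orthonormal rows
    x = np.random.randn(8)
    assert np.allclose(H.T @ (H @ x), x)     # transpose inverts the transform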
109
+
110
+ class Tokenizer(torch.nn.Module):
111
+ def __init__(self, config: YingLongConfig, *args,**kwargs) -> None:
112
+ super().__init__()
113
+
114
+ self.config = config
115
+ self.tokenizer = nn.Linear(config.patch_size,self.config.n_embd)
116
+
117
+ self.patch_size = config.patch_size
118
+ self.mask0 = nn.Linear(1,config.n_embd)
119
+
120
+ self.register_buffer('mask_token', torch.zeros(1000))
121
+ if self.config.haar_trans:
122
+ self.register_buffer('haar_transform',torch.Tensor(haarMatrix(self.config.patch_size,normalized = self.config.haar_trans_norm)))
123
+
124
+
125
+
126
+ def forward(self,x,
127
+ future_token = 0,
128
+ prev_token = 0,
129
+ factor = 0.2,
130
+ sequential = False,
131
+ *args, **kwargs):
132
+
133
+
134
+ b = x.shape[0]
135
+
136
+ x_raw = rearrange(x, "b (l c) -> b l c", c = self.patch_size)
137
+ x_raw_0 = rearrange(x, "b (l c) -> b l c", c = self.patch_size).detach().clone()
138
+
139
+ if future_token == 0:
140
+ if not sequential:
141
+ masks = torch.randperm(x_raw.shape[1])
142
+ unmasks,masks = masks[:int(x_raw.shape[1]*factor)],masks[int(x_raw.shape[1]*factor):]
143
+ else:
144
+ masks = [_ for _ in range(x_raw.shape[1])]
145
+ factor = np.random.rand()*0.6 + 0.2
146
+ unmasks,masks = masks[:int(x_raw.shape[1]*factor)],masks[int(x_raw.shape[1]*factor):]
147
+
148
+
149
+
150
+ x_raw_remains = x_raw[:,unmasks,:]
151
+
152
+ mean = x_raw_remains.mean(dim = (-2,-1),keepdims = True)
153
+ std = x_raw_remains.std(dim = (-2,-1),keepdims = True)
154
+ x_raw = (x_raw - mean)/ (std + 1e-4)
155
+
156
+
157
+ if self.config.haar_trans:
158
+ x_featured = torch.einsum('blc,ac->bla',x_raw,self.haar_transform)
159
+ x_featured = self.tokenizer(x_featured)
160
+ else:
161
+ x_featured = self.tokenizer(x_raw)
162
+
163
+
164
+ x_featured[:,masks,:] = self.mask0(self.mask_token[0].unsqueeze(0))
165
+
166
+
167
+
168
+ else:
169
+
170
+
171
+ factor = 1
172
+ more_rows = future_token // self.patch_size + 1
173
+ prev_more_rows = prev_token // self.patch_size + 1
174
+
175
+ mean = x_raw[:,prev_more_rows:-more_rows,:].mean(dim = (-2,-1),keepdims = True)
176
+ std = x_raw[:,prev_more_rows:-more_rows,:].std(dim = (-2,-1),keepdims = True)
177
+ x_raw = (x_raw - mean)/ (std + 1e-4)
178
+
179
+
180
+ if self.config.haar_trans:
181
+ x_featured = torch.einsum('blc,ac->bla',x_raw,self.haar_transform)
182
+ x_featured = self.tokenizer(x_featured)
183
+ else:
184
+ x_featured = self.tokenizer(x_raw)
185
+
186
+
187
+ masks = [jj for jj in range(x_featured.shape[1])]
188
+ masks = masks[-more_rows:]
189
+
190
+ # if not mean_replace:
191
+ x_featured[:,-more_rows:] = self.mask0(self.mask_token[:len(masks)].unsqueeze(-1)).repeat(x_featured.shape[0],1,1)
192
+ x_featured[:,:prev_more_rows] = self.mask0(self.mask_token[:prev_more_rows].unsqueeze(-1)).repeat(x_featured.shape[0],1,1)
193
+
194
+
195
+ return x_featured, x_raw_0, masks, mean, std, x_raw
196
+
197
+
198
+
199
+ class model_tmp(PreTrainedModel):
200
+ config_class = YingLongConfig
201
+ base_model_prefix = "model"
202
+ # supports_gradient_checkpointing = True
203
+ # _no_split_modules = ["TimeMoeDecoderLayer"]
204
+ # _skip_keys_device_placement = "past_key_values"
205
+ _supports_flash_attn_2 = True
206
+ _supports_sdpa = False
207
+ _supports_cache_class = True
208
+
209
+ # class GPT(nn.Module,PreTrainedModel,PyTorchModelHubMixin):
210
+ class GPT(model_tmp):
211
+ def __init__(self, config: YingLongConfig, *args,**kwargs) -> None:
212
+
213
+
214
+ # config_class = YingLongConfig
215
+ # base_model_prefix = "model"
216
+ # # supports_gradient_checkpointing = True
217
+ # # _no_split_modules = ["TimeMoeDecoderLayer"]
218
+ # # _skip_keys_device_placement = "past_key_values"
219
+ # _supports_flash_attn_2 = True
220
+ # _supports_sdpa = False
221
+ # _supports_cache_class = True
222
+ super().__init__(config)
223
+
224
+ self.config = config
225
+ self.patch_size = config.patch_size
226
+ self.unet = config.unet
227
+
228
+
229
+ if self.config._norm_class == "RMSNorm":
230
+ # from .model import RMSNorm
231
+ self.config.norm_class = RMSNorm
232
+ elif self.config._norm_class == "FusedRMSNorm":
233
+ # from .model import FusedRMSNorm
234
+ self.config.norm_class = FusedRMSNorm
235
+ elif self.config._norm_class == 'BatchNorm':
236
+ # from .model import iBatchNorm
237
+ self.config.norm_class = iBatchNorm
238
+
239
+
240
+
241
+ if self.config._mlp_class == "GptNeoxMLP":
242
+ # from .model import GptNeoxMLP
243
+ self.config.mlp_class = GptNeoxMLP
244
+ elif self.config._mlp_class == "LLaMAMLP":
245
+ # from .model import LLaMAMLP
246
+ self.config.mlp_class = LLaMAMLP
247
+
248
+
249
+ if config.stats_encoding:
250
+ self.stat_tokens = 1
251
+ else:
252
+ self.stat_tokens = 0
253
+
254
+
255
+
256
+
257
+
258
+ self.tokenizer = Tokenizer(config)
259
+
260
+ # self.lm_head = nn.Sequential(config.norm_class(config.n_embd, eps=config.norm_eps),
261
+ # nn.Linear(config.n_embd, config.n_embd*4),
262
+ # nn.ReLU(),
263
+ # nn.Linear(config.n_embd*4, 99*self.patch_size),
264
+ # )
265
+
266
+
267
+ self.lm_head = nn.Linear(config.n_embd, 99*self.patch_size)
268
+
269
+
270
+ # self.gate = nn.Linear(config.n_embd, 1)
271
+
272
+
273
+ self.quantitleLoss = quantitleLoss(99,patch_size = self.patch_size)
274
+
275
+
276
+
277
+ if self.unet:
278
+ assert config.n_layer%2 == 0
279
+ self.unet_projection = nn.ModuleList(nn.Sequential(nn.Linear(config.n_embd*2,config.n_embd),
280
+ config.norm_class(config.n_embd, eps=config.norm_eps),
281
+ )
282
+ for _ in range(config.n_layer//2)
283
+ )
284
+ self.unet_merge = nn.ModuleList(nn.Sequential(nn.Linear(config.n_embd*2,config.n_embd),
285
+ config.norm_class(config.n_embd, eps=config.norm_eps),
286
+ )
287
+ for _ in range(config.n_layer//2)
288
+ )
289
+
290
+
291
+
292
+ self.transformer = nn.ModuleDict(dict(h = nn.ModuleList(Block(config)
293
+ for _ in range(config.n_layer))
294
+ )
295
+ )
296
+
297
+
298
+
299
+ self.rope_cache: Optional[RoPECache] = None
300
+ self.mask_cache: Optional[torch.Tensor] = None
301
+ self.kv_caches: List[KVCache] = []
302
+
303
+
304
+ def _init_weights(self, module: nn.Module) -> None:
305
+ """Meant to be used with `gpt.apply(gpt._init_weights)`."""
306
+ # GPT-NeoX https://arxiv.org/pdf/2204.06745.pdf
307
+ if isinstance(module, nn.Embedding):
308
+ torch.nn.init.normal_(module.weight, mean=0.0, std=math.sqrt(2.0 / 5 / self.config.n_embd))
309
+ # RWKV: set it to 1e-4
310
+ # torch.nn.init.uniform_(module.weight, -1e-4, 1e-4)
311
+ elif isinstance(module, nn.Linear):
312
+ torch.nn.init.normal_(module.weight, mean=0.0, std=math.sqrt(2.0 / 5 / self.config.n_embd))
313
+ if module.bias is not None:
314
+ torch.nn.init.zeros_(module.bias)
315
+ # GPT-NeoX
316
+ for name, p in module.named_parameters():
317
+ if (name == "proj.weight" and isinstance(module, LLaMAMLP)) or (name == "w3.weight" and isinstance(module, SwiGLU) or (name=="proj.weight" and isinstance(module, BidirectedlSelfAttention))): #if use xformer swiglu, fc2 layer will be renamed to w3
318
+ nn.init.normal_(p, mean=0.0, std=1 / math.sqrt(self.config.n_embd) / self.config.n_layer)
319
+
320
+
321
+ def reset_cache(self) -> None:
322
+ self.kv_caches.clear()
323
+ if self.mask_cache is not None and self.mask_cache.device.type == "xla":
324
+ # https://github.com/Lightning-AI/lit-gpt/pull/83#issuecomment-1558150179
325
+ self.rope_cache = None
326
+ self.mask_cache = None
327
+
328
+ def forward(
329
+ self, idx: torch.Tensor,
330
+ max_seq_length: Optional[int] = None,
331
+ input_pos: Optional[torch.Tensor] = None,
332
+ next_token: torch.Tensor = None,
333
+ future_token: int = 0,
334
+ prev_token: int = 0,
335
+ val: bool = False,
336
+ print_intermediate: bool = False,
337
+ cot_rounds: int = -1,
338
+ sequential: bool = False,
339
+ *args,**kwargs,
340
+ ) -> torch.Tensor:
341
+
342
+ if future_token > 0:
343
+ more_rows = future_token // self.patch_size + 1
344
+ idx = torch.cat((idx,torch.zeros(idx.shape[0],more_rows*self.patch_size).to(idx.device)),dim = -1).bfloat16()
345
+ if prev_token > 0:
346
+ more_rows = prev_token // self.patch_size + 1
347
+ idx = torch.cat((torch.zeros(idx.shape[0],more_rows*self.patch_size).to(idx.device),idx),dim = -1).bfloat16()
348
+
349
+ B, T = idx.size()
350
+
351
+ use_kv_cache = input_pos is not None
352
+
353
+ block_size = self.config.block_size
354
+ if max_seq_length is None:
355
+ max_seq_length = block_size
356
+
357
+
358
+ if use_kv_cache: # not relevant otherwise
359
+ assert (
360
+ max_seq_length >= T
361
+ ), f"Cannot forward sequence of length {T}, max seq length is only {max_seq_length}"
362
+ assert max_seq_length <= block_size, f"Cannot attend to {max_seq_length}, block size is only {block_size}"
363
+ if self.rope_cache is None:
364
+ self.rope_cache = self.build_rope_cache(idx)
365
+ if use_kv_cache and self.mask_cache is None:
366
+ self.mask_cache = self.build_mask_cache(idx)
367
+ cos, sin = self.rope_cache
368
+ if use_kv_cache:
369
+ if self.stat_tokens:
370
+ if len(input_pos) == 1:
371
+ idx = idx[:,input_pos]
372
+ input_pos = input_pos.add_(1)
373
+ else:
374
+ input_pos = torch.arange(0, input_pos[-1]+2, device=idx.device)
375
+
376
+ cos = cos.index_select(0, input_pos)
377
+ sin = sin.index_select(0, input_pos)
378
+ mask = self.mask_cache.index_select(2, input_pos)
379
+ mask = mask[:, :, :, :max_seq_length]
380
+
381
+ else:
382
+ cos = cos.index_select(0, input_pos)
383
+ sin = sin.index_select(0, input_pos)
384
+ idx = idx[:,input_pos]
385
+ else:
386
+ cos = cos[:max(T,1024) + self.stat_tokens]
387
+ sin = sin[:max(T,1024) + self.stat_tokens]
388
+ mask = None
389
+
390
+ idx_ori = idx
391
+
392
+
393
+
394
+ if use_kv_cache:
395
+ pass
396
+ else:
397
+ x,x_raw,masks,mean,std,x_0 = self.tokenizer(idx,
398
+ future_token =future_token,
399
+ prev_token = prev_token,
400
+ sequential = sequential,
401
+ )
402
+
403
+
404
+
405
+
406
+ if self.unet:
407
+ skips = []
408
+
409
+
410
+
411
+ res_intermediate = []
412
+ target_intermediate = []
413
+ if not use_kv_cache:
414
+
415
+ if cot_rounds <0:
416
+ cot_rounds = self.config.n_cot
417
+
418
+ res_list = []
419
+ gate_list = []
420
+ for rep in range(cot_rounds):
421
+ for block_idx in range(len( self.transformer.h)):
422
+
423
+
424
+
425
+ block = self.transformer.h[block_idx]
426
+
427
+ if self.unet and block_idx >=len(self.transformer.h) //2:
428
+ x = self.unet_projection[block_idx - len(self.transformer.h) //2](torch.cat((skips.pop(),x),dim = -1))
429
+
430
+ x, *_ = block(x, (cos, sin), max_seq_length)
431
+
432
+ if self.unet and block_idx <len(self.transformer.h) //2:
433
+ skips.append(x)
434
+ x_delay = torch.cat((x[:,0,:].unsqueeze(1),x[:,:-1,:]),dim = 1)
435
+ x = self.unet_merge[block_idx](torch.cat((x_delay,x),dim = -1))
436
+ # if block_idx <len(self.transformer.h) //2:
437
+ # x_delay = torch.cat((x[:,0,:].unsqueeze(1),x[:,:-1,:]),dim = 1)
438
+ # x = self.unet_merge[block_idx](torch.cat((x_delay,x),dim = -1))
439
+
440
+
441
+
442
+
443
+ # res_list.append(self.lm_head(x).unsqueeze(-1))
444
+ # gate_list.append(self.gate(x).unsqueeze(-1))
445
+ # gate_list.append(self.gate(x))
446
+ # if print_intermediate:
447
+ # res_intermediate.append(res_list[-1])
448
+ # if print_intermediate:
449
+ # res_tmp = self.lm_head(x[:,self.stat_tokens:])
450
+ # res_tmp = rearrange(res_tmp,'b c (l1 l2) -> b c l1 l2', l2 = 99)
451
+ # if self.config.haar_trans_inv:
452
+
453
+ # res_tmp = torch.einsum('bcal,ad->bcdl',res_tmp,self.tokenizer.haar_transform)
454
+ # if self.config.haar_trans_norm == "backward":
455
+ # res_tmp = res_tmp / np.sqrt(res_tmp.shape[-2])
456
+ # elif self.config.haar_trans_norm == "forward":
457
+ # res_tmp = res_tmp * np.sqrt(res_tmp.shape[-2])
458
+ # res_tmp = res_tmp * (std.unsqueeze(-1) + 1e-4) + mean.unsqueeze(-1)
459
+ # res_intermediate.append(res_tmp[:,masks,:,:])
460
+
461
+
462
+
463
+
464
+
465
+ else:
466
+ self.kv_caches = self.kv_caches or self.build_kv_caches(x, max_seq_length, cos.size(-1) * 2)
467
+ for block_idx in range(len( self.transformer.h)):
468
+ block = self.transformer.h[block_idx]
469
+ if self.unet and block_idx >=len(self.transformer.h) //2:
470
+ x = F.silu(skips.pop()) * x
471
+ x, self.kv_caches[block_idx] = block(x, (cos, sin), max_seq_length, mask, input_pos, self.kv_caches[block_idx])
472
+ if self.unet and block_idx <len(self.transformer.h) //2:
473
+ skips.append(x)
474
+
475
+
476
+
477
+
478
+ res = self.lm_head(x)
479
+ # gate = torch.cat(gate_list,dim = -1)
480
+ # gate = F.softmax(gate,dim = -1)
481
+ # res = torch.cat(res_list,dim = -1) * gate
482
+ # res = res.sum(dim = -1)
483
+
484
+
485
+ res = rearrange(res,'b c (l1 l2) -> b c l1 l2', l2 = 99)
486
+
487
+
488
+
489
+ if self.config.haar_trans_inv:
490
+ # print('res',res.shape,self.tokenizer.haar_transform.shape)
491
+ res = torch.einsum('bcal,ad->bcdl',res,self.tokenizer.haar_transform)
492
+ if self.config.haar_trans_norm == "backward":
493
+ res = res / np.sqrt(res.shape[-2])
494
+ elif self.config.haar_trans_norm == "forward":
495
+ res = res * np.sqrt(res.shape[-2])
496
+
497
+
498
+ res = res * (std.unsqueeze(-1) + 1e-4) + mean.unsqueeze(-1)
499
+
500
+
501
+
502
+
503
+ if future_token == 0:
504
+ return res[:,masks,:,:], x_raw[:,masks,:],res_intermediate,target_intermediate
505
+ else:
506
+ return res[:,masks,:,:],res_intermediate
507
+
508
+ def generate(self,*args,**kwargs):
509
+
510
+ res, _ = self.forward(*args,**kwargs)
511
+ # logits_all,res_intermediate = model(idx = x_train, future_token = (pred_len//32 + 1)* 32, prev_token = 0,print_intermediate = False,cot_rounds = 1)
512
+
513
+ res = rearrange(res, 'b l c d -> b (l c) d')
514
+ return res[:,:kwargs['future_token'],:]
515
+
516
+
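
generate() above is a thin wrapper over forward(): it zero-pads the input by enough whole patches to cover future_token steps, runs the masked-reconstruction pass, and returns the first future_token forecast steps over all 99 quantile levels. A hedged usage sketch (device, dtype, checkpoint loading and input length are assumptions; the flash-attn path needs a CUDA device and bf16/fp16 tensors):

    # model: a GPT instance, e.g. obtained via AutoModelForCausalLM with trust_remote_code=True
    model = model.cuda().bfloat16().eval()
    history = torch.randn(4, 2048).bfloat16().cuda()             # (batch, context length)
    with torch.no_grad():
        forecast = model.generate(idx=history, future_token=96)  # (batch, 96, 99)
    median = forecast[..., 49]                                   # q = 0.5 on the 99-level quantile grid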
517
+ @classmethod
518
+ def from_name(cls, name: str, **kwargs: Any) -> Self:
519
+ return cls(Config.from_name(name, **kwargs))
520
+
521
+ def build_rope_cache(self, idx: torch.Tensor) -> RoPECache:
522
+ return build_rope_cache(
523
+ seq_len=self.config.block_size + self.stat_tokens,
524
+ n_elem=int(self.config.rotary_percentage * self.config.head_size),
525
+ dtype=torch.bfloat16,
526
+ device=idx.device,
527
+ base = self.config.rope_base,
528
+ condense_ratio=self.config.condense_ratio,
529
+ )
530
+
531
+ def build_mask_cache(self, idx: torch.Tensor) -> torch.Tensor:
532
+ ones = torch.ones((self.config.block_size+self.stat_tokens, self.config.block_size+self.stat_tokens), device=idx.device, dtype=torch.bool)
533
+ return torch.tril(ones).unsqueeze(0).unsqueeze(0)
534
+
535
+ def build_kv_caches(self, idx: torch.Tensor, max_seq_length: int, rope_cache_length: int) -> List[KVCache]:
536
+ B = idx.size(0)
537
+ heads = 1 if self.config.n_query_groups == 1 else self.config.n_query_groups
538
+
539
+ k_cache_shape = (
540
+ B,
541
+ max_seq_length,
542
+ heads,
543
+ rope_cache_length + self.config.head_size - int(self.config.rotary_percentage * self.config.head_size),
544
+ )
545
+ v_cache_shape = (B, max_seq_length, heads, self.config.head_size)
546
+ device = idx.device
547
+ return [
548
+ (torch.zeros(k_cache_shape, device=device), torch.zeros(v_cache_shape, device=device))
549
+ for _ in range(self.config.n_layer)
550
+ ]
551
+
552
+
553
+ class Block(nn.Module):
554
+ def __init__(self, config:YingLongConfig) -> None:
555
+ super().__init__()
556
+ self.norm_1 = config.norm_class(config.n_embd, eps=config.norm_eps)
557
+ self.attn = BidirectedlSelfAttention(config)
558
+ if not config.shared_attention_norm:
559
+ self.norm_2 = config.norm_class(config.n_embd, eps=config.norm_eps)
560
+ self.mlp = config.mlp_class(config)
561
+ self.config = config
562
+ def forward(
563
+ self,
564
+ x: torch.Tensor,
565
+ rope: RoPECache,
566
+ max_seq_length: int,
567
+ mask: Optional[torch.Tensor] = None,
568
+ input_pos: Optional[torch.Tensor] = None,
569
+ kv_cache: Optional[KVCache] = None,
570
+ ) -> Tuple[torch.Tensor, Optional[KVCache]]:
571
+
572
+ n_1 = self.norm_1(x)
573
+ h, new_kv_cache = self.attn(n_1, rope, max_seq_length, mask, input_pos, kv_cache)
574
+ if self.config.parallel_residual:
575
+ n_2 = n_1 if self.config.shared_attention_norm else self.norm_2(x)
576
+ x = x + h + self.mlp(n_2)
577
+ else:
578
+ if self.config.shared_attention_norm:
579
+ raise NotImplementedError(
580
+ "No checkpoint amongst the ones we support uses this configuration"
581
+ " (non-parallel residual and shared attention norm)."
582
+ )
583
+
584
+ x = x + h
585
+ x = x + self.mlp(self.norm_2(x))
586
+ return x, new_kv_cache
587
+
588
+
589
+ class BidirectedlSelfAttention(nn.Module):
590
+ def __init__(self, config:YingLongConfig) -> None:
591
+ super().__init__()
592
+ shape = (config.n_head + 2 * config.n_query_groups) * config.head_size
593
+ # key, query, value projections for all heads, but in a batch
594
+ self.attn = nn.Linear(config.n_embd, shape, bias=config.bias)
595
+ # output projection
596
+ self.proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
597
+
598
+ self.config = config
599
+
600
+ def forward(
601
+ self,
602
+ x: torch.Tensor,
603
+ rope: RoPECache,
604
+ max_seq_length: int,
605
+ mask: Optional[torch.Tensor] = None,
606
+ input_pos: Optional[torch.Tensor] = None,
607
+ kv_cache: Optional[KVCache] = None,
608
+ ) -> Tuple[torch.Tensor, Optional[KVCache]]:
609
+
610
+
611
+ B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
612
+
613
+ qkv = self.attn(x)
614
+
615
+ # assemble into a number of query groups to support MHA, MQA and GQA together (see `config.n_query_groups`)
616
+ q_per_kv = self.config.n_head // self.config.n_query_groups
617
+ total_qkv = q_per_kv + 2 # each group has 1+ queries, 1 key, and 1 value
618
+ qkv = qkv.view(B, T, self.config.n_query_groups, total_qkv, self.config.head_size) # (B, T, n_query_groups, total_qkv, hs)
619
+ # qkv = qkv.permute(0, 2, 3, 1, 4) # (B, n_query_groups, total_qkv, T, hs)
620
+
621
+ # split batched computation into three
622
+ q, k, v = qkv.split((q_per_kv, 1, 1), dim=-2)
623
+
624
+ # repeat k and v if necessary
625
+ # Peiyuan: we do not need to do this, as flash attention 2 already supports GQA
626
+ # if self.config.n_query_groups != 1: # doing this would require a full kv cache with MQA (inefficient!)
627
+ # # for MHA this is a no-op
628
+ # k = k.expand(B, self.config.n_query_groups, q_per_kv, T, self.config.head_size)
629
+ # v = v.expand(B, self.config.n_query_groups, q_per_kv, T, self.config.head_size)
630
+
631
+ q = q.reshape(B, T, -1, self.config.head_size) # (B, T, nh_q, hs)
632
+ k = k.reshape(B, T, -1, self.config.head_size)
633
+ v = v.reshape(B, T, -1, self.config.head_size)
634
+
635
+ cos, sin = rope
636
+
637
+ # applying RoPE in fp32 significantly stabilizes training
638
+ # fused rope expect (batch_size, seqlen, nheads, headdim)
639
+ q = apply_rotary_emb_func(q, cos, sin, False, True)
640
+ k = apply_rotary_emb_func(k, cos, sin, False, True)
641
+
642
+ # n_elem = int(self.config.rotary_percentage * self.config.head_size)
643
+
644
+ # q_roped = apply_rope(q[..., :n_elem], cos.repeat(1,2), sin.repeat(1,2))
645
+ # k_roped = apply_rope(k[..., :n_elem], cos.repeat(1,2), sin.repeat(1,2))
646
+ # print( (q_roped - q).sum())
647
+ # q = torch.cat((q_roped, q[..., n_elem:]), dim=-1)
648
+ # k = torch.cat((k_roped, k[..., n_elem:]), dim=-1)
649
+
650
+ if kv_cache is not None:
651
+ cache_k, cache_v = kv_cache
652
+ cache_k, cache_v = cache_k.to(dtype=k.dtype), cache_v.to(dtype=v.dtype)
653
+ # check if reached token limit
654
+ if input_pos[-1] >= max_seq_length:
655
+ input_pos = torch.tensor(max_seq_length - 1, device=input_pos.device)
656
+ # shift 1 position to the left
657
+ cache_k = torch.roll(cache_k, -1, dims=1)
658
+ cache_v = torch.roll(cache_v, -1, dims=1)
659
+
660
+ k = cache_k.index_copy_(1, input_pos, k)
661
+ v = cache_v.index_copy_(1, input_pos, v)
662
+ kv_cache = k, v
663
+
664
+
665
+
666
+ y = self.scaled_dot_product_attention(q, k, v, mask=mask)
667
+
668
+ y = y.reshape(B, T, C) # re-assemble all head outputs side by side
669
+
670
+ # output projection
671
+ y = self.proj(y)
672
+
673
+ return y, kv_cache
674
+
675
+ def scaled_dot_product_attention(
676
+ self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, mask: Optional[torch.Tensor] = None
677
+ ):
678
+ scale = 1.0 / math.sqrt(self.config.head_size)
679
+
680
+ if (
681
+ FlashAttention2Available
682
+ and mask is None
683
+ and q.device.type == "cuda"
684
+ and q.dtype in (torch.float16, torch.bfloat16)
685
+ ):
686
+ from flash_attn import flash_attn_func
687
+
688
+ return flash_attn_func(q, k, v, dropout_p=0.0, softmax_scale=scale, causal=False)
689
+ q = q.transpose(1, 2)
690
+ k = k.transpose(1, 2)
691
+ v = v.transpose(1, 2)
692
+ if q.size() != k.size():
693
+ k = k.repeat_interleave(q.shape[1]//k.shape[1], dim=1)
694
+ v = v.repeat_interleave(q.shape[1]//v.shape[1], dim=1)
695
+ y = torch.nn.functional.scaled_dot_product_attention(
696
+ q, k, v, attn_mask=mask, dropout_p=0.0, scale=scale, is_causal=False
697
+ )
698
+ return y.transpose(1, 2)
699
+
700
+
701
+ class GptNeoxMLP(nn.Module):
702
+ def __init__(self, config:YingLongConfig) -> None:
703
+ super().__init__()
704
+ self.fc = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias)
705
+ self.proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.bias)
706
+
707
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
708
+ x = self.fc(x)
709
+ x = torch.nn.functional.gelu(x)
710
+ return self.proj(x)
711
+
712
+
713
+ class LLaMAMLP(nn.Module):
714
+ def __init__(self, config:YingLongConfig) -> None:
715
+ super().__init__()
716
+ # self.fc_1 = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias)
717
+ # self.fc_2 = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias)
718
+ # self.proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.bias)
719
+ self.swiglu = SwiGLU(config.n_embd,config.intermediate_size, bias=False, _pack_weights=False)
720
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
721
+ # x_fc_1 = self.fc_1(x)
722
+ # x_fc_2 = self.fc_2(x)
723
+ # x = torch.nn.functional.silu(x_fc_1) * x_fc_2
724
+ # return self.proj(x)
725
+ return self.swiglu(x)
726
+
727
+
728
+ def build_rope_cache(
729
+ seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000, condense_ratio: int = 1
730
+ ) -> RoPECache:
731
+ """Enhanced Transformer with Rotary Position Embedding.
732
+
733
+ Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
734
+ transformers/rope/__init__.py. MIT License:
735
+ https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
736
+ """
737
+ # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
738
+ theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, device=device) / n_elem))
739
+
740
+ # Create position indexes `[0, 1, ..., seq_len - 1]`
741
+ seq_idx = torch.arange(seq_len, device=device) / condense_ratio
742
+
743
+ # Calculate the product of position index and $\theta_i$
744
+ idx_theta = torch.outer(seq_idx, theta)
745
+
746
+ cos, sin = torch.cos(idx_theta), torch.sin(idx_theta)
747
+
748
+ # print(' print(seq_idx.shape,theta.shape,sin.shape,cos.shape,idx_theta.shape)',seq_idx.shape,theta.shape,sin.shape,cos.shape,idx_theta.shape)
749
+ # added by peiyuan to ensure same data type with q, k, to use fused rotary embedding
750
+ if dtype == torch.bfloat16:
751
+ return cos.bfloat16(), sin.bfloat16()
752
+ # this is to mimic the behaviour of complex32, else we will get different results
753
+ if dtype in (torch.float16, torch.bfloat16, torch.int8):
754
+ return cos.half(), sin.half()
755
+ return cos, sin
756
+
757
+
758
+ def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
759
+ head_size = x.size(-1)
760
+ x1 = x[..., : head_size // 2] # (B, nh, T, hs/2)
761
+ x2 = x[..., head_size // 2 :] # (B, nh, T, hs/2)
762
+ rotated = torch.cat((-x2, x1), dim=-1) # (B, nh, T, hs)
763
+ roped = (x * cos) + (rotated * sin)
764
+ return roped.type_as(x)
765
+
766
+
767
+
768
+
769
+ import torch
770
+ # Copyright (c) 2022, Tri Dao.
771
+ # Adapted from https://github.com/NVIDIA/apex/blob/master/apex/contrib/layer_norm/layer_norm.py AND https://github.com/Dao-AILab/flash-attention/blob/7a983df74215e035e566e37125b0a71e3618f39d/flash_attn/ops/layer_norm.py#L16
772
+
773
+ import dropout_layer_norm
774
+ import torch
775
+ from torch.nn import init
776
+
777
+
778
+ def maybe_align(x, alignment_in_bytes=16):
779
+ """Assume that x already has last dim divisible by alignment_in_bytes"""
780
+ # TD [2023-07-04] I'm not 100% sure that clone will align the memory
781
+ # https://discuss.pytorch.org/t/how-to-ensure-that-tensor-data-ptr-is-aligned-to-16-bytes/183440
782
+ return x if x.data_ptr() % alignment_in_bytes == 0 else x.clone()
783
+
784
+
785
+ def _dropout_add_layer_norm_forward(
786
+ x0,
787
+ residual,
788
+ gamma,
789
+ beta,
790
+ rowscale,
791
+ colscale,
792
+ dropout_p,
793
+ epsilon,
794
+ residual_in_fp32=False,
795
+ is_rms_norm=False,
796
+ ):
797
+ """Assume that arguments are contiguous and aligned to 16 bytes"""
798
+ hidden_size = gamma.numel()
799
+ x0mat = x0.view((-1, hidden_size))
800
+ residualmat = residual.view((-1, hidden_size)) if residual is not None else None
801
+ rowscale = rowscale.view(-1) if rowscale is not None else None
802
+ zmat, xmat, dmask, mu, rsigma = dropout_layer_norm.dropout_add_ln_fwd(
803
+ x0mat,
804
+ residualmat,
805
+ gamma,
806
+ beta,
807
+ rowscale,
808
+ colscale,
809
+ None,
810
+ None,
811
+ dropout_p,
812
+ epsilon,
813
+ 1.0,
814
+ 0,
815
+ None,
816
+ residual_in_fp32,
817
+ is_rms_norm,
818
+ )
819
+ # dmask is None if dropout_p == 0.0
820
+ # xmat is None if dropout_p == 0.0 and residual is None and residual_dtype != input_dtype
821
+ return zmat, xmat if xmat is not None else x0mat, dmask, mu, rsigma
822
+
823
+
824
+ def _dropout_add_layer_norm_backward(
825
+ dz,
826
+ dx,
827
+ x,
828
+ x0,
829
+ dmask,
830
+ mu,
831
+ rsigma,
832
+ gamma,
833
+ rowscale,
834
+ colscale,
835
+ dropout_p,
836
+ has_residual,
837
+ is_rms_norm=False,
838
+ ):
839
+ """Assume that arguments are contiguous and aligned to 16 bytes
840
+ dx == None means that it was a post-norm architecture
841
+ (x = drop(x0) + residual was not returned in the fwd).
842
+ x0 must not be None if we have colscale.
843
+ """
844
+ hidden_size = gamma.numel()
845
+ xmat = x.view((-1, hidden_size))
846
+ dzmat = dz.view(xmat.shape)
847
+ dxmat = dx.view(xmat.shape) if dx is not None else None
848
+ x0mat = x0.view((-1, hidden_size)) if x0 is not None else None
849
+ rowscale = rowscale.view(-1) if rowscale is not None else None
850
+ if colscale is not None:
851
+ assert x0 is not None, "x0 is required to compute the gradient of colscale"
852
+ dx0mat, dresidualmat, dgamma, dbeta, _, _, *rest = dropout_layer_norm.dropout_add_ln_bwd(
853
+ dzmat,
854
+ dxmat,
855
+ xmat,
856
+ x0mat,
857
+ dmask,
858
+ mu,
859
+ rsigma,
860
+ gamma,
861
+ rowscale,
862
+ colscale,
863
+ None,
864
+ None,
865
+ dropout_p,
866
+ 1.0,
867
+ 0,
868
+ has_residual,
869
+ is_rms_norm,
870
+ )
871
+ # dresidualmat is None if not has_residual
872
+ if colscale is None:
873
+ return dx0mat, dresidualmat, dgamma, dbeta
874
+ else:
875
+ dcolscale = rest[0]
876
+ return dx0mat, dresidualmat, dgamma, dbeta, dcolscale
877
+
878
+
879
+ def _dropout_add_layer_norm_subset_forward(
880
+ x0,
881
+ residual,
882
+ gamma,
883
+ beta,
884
+ colscale,
885
+ x0_subset,
886
+ out_subset,
887
+ dropout_p,
888
+ epsilon,
889
+ rowscale_const,
890
+ out_numrows,
891
+ residual_in_fp32=False,
892
+ is_rms_norm=False,
893
+ ):
894
+ """Assume that arguments are contiguous and aligned to 16 bytes"""
895
+ hidden_size = gamma.numel()
896
+ x0mat = x0.view((-1, hidden_size))
897
+ residualmat = residual.view((-1, hidden_size)) if residual is not None else None
898
+ x0_subset = x0_subset.view(-1) if x0_subset is not None else None
899
+ out_subset = out_subset.view(-1) if out_subset is not None else None
900
+ zmat, xmat, dmask, mu, rsigma = dropout_layer_norm.dropout_add_ln_fwd(
901
+ x0mat,
902
+ residualmat,
903
+ gamma,
904
+ beta,
905
+ None,
906
+ colscale,
907
+ x0_subset,
908
+ out_subset,
909
+ dropout_p,
910
+ epsilon,
911
+ rowscale_const,
912
+ out_numrows,
913
+ None,
914
+ residual_in_fp32,
915
+ is_rms_norm,
916
+ )
917
+ # dmask is None if dropout_p == 0.0
918
+ # xmat is None if dropout_p == 0.0 and residual is None and residual_dtype != input_dtype
919
+ return zmat, xmat if xmat is not None else x0mat, dmask, mu, rsigma
920
+
921
+
922
+ def _dropout_add_layer_norm_subset_backward(
923
+ dz,
924
+ dx,
925
+ x,
926
+ x0,
927
+ dmask,
928
+ mu,
929
+ rsigma,
930
+ gamma,
931
+ colscale,
932
+ x0_subset,
933
+ out_subset,
934
+ dropout_p,
935
+ rowscale_const,
936
+ x0_numrows,
937
+ has_residual,
938
+ is_rms_norm=False,
939
+ ):
940
+ """Assume that arguments are contiguous and aligned to 16 bytes
941
+ dx == None means that it was a post-norm architecture
942
+ (x = drop(x0) + residual was not returned in the fwd).
943
+ x0 must not be None if we have colscale.
944
+ """
945
+ hidden_size = gamma.numel()
946
+ xmat = x.view((-1, hidden_size))
947
+ dzmat = dz.view(-1, hidden_size)
948
+ dxmat = dx.view(xmat.shape) if dx is not None else None
949
+ x0mat = x0.view((-1, hidden_size)) if x0 is not None else None
950
+ x0_subset = x0_subset.view(-1) if x0_subset is not None else None
951
+ out_subset = out_subset.view(-1) if out_subset is not None else None
952
+ if colscale is not None:
953
+ assert x0 is not None, "x0 is required to compute the gradient of colscale"
954
+ dx0mat, dresidualmat, dgamma, dbeta, _, _, *rest = dropout_layer_norm.dropout_add_ln_bwd(
955
+ dzmat,
956
+ dxmat,
957
+ xmat,
958
+ x0mat,
959
+ dmask,
960
+ mu,
961
+ rsigma,
962
+ gamma,
963
+ None,
964
+ colscale,
965
+ x0_subset,
966
+ out_subset,
967
+ dropout_p,
968
+ rowscale_const,
969
+ x0_numrows,
970
+ has_residual,
971
+ is_rms_norm,
972
+ )
973
+ # dresidualmat is None if not has_residual
974
+ if colscale is None:
975
+ return dx0mat, dresidualmat, dgamma, dbeta
976
+ else:
977
+ dcolscale = rest[0]
978
+ return dx0mat, dresidualmat, dgamma, dbeta, dcolscale
979
+
980
+
981
+ def _dropout_add_layer_norm_parallel_residual_forward(
982
+ x0,
983
+ x1,
984
+ residual,
985
+ gamma0,
986
+ beta0,
987
+ gamma1,
988
+ beta1,
989
+ dropout_p,
990
+ epsilon,
991
+ residual_in_fp32=False,
992
+ is_rms_norm=False,
993
+ ):
994
+ """Assume that arguments are contiguous and aligned to 16 bytes"""
995
+ hidden_size = gamma0.numel()
996
+ x0mat = x0.view((-1, hidden_size))
997
+ x1mat = x1.view((-1, hidden_size)) if x1 is not None else None
998
+ residualmat = residual.view((-1, hidden_size)) if residual is not None else None
999
+ (
1000
+ z0mat,
1001
+ z1mat,
1002
+ xmat,
1003
+ dmask0,
1004
+ dmask1,
1005
+ mu,
1006
+ rsigma,
1007
+ ) = dropout_layer_norm.dropout_add_ln_parallel_residual_fwd(
1008
+ x0mat,
1009
+ x1mat,
1010
+ residualmat,
1011
+ gamma0,
1012
+ beta0,
1013
+ gamma1,
1014
+ beta1,
1015
+ dropout_p,
1016
+ epsilon,
1017
+ None,
1018
+ residual_in_fp32,
1019
+ is_rms_norm,
1020
+ )
1021
+ # dmask0 and dmask1 are None if dropout_p == 0.0
1022
+ # xmat is None if dropout_p == 0.0 and residual is None and residual_dtype != input_dtype
1023
+ return z0mat, z1mat, xmat if xmat is not None else x0mat, dmask0, dmask1, mu, rsigma
1024
+
1025
+
1026
+ def _dropout_add_layer_norm_parallel_residual_backward(
1027
+ dz0,
1028
+ dz1,
1029
+ dx,
1030
+ x,
1031
+ dmask0,
1032
+ dmask1,
1033
+ mu,
1034
+ rsigma,
1035
+ gamma0,
1036
+ gamma1,
1037
+ dropout_p,
1038
+ has_x1,
1039
+ has_residual,
1040
+ is_rms_norm=False,
1041
+ ):
1042
+ """Assume that arguments are contiguous and aligned to 16 bytes
1043
+ dx == None means that it was a post-norm architecture
1044
+ (x = drop(x0) + residual was not returned in the fwd).
1045
+ """
1046
+ hidden_size = gamma0.numel()
1047
+ xmat = x.view((-1, hidden_size))
1048
+ dz0mat = dz0.view(xmat.shape)
1049
+ dz1mat = dz1.view(xmat.shape) if dz1 is not None else None
1050
+ dxmat = dx.view(xmat.shape) if dx is not None else None
1051
+ (
1052
+ dx0mat,
1053
+ dx1mat,
1054
+ dresidualmat,
1055
+ dgamma0,
1056
+ dbeta0,
1057
+ dgamma1,
1058
+ dbeta1,
1059
+ *rest,
1060
+ ) = dropout_layer_norm.dropout_add_ln_parallel_residual_bwd(
1061
+ dz0mat,
1062
+ dz1mat,
1063
+ dxmat,
1064
+ xmat,
1065
+ dmask0,
1066
+ dmask1,
1067
+ mu,
1068
+ rsigma,
1069
+ gamma0,
1070
+ gamma1,
1071
+ dropout_p,
1072
+ has_x1,
1073
+ has_residual,
1074
+ is_rms_norm,
1075
+ )
1076
+ # dresidualmat is None if not has_residual
1077
+ return dx0mat, dx1mat, dresidualmat, dgamma0, dbeta0, dgamma1, dbeta1
1078
+
1079
+
1080
+ class DropoutAddLayerNormFn(torch.autograd.Function):
1081
+ @staticmethod
1082
+ def forward(
1083
+ ctx,
1084
+ x0,
1085
+ residual,
1086
+ gamma,
1087
+ beta,
1088
+ rowscale,
1089
+ colscale,
1090
+ dropout_p,
1091
+ epsilon,
1092
+ residual_in_fp32=False,
1093
+ prenorm=False,
1094
+ is_rms_norm=False,
1095
+ return_dmask=False,
1096
+ ):
1097
+ x0 = maybe_align(x0.contiguous(), 16)
1098
+ residual = maybe_align(residual.contiguous(), 16) if residual is not None else None
1099
+ gamma = maybe_align(gamma.contiguous(), 16)
1100
+ beta = maybe_align(beta.contiguous(), 16) if beta is not None else None
1101
+ rowscale = maybe_align(rowscale.contiguous(), 16) if rowscale is not None else None
1102
+ colscale = maybe_align(colscale.contiguous(), 16) if colscale is not None else None
1103
+ zmat, xmat, dmask, mu, rsigma = _dropout_add_layer_norm_forward(
1104
+ x0,
1105
+ residual,
1106
+ gamma,
1107
+ beta,
1108
+ rowscale,
1109
+ colscale,
1110
+ dropout_p,
1111
+ epsilon,
1112
+ residual_in_fp32,
1113
+ is_rms_norm,
1114
+ )
1115
+ # Only need to save x0 if we need to compute gradient wrt colscale
1116
+ x0_saved = x0 if colscale is not None else None
1117
+ ctx.save_for_backward(
1118
+ xmat.view(x0.shape), x0_saved, dmask, gamma, mu, rsigma, rowscale, colscale
1119
+ )
1120
+ ctx.prenorm = prenorm
1121
+ ctx.dropout_p = dropout_p
1122
+ ctx.has_residual = residual is not None
1123
+ ctx.is_rms_norm = is_rms_norm
1124
+ ctx.has_beta = beta is not None
1125
+ if not return_dmask:
1126
+ return (
1127
+ zmat.view(x0.shape) if not prenorm else (zmat.view(x0.shape), xmat.view(x0.shape))
1128
+ )
1129
+ else:
1130
+ dmask = (
1131
+ dmask.view(x0.shape)
1132
+ if dropout_p > 0.0
1133
+ else torch.ones(x0.shape, dtype=torch.uint8, device=x0.device)
1134
+ )
1135
+ ctx.mark_non_differentiable(dmask)
1136
+ return (
1137
+ (zmat.view(x0.shape), dmask)
1138
+ if not prenorm
1139
+ else (zmat.view(x0.shape), xmat.view(x0.shape), dmask)
1140
+ )
1141
+
1142
+ @staticmethod
1143
+ def backward(ctx, dz, *args):
1144
+ # assert dz.is_contiguous()
1145
+ dz = maybe_align(dz.contiguous(), 16) # this happens!
1146
+ dx = maybe_align(args[0].contiguous(), 16) if ctx.prenorm else None
1147
+ x, x0, dmask, gamma, mu, rsigma, rowscale, colscale = ctx.saved_tensors
1148
+ # x0 is None if colscale is None
1149
+ dropout_p = ctx.dropout_p
1150
+ has_residual = ctx.has_residual
1151
+ dx0mat, dresidualmat, dgamma, dbeta, *rest = _dropout_add_layer_norm_backward(
1152
+ dz,
1153
+ dx,
1154
+ x,
1155
+ x0,
1156
+ dmask,
1157
+ mu,
1158
+ rsigma,
1159
+ gamma,
1160
+ rowscale,
1161
+ colscale,
1162
+ dropout_p,
1163
+ has_residual,
1164
+ ctx.is_rms_norm,
1165
+ )
1166
+ dx0 = dx0mat.view(x.shape)
1167
+ dresidual = dresidualmat.view(x.shape) if dresidualmat is not None else None
1168
+ dcolscale = rest[0] if colscale is not None else None
1169
+ return (
1170
+ dx0,
1171
+ dresidual,
1172
+ dgamma,
1173
+ dbeta if ctx.has_beta else None,
1174
+ None,
1175
+ dcolscale,
1176
+ None,
1177
+ None,
1178
+ None,
1179
+ None,
1180
+ None,
1181
+ None,
1182
+ )
1183
+
1184
+
1185
+ class DropoutAddLayerNormSubsetFn(torch.autograd.Function):
1186
+ @staticmethod
1187
+ def forward(
1188
+ ctx,
1189
+ x0,
1190
+ residual,
1191
+ gamma,
1192
+ beta,
1193
+ colscale,
1194
+ x0_subset,
1195
+ out_subset,
1196
+ dropout_p,
1197
+ epsilon,
1198
+ rowscale_const,
1199
+ out_numrows,
1200
+ residual_in_fp32=False,
1201
+ prenorm=False,
1202
+ is_rms_norm=False,
1203
+ return_dmask=False,
1204
+ ):
1205
+ x0 = maybe_align(x0.contiguous(), 16)
1206
+ residual = maybe_align(residual.contiguous(), 16) if residual is not None else None
1207
+ gamma = maybe_align(gamma.contiguous(), 16)
1208
+ beta = maybe_align(beta.contiguous(), 16) if beta is not None else None
1209
+ colscale = maybe_align(colscale.contiguous(), 16) if colscale is not None else None
1210
+ zmat, xmat, dmask, mu, rsigma = _dropout_add_layer_norm_subset_forward(
1211
+ x0,
1212
+ residual,
1213
+ gamma,
1214
+ beta,
1215
+ colscale,
1216
+ x0_subset,
1217
+ out_subset,
1218
+ dropout_p,
1219
+ epsilon,
1220
+ rowscale_const,
1221
+ out_numrows,
1222
+ residual_in_fp32,
1223
+ is_rms_norm,
1224
+ )
1225
+ # Only need to save x0 if we need to compute gradient wrt colscale
1226
+ x0_saved = x0 if colscale is not None else None
1227
+ x_shape = (-1, *x0.shape[1:])
1228
+ ctx.save_for_backward(
1229
+ xmat.view(x_shape), x0_saved, dmask, gamma, mu, rsigma, colscale, x0_subset, out_subset
1230
+ )
1231
+ ctx.prenorm = prenorm
1232
+ ctx.dropout_p = dropout_p
1233
+ ctx.rowscale_const = rowscale_const
1234
+ ctx.x0_numrows = x0.shape[:-1].numel()
1235
+ ctx.has_residual = residual is not None
1236
+ ctx.is_rms_norm = is_rms_norm
1237
+ ctx.has_beta = beta is not None
1238
+ z_shape = (-1, *x0.shape[1:])
1239
+ if not return_dmask:
1240
+ return zmat.view(z_shape) if not prenorm else (zmat.view(z_shape), xmat.view(x0.shape))
1241
+ else:
1242
+ z = zmat.view(z_shape)
1243
+ dmask = (
1244
+ dmask.view(x0.shape)
1245
+ if dropout_p > 0.0
1246
+ else torch.ones(x0.shape, dtype=torch.uint8, device=x0.device)
1247
+ )
1248
+ ctx.mark_non_differentiable(dmask)
1249
+ return (z, dmask) if not prenorm else (z, xmat.view(x_shape), dmask)
1250
+
1251
+ @staticmethod
1252
+ def backward(ctx, dz, *args):
1253
+ # assert dz.is_contiguous()
1254
+ dz = maybe_align(dz.contiguous(), 16) # this happens!
1255
+ dx = maybe_align(args[0].contiguous(), 16) if ctx.prenorm else None
1256
+ x, x0, dmask, gamma, mu, rsigma, colscale, x0_subset, out_subset = ctx.saved_tensors
1257
+ # x0 is None if colscale is None
1258
+ dropout_p = ctx.dropout_p
1259
+ has_residual = ctx.has_residual
1260
+ dx0mat, dresidualmat, dgamma, dbeta, *rest = _dropout_add_layer_norm_subset_backward(
1261
+ dz,
1262
+ dx,
1263
+ x,
1264
+ x0,
1265
+ dmask,
1266
+ mu,
1267
+ rsigma,
1268
+ gamma,
1269
+ colscale,
1270
+ x0_subset,
1271
+ out_subset,
1272
+ dropout_p,
1273
+ ctx.rowscale_const,
1274
+ ctx.x0_numrows,
1275
+ has_residual,
1276
+ ctx.is_rms_norm,
1277
+ )
1278
+ dx0 = dx0mat.view(-1, *x.shape[1:])
1279
+ dresidual = dresidualmat.view(x.shape) if dresidualmat is not None else None
1280
+ dcolscale = rest[0] if colscale is not None else None
1281
+ return (
1282
+ dx0,
1283
+ dresidual,
1284
+ dgamma,
1285
+ dbeta if ctx.has_beta else None,
1286
+ dcolscale,
1287
+ None,
1288
+ None,
1289
+ None,
1290
+ None,
1291
+ None,
1292
+ None,
1293
+ None,
1294
+ None,
1295
+ None,
1296
+ None,
1297
+ )
1298
+
1299
+
1300
+ class DropoutAddLayerNormParallelResidualFn(torch.autograd.Function):
1301
+ @staticmethod
1302
+ def forward(
1303
+ ctx,
1304
+ x0,
1305
+ x1,
1306
+ residual,
1307
+ gamma0,
1308
+ beta0,
1309
+ gamma1,
1310
+ beta1,
1311
+ dropout_p,
1312
+ epsilon,
1313
+ residual_in_fp32=False,
1314
+ prenorm=False,
1315
+ is_rms_norm=False,
1316
+ return_dmask=False,
1317
+ ):
1318
+ x0 = maybe_align(x0.contiguous(), 16)
1319
+ x1 = maybe_align(x1.contiguous(), 16) if x1 is not None else None
1320
+ residual = maybe_align(residual.contiguous(), 16) if residual is not None else None
1321
+ gamma0 = maybe_align(gamma0.contiguous(), 16)
1322
+ beta0 = maybe_align(beta0.contiguous(), 16) if beta0 is not None else None
1323
+ gamma1 = maybe_align(gamma1.contiguous(), 16) if gamma1 is not None else None
1324
+ beta1 = maybe_align(beta1.contiguous(), 16) if beta1 is not None else None
1325
+ (
1326
+ z0mat,
1327
+ z1mat,
1328
+ xmat,
1329
+ dmask0,
1330
+ dmask1,
1331
+ mu,
1332
+ rsigma,
1333
+ ) = _dropout_add_layer_norm_parallel_residual_forward(
1334
+ x0,
1335
+ x1,
1336
+ residual,
1337
+ gamma0,
1338
+ beta0,
1339
+ gamma1,
1340
+ beta1,
1341
+ dropout_p,
1342
+ epsilon,
1343
+ residual_in_fp32,
1344
+ is_rms_norm,
1345
+ )
1346
+ ctx.save_for_backward(xmat.view(x0.shape), dmask0, dmask1, gamma0, gamma1, mu, rsigma)
1347
+ ctx.prenorm = prenorm
1348
+ ctx.dropout_p = dropout_p
1349
+ ctx.has_x1 = x1 is not None
1350
+ ctx.has_residual = residual is not None
1351
+ ctx.is_rms_norm = is_rms_norm
1352
+ ctx.has_beta = beta0 is not None
1353
+ z = (z0mat.view(x0.shape), z1mat.view(x0.shape) if z1mat is not None else None)
1354
+ if not return_dmask:
1355
+ return z if not prenorm else (*z, xmat.view(x0.shape))
1356
+ else:
1357
+ dmask0 = (
1358
+ dmask0.view(x0.shape)
1359
+ if dropout_p > 0.0
1360
+ else torch.ones(x0.shape, dtype=torch.uint8, device=x0.device)
1361
+ )
1362
+ dmask1 = (
1363
+ dmask1.view(x0.shape)
1364
+ if dropout_p > 0.0 and x1 is not None
1365
+ else torch.ones(x0.shape, dtype=torch.uint8, device=x0.device)
1366
+ )
1367
+ ctx.mark_non_differentiable(dmask0)
1368
+ ctx.mark_non_differentiable(dmask1)
1369
+ return (
1370
+ (*z, dmask0, dmask1) if not prenorm else (*z, xmat.view(x0.shape), dmask0, dmask1)
1371
+ )
1372
+
1373
+ @staticmethod
1374
+ def backward(ctx, dz0, dz1, *args):
1375
+ dz0 = maybe_align(dz0.contiguous(), 16) # this happens!
1376
+ dz1 = maybe_align(dz1.contiguous(), 16) if dz1 is not None else None
1377
+ dx = maybe_align(args[0].contiguous(), 16) if ctx.prenorm else None
1378
+ x, dmask0, dmask1, gamma0, gamma1, mu, rsigma = ctx.saved_tensors
1379
+ dropout_p = ctx.dropout_p
1380
+ has_x1 = ctx.has_x1
1381
+ has_residual = ctx.has_residual
1382
+ (
1383
+ dx0mat,
1384
+ dx1mat,
1385
+ dresidualmat,
1386
+ dgamma0,
1387
+ dbeta0,
1388
+ dgamma1,
1389
+ dbeta1,
1390
+ ) = _dropout_add_layer_norm_parallel_residual_backward(
1391
+ dz0,
1392
+ dz1,
1393
+ dx,
1394
+ x,
1395
+ dmask0,
1396
+ dmask1,
1397
+ mu,
1398
+ rsigma,
1399
+ gamma0,
1400
+ gamma1,
1401
+ dropout_p,
1402
+ has_x1,
1403
+ has_residual,
1404
+ ctx.is_rms_norm,
1405
+ )
1406
+ dx0 = dx0mat.view(x.shape)
1407
+ dx1 = dx1mat.view(x.shape) if dx1mat is not None else None
1408
+ dresidual = dresidualmat.view(x.shape) if dresidualmat is not None else None
1409
+ return (
1410
+ dx0,
1411
+ dx1,
1412
+ dresidual,
1413
+ dgamma0,
1414
+ dbeta0 if ctx.has_beta else None,
1415
+ dgamma1,
1416
+ dbeta1 if ctx.has_beta else None,
1417
+ None,
1418
+ None,
1419
+ None,
1420
+ None,
1421
+ None,
1422
+ None,
1423
+ )
1424
+
1425
+
1426
+ def layer_norm(x, weight, bias, epsilon):
1427
+ return DropoutAddLayerNormFn.apply(x, None, weight, bias, None, None, 0.0, epsilon, False)
1428
+
1429
+
1430
+ def dropout_add_layer_norm(
1431
+ x0,
1432
+ residual,
1433
+ weight,
1434
+ bias,
1435
+ dropout_p,
1436
+ epsilon,
1437
+ rowscale=None,
1438
+ layerscale=None,
1439
+ prenorm=False,
1440
+ residual_in_fp32=False,
1441
+ return_dropout_mask=False,
1442
+ ):
1443
+ """residual_in_fp32 only has an effect if residual is None.
1444
+ Otherwise residual dtype is residual.dtype.
1445
+ """
1446
+ return DropoutAddLayerNormFn.apply(
1447
+ x0,
1448
+ residual,
1449
+ weight,
1450
+ bias,
1451
+ rowscale,
1452
+ layerscale,
1453
+ dropout_p,
1454
+ epsilon,
1455
+ residual_in_fp32,
1456
+ prenorm,
1457
+ False,
1458
+ return_dropout_mask,
1459
+ )
1460
+
1461
+
1462
+ def dropout_add_layer_norm_subset(
1463
+ x0,
1464
+ residual,
1465
+ weight,
1466
+ bias,
1467
+ dropout_p,
1468
+ epsilon,
1469
+ layerscale=None,
1470
+ x0_subset=None,
1471
+ out_subset=None,
1472
+ rowscale_const=1.0,
1473
+ out_numrows=0,
1474
+ prenorm=False,
1475
+ residual_in_fp32=False,
1476
+ return_dropout_mask=False,
1477
+ ):
1478
+ """residual_in_fp32 only has an effect if residual is None.
1479
+ Otherwise residual dtype is residual.dtype.
1480
+ """
1481
+ return DropoutAddLayerNormSubsetFn.apply(
1482
+ x0,
1483
+ residual,
1484
+ weight,
1485
+ bias,
1486
+ layerscale,
1487
+ x0_subset,
1488
+ out_subset,
1489
+ dropout_p,
1490
+ epsilon,
1491
+ rowscale_const,
1492
+ out_numrows,
1493
+ residual_in_fp32,
1494
+ prenorm,
1495
+ False,
1496
+ return_dropout_mask,
1497
+ )
1498
+
1499
+
1500
+ def dropout_add_layer_norm_parallel_residual(
1501
+ x0,
1502
+ x1,
1503
+ residual,
1504
+ weight0,
1505
+ bias0,
1506
+ weight1,
1507
+ bias1,
1508
+ dropout_p,
1509
+ epsilon,
1510
+ prenorm=False,
1511
+ residual_in_fp32=False,
1512
+ return_dropout_mask=False,
1513
+ ):
1514
+ """residual_in_fp32 only has an effect if residual is None.
1515
+ Otherwise residual dtype is residual.dtype.
1516
+ """
1517
+ return DropoutAddLayerNormParallelResidualFn.apply(
1518
+ x0,
1519
+ x1,
1520
+ residual,
1521
+ weight0,
1522
+ bias0,
1523
+ weight1,
1524
+ bias1,
1525
+ dropout_p,
1526
+ epsilon,
1527
+ residual_in_fp32,
1528
+ prenorm,
1529
+ False,
1530
+ return_dropout_mask,
1531
+ )
1532
+
1533
+
1534
+ class DropoutAddLayerNorm(torch.nn.Module):
1535
+ def __init__(
1536
+ self,
1537
+ hidden_size,
1538
+ prenorm=False,
1539
+ p=0.0,
1540
+ eps=1e-5,
1541
+ residual_in_fp32=False,
1542
+ device=None,
1543
+ dtype=None,
1544
+ ):
1545
+ factory_kwargs = {"device": device, "dtype": dtype}
1546
+ super().__init__()
1547
+ self.prenorm = prenorm
1548
+ self.p = p
1549
+ self.eps = eps
1550
+ self.residual_in_fp32 = residual_in_fp32
1551
+ self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
1552
+ self.bias = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
1553
+ self.reset_parameters()
1554
+
1555
+ def reset_parameters(self):
1556
+ init.ones_(self.weight)
1557
+ init.zeros_(self.bias)
1558
+
1559
+ def forward(self, x0, residual=None):
1560
+ return dropout_add_layer_norm(
1561
+ x0,
1562
+ residual,
1563
+ self.weight,
1564
+ self.bias,
1565
+ self.p if self.training else 0.0,
1566
+ self.eps,
1567
+ prenorm=self.prenorm,
1568
+ residual_in_fp32=self.residual_in_fp32,
1569
+ )
1570
+
1571
+ def rms_norm(x, weight, epsilon):
1572
+ return DropoutAddLayerNormFn.apply(
1573
+ x, None, weight, None, None, None, 0.0, epsilon, False, False, True
1574
+ )
1575
+ class FusedRMSNorm(torch.nn.Module):
1576
+ def __init__(self, size: int, dim: int = -1, eps: float = 1e-5):
1577
+ super().__init__()
1578
+ self.eps = eps
1579
+ self.weight = torch.nn.Parameter(torch.ones(size))
1580
+ self.dim = dim
1581
+ self.reset_parameters()
1582
+
1583
+ def reset_parameters(self):
1584
+ init.ones_(self.weight)
1585
+
1586
+ def forward(self, x):
1587
+ return rms_norm(x, self.weight, self.eps)
1588
+
1589
+
1590
+ class RMSNorm(torch.nn.Module):
1591
+ """Root Mean Square Layer Normalization.
1592
+
1593
+ Derived from https://github.com/bzhangGo/rmsnorm/blob/master/rmsnorm_torch.py. BSD 3-Clause License:
1594
+ https://github.com/bzhangGo/rmsnorm/blob/master/LICENSE.
1595
+ """
1596
+
1597
+ def __init__(self, size: int, dim: int = -1, eps: float = 1e-5) -> None:
1598
+ super().__init__()
1599
+ self.weight = torch.nn.Parameter(torch.ones(size))
1600
+ self.eps = eps
1601
+ self.dim = dim
1602
+
1603
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1604
+ # NOTE: the original RMSNorm paper implementation is not equivalent
1605
+ norm_x = torch.mean(x * x, dim=self.dim, keepdim=True)
1606
+ x_normed = x * torch.rsqrt(norm_x + self.eps)
1607
+ return self.weight * x_normed
1608
+
1609
+ def reset_parameters(self):
1610
+ torch.nn.init.ones_(self.weight)
1611
+
1612
+
1613
+
1614
+
1615
+
1616
+
1617
+ # Copyright (c) 2023, Tri Dao.
1618
+
1619
+ import math
1620
+ from typing import Optional, Tuple
1621
+
1622
+ import rotary_emb
1623
+ import torch
1624
+ from einops import rearrange, repeat
1625
+
1626
+ class ApplyRotaryEmb(torch.autograd.Function):
1627
+ @staticmethod
1628
+ def forward(ctx, x, cos, sin, interleaved=False, inplace=False,future_token = 0):
1629
+ """
1630
+ x: (batch_size, seqlen, nheads, headdim)
1631
+ cos, sin: (seqlen, rotary_dim / 2)
1632
+ interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
1633
+ of 1st half and 2nd half (GPT-NeoX style).
1634
+ rotary_dim must be <= headdim
1635
+ Apply rotary embedding to the first rotary_dim of x.
1636
+ """
1637
+ batch, seqlen, nheads, headdim = x.shape
1638
+ rotary_seqlen, rotary_dim = cos.shape
1639
+ rotary_dim *= 2
1640
+
1641
+
1642
+ # print('debug', x.shape, cos.shape)
1643
+ # debug output: torch.Size([224, 96, 12, 64]) torch.Size([1, 32])
1644
+ # debug output: 2049 2048
1645
+ assert rotary_dim <= headdim
1646
+ # print(seqlen,rotary_seqlen)
1647
+ assert seqlen <= rotary_seqlen
1648
+ assert sin.shape == (rotary_seqlen, rotary_dim // 2)
1649
+ x_ro = x[..., :rotary_dim]
1650
+ x1, x2 = x_ro.chunk(2, dim=-1) if not interleaved else (x_ro[..., ::2], x_ro[..., 1::2])
1651
+ out = torch.empty_like(x) if not inplace else x
1652
+ out_ro = out[..., :rotary_dim]
1653
+ if inplace:
1654
+ o1, o2 = x1, x2
1655
+ else:
1656
+ o1, o2 = (
1657
+ out_ro.chunk(2, dim=-1)
1658
+ if not interleaved
1659
+ else (out_ro[..., ::2], out_ro[..., 1::2])
1660
+ )
1661
+ rotary_emb.apply_rotary(
1662
+ x1,
1663
+ x2,
1664
+ rearrange(cos[:seqlen], "s d -> s 1 d"),
1665
+ rearrange(sin[:seqlen], "s d -> s 1 d"),
1666
+ o1,
1667
+ o2,
1668
+ False,
1669
+ )
1670
+ if not inplace and rotary_dim < headdim:
1671
+ out[..., rotary_dim:].copy_(x[..., rotary_dim:])
1672
+ ctx.save_for_backward(cos, sin)
1673
+ ctx.interleaved = interleaved
1674
+ ctx.inplace = inplace
1675
+ return out if not inplace else x
1676
+
1677
+ @staticmethod
1678
+ def backward(ctx, do):
1679
+ cos, sin = ctx.saved_tensors
1680
+ _, seqlen, _, headdim = do.shape
1681
+ rotary_dim = cos.shape[-1]
1682
+ rotary_dim *= 2
1683
+ inplace = ctx.inplace
1684
+ do_ro = do[..., :rotary_dim]
1685
+ do1, do2 = (
1686
+ do_ro.chunk(2, dim=-1) if not ctx.interleaved else (do_ro[..., ::2], do_ro[..., 1::2])
1687
+ )
1688
+ dx = torch.empty_like(do) if not inplace else do
1689
+ if inplace:
1690
+ dx1, dx2 = do1, do2
1691
+ else:
1692
+ dx_ro = dx[..., :rotary_dim]
1693
+ dx1, dx2 = (
1694
+ dx_ro.chunk(2, dim=-1)
1695
+ if not ctx.interleaved
1696
+ else (dx_ro[..., ::2], dx_ro[..., 1::2])
1697
+ )
1698
+ rotary_emb.apply_rotary(
1699
+ do1,
1700
+ do2,
1701
+ rearrange(cos[:seqlen], "s d -> s 1 d"),
1702
+ rearrange(sin[:seqlen], "s d -> s 1 d"),
1703
+ dx1,
1704
+ dx2,
1705
+ True,
1706
+ )
1707
+ if not inplace and rotary_dim < headdim:
1708
+ dx[..., rotary_dim:].copy_(do[..., rotary_dim:])
1709
+ return dx, None, None, None, None
1710
+
1711
+
1712
+ apply_rotary_emb_func = ApplyRotaryEmb.apply
1713
+
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:da44e17cefff6bb6b59af0cb6164a51e7eeda2dd625925cb11743e74eae8e812
+ size 72538452
model_config.py ADDED
@@ -0,0 +1,100 @@
+ from typing import List
+ from transformers import PretrainedConfig
+
+
+ class YingLongConfig(PretrainedConfig):
+     model_type = "yinglong"
+     # keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         # input_token_len: int = 1,
+         # hidden_size: int = 1024,
+         # intermediate_size: int = 2048,
+         # output_token_lens: List[int] = [1, 8, 32, 64],
+         # num_hidden_layers: int = 8,
+         # num_attention_heads: int = 8,
+         # hidden_act: str = "silu",
+         # use_cache: bool = True,
+         # rope_theta: int = 10000,
+         # attention_dropout: float = 0.0,
+         # initializer_range: float = 0.02,
+         # max_position_embeddings: int = 10000,
+         #####
+         bias=False,
+         condense_ratio=1,
+         haar_trans=True,
+         haar_trans_inv=True,
+         haar_trans_norm='backward',
+         half_diff=False,
+         intermediate_size=1024,
+         n_embd=256,
+         n_head=16,
+         n_layer=6,
+         n_query_groups=4,
+         norm_eps=1e-5,
+         org='Alibaba',
+         patch_size=32,
+         rope_base=10000,
+         rotary_percentage=1.0,
+         shared_attention_norm=False,
+         unet=True,
+         _mlp_class="LLaMAMLP",
+         _norm_class="FusedRMSNorm",
+         *args,
+         **kwargs,
+     ):
+
+         # self.input_token_len = input_token_len
+         # self.hidden_size = hidden_size
+         # self.intermediate_size = intermediate_size
+         # self.num_hidden_layers = num_hidden_layers
+         # self.num_attention_heads = num_attention_heads
+         # self.hidden_act = hidden_act
+         # self.output_token_lens = output_token_lens;
+         # self.use_cache = use_cache
+         # self.rope_theta = rope_theta
+         # self.attention_dropout = attention_dropout
+         # self.initializer_range = initializer_range
+         # self.max_position_embeddings = max_position_embeddings
+         self.org = 'Alibaba'
+         self.patch_size = patch_size
+         self.unet = unet
+
+         self.n_embd = n_embd
+         self.intermediate_size = intermediate_size
+         self.n_head = n_head
+         self.n_layer = n_layer
+         self.n_query_groups = n_query_groups
+         self.norm_eps = norm_eps
+         self.bias = bias
+         self.shared_attention_norm = shared_attention_norm
+
+         self.condense_ratio = condense_ratio
+         self.rope_base = rope_base
+         self.rotary_percentage = rotary_percentage
+
+         self.haar_trans = haar_trans
+         self.haar_trans_inv = haar_trans_inv
+         self.haar_trans_norm = haar_trans_norm
+         self.half_diff = half_diff
+
+         self._norm_class = _norm_class
+         self._mlp_class = _mlp_class
+
+         assert self.n_embd % self.n_head == 0
+         assert self.n_head % self.n_query_groups == 0
+
+         self.head_size = self.n_embd // self.n_head
+         self.rope_n_elem = int(self.rotary_percentage * self.head_size)
+         self.rope_condense_ratio = self.condense_ratio
+
+         super().__init__(
+             **kwargs,
+         )
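
A minimal sketch of constructing the config directly and reading the fields derived at the end of __init__ (the printed values follow from the keyword arguments shown here, not from the shipped config.json):

    cfg = YingLongConfig(n_embd=512, n_head=16, n_layer=8, n_query_groups=4, patch_size=32)
    print(cfg.head_size)     # 512 // 16 = 32
    print(cfg.rope_n_elem)   # int(1.0 * 32) = 32
    print(cfg.model_type)    # "yinglong"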