Upload folder using huggingface_hub

Browse files

Files changed (5) hide show

README.md +6 -0
configuration_qwen2.py +2 -2
modeling_beacon.py +53 -7
modeling_qwen2.py +41 -330
modeling_utils.py +493 -10

README.md CHANGED Viewed

@@ -16,6 +16,12 @@ pipeline_tag: text-generation
 - **Low-Cost**
   - it is light-weight and can be efficiently trained with roughly 1B tokens.
 # Usage
 ```python

 - **Low-Cost**
   - it is light-weight and can be efficiently trained with roughly 1B tokens.
+# Environment
+```
+pip install transformers
+pip install flash-attn --no-build-isolation
+```
 # Usage
 ```python

configuration_qwen2.py CHANGED Viewed

@@ -115,8 +115,8 @@ class Qwen2Config(PretrainedConfig):
         rope_scaling=None,
         max_window_layers=28,
         attention_dropout=0.0,
-        beacon_window=2048,
-        beacon_stride=2048,
         beacon_attn="full-coverage",
         beacon_ratio=[2,4,8,16,32],
         beacon_ratio_mix="step-random",

         rope_scaling=None,
         max_window_layers=28,
         attention_dropout=0.0,
+        beacon_window=1024,
+        beacon_stride=1024,
         beacon_attn="full-coverage",
         beacon_ratio=[2,4,8,16,32],
         beacon_ratio_mix="step-random",

modeling_beacon.py CHANGED Viewed

@@ -90,6 +90,10 @@ class Memory(torch.nn.Module):
         self.all_attention_mask = None
         self.all_labels = None
         # the raw activations of recent tokens
         self.raw_activations = [(None, None) for _ in range(self.config.num_hidden_layers)]
         # the attention sink activations
@@ -147,7 +151,7 @@ class Memory(torch.nn.Module):
             raw_memory_size += self.raw_activations[0][0].shape[self.k_seq_dim]
         return sink_memory_size, beacon_memory_size, raw_memory_size
-    def prepare(self, input_ids, attention_mask, labels):
         """
         Prepare inputs for the model. These inputs belong to the same sequence.
         """
@@ -179,6 +183,19 @@ class Memory(torch.nn.Module):
             else:
                 self.all_labels = torch.cat([self.all_labels, labels], dim=1)
             assert self.all_input_ids.shape[1] == self.all_labels.shape[1], f"Found inconsistent all_input_ids {self.all_input_ids.shape} and all_labels {self.all_labels.shape}!"
     def set_compression_ratio(self, start_idx, end_idx):
         """Choose a condensing ratio from self.config.beacon_ratio"""
@@ -399,10 +416,27 @@ class Memory(torch.nn.Module):
         # In the last window, we do not need to append beacons because they will not be used at all
         if self.training and end_idx == self.all_sequence_length:
             next_start_idx = start_idx
             raw_size_to_cache = -1
             beacon_size = 0
-            compression_ratio = 1
             is_full_window = False
         else:
             #============================================#
@@ -511,9 +545,9 @@ class Memory(torch.nn.Module):
                 # update the remainder
                 self._interleave_remainder = (input_len + self._interleave_remainder) % compression_ratio
-            # NOTE: skip computing loss in the very first window because the beacon tokens will be used in the next window
-            if self.training and self._step_idx == 0 and not (self.config.beacon_pos == 'interleave' and self.config.beacon_attn == 'full-coverage'):
-                labels[:] = -100
         # t2 = time.time()
@@ -607,12 +641,15 @@ class Memory(torch.nn.Module):
         self._end_idx = end_idx
         self._step_idx += 1
         # print(f"beacon_size:        {beacon_size}")
         # print(f"raw_size_to_cache:  {raw_size_to_cache}")
         # print(f"input_ids:          {input_ids}")
         # print(f"beacon_indices:     {beacon_indices}")
         # print(f"position_ids:       {position_ids}")
-        # print(f"attention_mask:\n{attention_mask}")
         # x = input()
         # if x == "s":
         #     return
@@ -627,6 +664,16 @@ class Memory(torch.nn.Module):
             # NOTE: the past_key_values are incrementally returned (only the new keys and values are returned)
             previous_raw_key, previous_raw_value = self.raw_activations[layer_idx]
             if self.beacon_activations[layer_idx][0] is None and self.config.beacon_sink_size > 0:
                 # save the sink activations
                 # NOTE: we do not slice the key/value activations, which may cause duplication when beacon_ratio=-1 for the first window, but it's okay
@@ -696,7 +743,6 @@ class Memory(torch.nn.Module):
             # NOTE: we must use dict to override values, otherwise trainer cannot find loss
             model_outputs["loss"] = loss
             model_outputs["batch_loss"] = batch_loss
-            model_outputs["valid_token_num"] = self._valid_token_num
         # override last_hidden_states (used in generation)
         beacon_size = self._all_beacon_sizes[-1]

         self.all_attention_mask = None
         self.all_labels = None
+        # NOTE: will be reset in prepare()
+        self.beacon_skip_first = None
+        self.beacon_skip_last = None
         # the raw activations of recent tokens
         self.raw_activations = [(None, None) for _ in range(self.config.num_hidden_layers)]
         # the attention sink activations
             raw_memory_size += self.raw_activations[0][0].shape[self.k_seq_dim]
         return sink_memory_size, beacon_memory_size, raw_memory_size
+    def prepare(self, input_ids, attention_mask, labels, skip_first=None, skip_last=None):
         """
         Prepare inputs for the model. These inputs belong to the same sequence.
         """
             else:
                 self.all_labels = torch.cat([self.all_labels, labels], dim=1)
             assert self.all_input_ids.shape[1] == self.all_labels.shape[1], f"Found inconsistent all_input_ids {self.all_input_ids.shape} and all_labels {self.all_labels.shape}!"
+        # how many tokens to skip at the beginning of the sequence? (They will be packed in a single chunk and processed by the model, after which their activations will be cached in sink_activations.)
+        if skip_first is not None:
+            assert self.config.beacon_parallel_window == 1, f"Make sure the parallel window is set to 1 when using beacon_skip!"
+            assert self.config.beacon_window == self.config.beacon_stride, f"Make sure the beacon_window equals to beacon_stride when using beacon_skip."
+            assert self.config.beacon_sink_size == 0, f"Make sure the beacon_sink_size is set to 0 when using beacon_skip!"
+        # token position from which compression stops: windows starting at or after skip_last are processed without beacons
+        if skip_last is not None:
+            skip_first = skip_first if skip_first is not None else 0
+            assert (skip_last - skip_first) % self.config.beacon_window == 0, f"skip_last ({skip_last}) - skip_first ({skip_first}) = {skip_last - skip_first} is not divisible by window size {self.config.beacon_window}"
+            assert self.config.beacon_sink_size == 0, "Make sure the beacon_sink_size is zero when using skip_last!"
+        self.beacon_skip_first = skip_first
+        self.beacon_skip_last = skip_last
     def set_compression_ratio(self, start_idx, end_idx):
         """Choose a condensing ratio from self.config.beacon_ratio"""
         # In the last window, we do not need to append beacons because they will not be used at all
         if self.training and end_idx == self.all_sequence_length:
             next_start_idx = start_idx
+            is_full_window = False
+            raw_size_to_cache = -1
+            beacon_size = 0
+            compression_ratio = -1
+        elif self._step_idx == 0 and self.beacon_skip_first is not None:
+            end_idx = start_idx + self.beacon_skip_first
+            assert end_idx < self.all_sequence_length
+            next_start_idx = end_idx
+            is_full_window = True
             raw_size_to_cache = -1
             beacon_size = 0
+            compression_ratio = -1
+        elif self.beacon_skip_last is not None and start_idx >= self.beacon_skip_last:
+            end_idx = min(start_idx + self.config.beacon_window, self.all_sequence_length)
+            next_start_idx = end_idx
             is_full_window = False
+            raw_size_to_cache = -1
+            beacon_size = 0
+            compression_ratio = -1
         else:
             #============================================#
                 # update the remainder
                 self._interleave_remainder = (input_len + self._interleave_remainder) % compression_ratio
+        # NOTE: skip computing loss in the very first window because the beacon tokens will be used in the next window
+        if self.training and self._step_idx == 0 and not (self.config.beacon_pos == 'interleave' and self.config.beacon_attn == 'full-coverage'):
+            labels[:] = -100
         # t2 = time.time()
         self._end_idx = end_idx
         self._step_idx += 1
+        # print(f"start_idx:          {start_idx}")
+        # print(f"next_start_idx:     {next_start_idx}")
         # print(f"beacon_size:        {beacon_size}")
         # print(f"raw_size_to_cache:  {raw_size_to_cache}")
+        # print(f"interleave_remainder:{self._interleave_remainder}")
         # print(f"input_ids:          {input_ids}")
         # print(f"beacon_indices:     {beacon_indices}")
         # print(f"position_ids:       {position_ids}")
+        # print(f"attention_mask:\n{attention_mask == 0}")
         # x = input()
         # if x == "s":
         #     return
             # NOTE: the past_key_values are incrementally returned (only the new keys and values are returned)
             previous_raw_key, previous_raw_value = self.raw_activations[layer_idx]
+            if self.beacon_skip_first is not None and self.sink_activations[layer_idx][0] is None:
+                assert key.shape[self.k_seq_dim] == self.beacon_skip_first
+                assert value.shape[self.k_seq_dim] == self.beacon_skip_first
+                self.sink_activations[layer_idx] = [
+                    key,
+                    value,
+                ]
+                # NOTE: no need to update raw activations and beacon activations as all activations are kept as sink activations
+                continue
             if self.beacon_activations[layer_idx][0] is None and self.config.beacon_sink_size > 0:
                 # save the sink activations
                 # NOTE: we do not slice the key/value activations, which may cause duplication when beacon_ratio=-1 for the first window, but it's okay
             # NOTE: we must use dict to override values, otherwise trainer cannot find loss
             model_outputs["loss"] = loss
             model_outputs["batch_loss"] = batch_loss
         # override last_hidden_states (used in generation)
         beacon_size = self._all_beacon_sizes[-1]

modeling_qwen2.py CHANGED Viewed

@@ -30,8 +30,7 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from transformers.activations import ACT2FN
-from transformers.cache_utils import Cache, DynamicCache
-from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import (
@@ -53,7 +52,7 @@ if is_flash_attn_2_available():
 from .configuration_qwen2 import Qwen2Config
 from .modeling_beacon import Memory
-from .modeling_utils import optional_grad_ctx, compute_loss, BeaconModelOutput
 logger = logging.get_logger(__name__)
@@ -99,183 +98,6 @@ class Qwen2RMSNorm(nn.Module):
         return self.weight * hidden_states.to(input_dtype)
-# Copied from transformers.models.llama.modeling_llama.rotate_half
-def rotate_half(x):
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2 :]
-    return torch.cat((-x2, x1), dim=-1)
-class Qwen2RotaryEmbedding(nn.Module):
-    def __init__(self, dim, max_position_embeddings=32768, base=10000, device=None):
-        super().__init__()
-        self.dim = dim
-        self.max_position_embeddings = max_position_embeddings
-        self.base = base
-        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-        # Build here to make `torch.jit.trace` work.
-        self._set_cos_sin_cache(
-            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
-        )
-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        self.max_seq_len_cached = seq_len
-        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
-        freqs = torch.outer(t, self.inv_freq)
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
-        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
-    def forward(self, q, k, position_ids):
-        seq_len = max(position_ids.max().item() + 1, k.shape[2])
-        # x: [bs, num_attention_heads, seq_len, head_size]
-        if seq_len > self.max_seq_len_cached:
-            self._set_cos_sin_cache(seq_len=seq_len, device=k.device, dtype=k.dtype)
-        # batch_size, 1, key_len, head_dim
-        k_cos = self.cos_cached[position_ids].to(dtype=k.dtype).unsqueeze(1)
-        k_sin = self.sin_cached[position_ids].to(dtype=k.dtype).unsqueeze(1)
-        q_cos = k_cos[..., -q.shape[2]:, :]
-        q_sin = k_sin[..., -q.shape[2]:, :]
-        q_embed = (q * q_cos) + (rotate_half(q) * q_sin)
-        k_embed = (k * k_cos) + (rotate_half(k) * k_sin)
-        return q_embed, k_embed
-class Qwen2LinearScalingRotaryEmbedding(Qwen2RotaryEmbedding):
-    """Qwen2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
-    def __init__(self, dim, max_position_embeddings=32768, base=10000, device=None, scaling_factor=1.0):
-        self.scaling_factor = scaling_factor
-        super().__init__(dim, max_position_embeddings, base, device)
-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        self.max_seq_len_cached = seq_len
-        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
-        t = t / self.scaling_factor
-        freqs = torch.outer(t, self.inv_freq)
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
-        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
-class Qwen2DynamicNTKScalingRotaryEmbedding(Qwen2RotaryEmbedding):
-    """Qwen2RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
-    def __init__(self, dim, max_position_embeddings=32768, base=10000, device=None, scaling_factor=1.0):
-        self.scaling_factor = scaling_factor
-        super().__init__(dim, max_position_embeddings, base, device)
-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        self.max_seq_len_cached = seq_len
-        if seq_len > self.max_position_embeddings:
-            base = self.base * (
-                (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
-            ) ** (self.dim / (self.dim - 2))
-            inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
-            self.register_buffer("inv_freq", inv_freq, persistent=False)
-        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
-        freqs = torch.outer(t, self.inv_freq)
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
-        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
-class Qwen2YarnRotaryEmbedding(nn.Module):
-    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, beta_slow=2, beta_fast=128):
-        super().__init__()
-        self.base = base
-        self.dim = dim
-        self.scaling_factor = scaling_factor
-        self.beta_slow = beta_slow
-        self.beta_fast = beta_fast
-        self.max_position_embeddings = max_position_embeddings
-        self._set_cos_sin_cache(
-            seq_len=max_position_embeddings, device=device, dtype=torch.get_default_dtype()
-        )
-    def _get_factor(self, device, dtype):
-        # the dimension whose index is smaller than fast_dim rotates more than beta_fast
-        fast_dim = self.dim / 2 * (math.log(self.max_position_embeddings / (2 * math.pi * self.beta_fast)) / math.log(self.base))
-        fast_dim = max(math.floor(fast_dim), 0)
-        # the dimension whose index is bigger than slow_dim rotates less than beta_slow
-        slow_dim = self.dim / 2 * (math.log(self.max_position_embeddings / (2 * math.pi * self.beta_slow)) / math.log(self.base))
-        slow_dim = min(math.ceil(slow_dim), self.dim - 1)
-        if fast_dim == slow_dim:
-            slow_dim += 0.001
-        # NOTE: very important to use full precision here so that the factor is correct
-        dim_arange = torch.arange(0, self.dim // 2, device=device, dtype=torch.float32)
-        dim_factor = (dim_arange - fast_dim) / (slow_dim - fast_dim)
-        dim_factor = torch.clamp(dim_factor, 0, 1)
-        # align with the paper notation
-        return (1 - dim_factor)
-    def _get_temperature(self):
-        if self.scaling_factor <= 1:
-            return 1.0
-        return 0.07 * math.log(self.scaling_factor) + 1.0
-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        dim_arange = torch.arange(0, self.dim, 2, device=device) / self.dim
-        # dim / 2
-        freq = self.base ** dim_arange
-        theta = 1 / freq
-        interleave_theta = theta / self.scaling_factor
-        factor = self._get_factor(device, dtype)
-        yarn_theta = factor * theta + (1 - factor) * interleave_theta
-        self.register_buffer("inv_freq", yarn_theta, persistent=False)
-        t = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
-        freqs = torch.outer(t, self.inv_freq)
-        emb = torch.cat((freqs, freqs), dim=-1)
-        # get attention temperature
-        temperature = self._get_temperature()
-        self.register_buffer("cos_cached", (emb.cos() * temperature).to(dtype), persistent=False)
-        self.register_buffer("sin_cached", (emb.sin() * temperature).to(dtype), persistent=False)
-        self.max_seq_len_cached = seq_len
-    def forward(self, q, k, position_ids):
-        seq_len = max(position_ids.max().item() + 1, k.shape[2])
-        # x: [bs, num_attention_heads, seq_len, head_size]
-        if seq_len > self.max_seq_len_cached:
-            self.scaling_factor = seq_len / self.max_position_embeddings
-            self._set_cos_sin_cache(seq_len=seq_len, device=k.device, dtype=k.dtype)
-        k_cos = self.cos_cached[position_ids].to(dtype=k.dtype).unsqueeze(1)
-        k_sin = self.sin_cached[position_ids].to(dtype=k.dtype).unsqueeze(1)
-        q_cos = k_cos[..., -q.shape[2]:, :]
-        q_sin = k_sin[..., -q.shape[2]:, :]
-        q_embed = (q * q_cos) + (rotate_half(q) * q_sin)
-        k_embed = (k * k_cos) + (rotate_half(k) * k_sin)
-        return q_embed, k_embed
 # Copied from transformers.models.mistral.modeling_mistral.Qwen2MLP with Qwen2->Qwen2
 class Qwen2MLP(nn.Module):
     def __init__(self, config):
@@ -288,54 +110,8 @@ class Qwen2MLP(nn.Module):
         self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
         self.act_fn = ACT2FN[config.hidden_act]
-        if "mlp" in config.beacon_param:
-            self.beacon_up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
-            self.beacon_up_proj.weight.data.zero_()
-            self.beacon_up_proj._is_hf_initialized = True
-            self.beacon_down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
-            self.beacon_down_proj.weight.data.zero_()
-            self.beacon_down_proj._is_hf_initialized = True
-    def _init_beacon_proj(self, missing_keys):
-        """Initialize the beacon projection weight with that of the ordinal projection."""
-        if "mlp" in self.config.beacon_param:
-            if is_deepspeed_zero3_enabled():
-                # FIXME: after deepspeed initialization, some weights becomes non-zero
-                # For Mistral, there are rows that are full of zeros
-                # For Mistral, there are values bigger than 1e29...
-                import deepspeed
-                params = [self.up_proj.weight, self.down_proj.weight, self.beacon_up_proj.weight, self.beacon_down_proj.weight]
-                with deepspeed.zero.GatheredParameters(params, modifier_rank=0):
-                    if (self.beacon_up_proj.weight.sum(-1) == 0).any() or (self.beacon_up_proj.weight > 1e29).any():
-                        self.beacon_up_proj.weight.data[:] = self.up_proj.weight.data
-                        self.beacon_down_proj.weight.data[:] = self.down_proj.weight.data
-            else:
-                if any("beacon_up_proj" in missing_key for missing_key in missing_keys):
-                    # only copy the value in-place, without tieing the weight
-                    self.beacon_up_proj.weight.data[:] = self.up_proj.weight.data
-                    self.beacon_down_proj.weight.data[:] = self.down_proj.weight.data
-    def forward(self, x, beacon_size, beacon_indices):
-        if "mlp" in self.config.beacon_param:
-            # NOTE: when beacon_pos == "interleave", the beacon_indices points to all beacon tokens in the current window (cached activations + input_ids), so we shall slice out the part corresponding to the input_ids
-            if beacon_size > 0:
-                cur_beacon_indices = beacon_indices[-x.shape[1]:]
-                ordinal_hidden_states = x[:, cur_beacon_indices == 0]
-                beacon_hidden_states = x[:, cur_beacon_indices == 1]
-                ordinal_down_proj = self.down_proj(self.act_fn(self.gate_proj(ordinal_hidden_states)) * self.up_proj(ordinal_hidden_states))
-                beacon_down_proj = self.beacon_down_proj(self.act_fn(self.gate_proj(beacon_hidden_states)) * self.beacon_up_proj(beacon_hidden_states))
-                down_proj = beacon_down_proj.new_ones(x.shape)
-                down_proj[:, beacon_indices == 0] = ordinal_down_proj
-                down_proj[:, beacon_indices == 1] = beacon_down_proj
-            else:
-                down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-        else:
-            down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
         return down_proj
@@ -386,7 +162,7 @@ class Qwen2Attention(nn.Module):
         self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
-        self._init_rope()
         # NOTE: add extra parameters for beacon tokens
         # skip post initialization to speed up loading
@@ -408,54 +184,6 @@ class Qwen2Attention(nn.Module):
             self.beacon_o_proj.weight.data.zero_()
             self.beacon_o_proj._is_hf_initialized = True
-    def _init_rope(self):
-        if self.config.rope_scaling is None:
-            self.rotary_emb = Qwen2RotaryEmbedding(
-                self.head_dim,
-                max_position_embeddings=self.max_position_embeddings,
-                base=self.rope_theta,
-            )
-        else:
-            scaling_type = self.config.rope_scaling["type"]
-            scaling_factor = self.config.rope_scaling["factor"]
-            if scaling_type == "linear":
-                self.rotary_emb = Qwen2LinearScalingRotaryEmbedding(
-                    self.head_dim,
-                    max_position_embeddings=self.max_position_embeddings,
-                    scaling_factor=scaling_factor,
-                    base=self.rope_theta,
-                )
-            elif scaling_type == "dynamic":
-                self.rotary_emb = Qwen2DynamicNTKScalingRotaryEmbedding(
-                    self.head_dim,
-                    max_position_embeddings=self.max_position_embeddings,
-                    scaling_factor=scaling_factor,
-                    base=self.rope_theta,
-                )
-            elif scaling_type == "yarn":
-                self.rotary_emb = Qwen2YarnRotaryEmbedding(
-                    self.head_dim,
-                    max_position_embeddings=self.max_position_embeddings,
-                    scaling_factor=scaling_factor,
-                    base=self.rope_theta,
-                )
-            elif scaling_type == "yarn-t":
-                self.rotary_emb = Qwen2YarnDynamicTemperatureRotaryEmbedding(
-                    self.head_dim,
-                    max_position_embeddings=self.max_position_embeddings,
-                    scaling_factor=scaling_factor,
-                    base=self.rope_theta,
-                )
-            elif scaling_type == "yarn-t-logn":
-                self.rotary_emb = Qwen2YarnDynamicTemperatureLogNRotaryEmbedding(
-                    self.head_dim,
-                    max_position_embeddings=self.max_position_embeddings,
-                    scaling_factor=scaling_factor,
-                    base=self.rope_theta,
-                )
-            else:
-                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
     def _init_beacon_proj(self, missing_keys):
         """Initialize the beacon projection weight with that of the ordinal projection."""
         beacon_param = self.config.beacon_param
@@ -538,44 +266,37 @@ class Qwen2Attention(nn.Module):
             # NOTE: when beacon_pos == "interleave", the beacon_indices points to all beacon tokens in the current window (cached activations + input_ids), so we shall slice out the part corresponding to the input_ids
             cur_beacon_indices = beacon_indices[-hidden_states.shape[1]:]
-            ordinal_hidden_states = hidden_states[:, cur_beacon_indices == 0]
-            beacon_hidden_states = hidden_states[:, cur_beacon_indices == 1]
             if "q" in self.config.beacon_param:
-                ordinal_query_states = self.q_proj(ordinal_hidden_states)
-                beacon_query_states = self.beacon_q_proj(beacon_hidden_states)
-                query_states = beacon_query_states.new_zeros((ordinal_query_states.shape[0], cur_beacon_indices.shape[0], ordinal_query_states.shape[2]))
-                query_states[:, cur_beacon_indices == 0] = ordinal_query_states
-                query_states[:, cur_beacon_indices == 1] = beacon_query_states
-                # NOTE: replicate hidden states for beacon tokens in case of parallel windows
                 if (cur_beacon_indices == 2).any():
-                    query_states[:, cur_beacon_indices == 2] = beacon_query_states[:, :(cur_beacon_indices == 2).sum()]
             else:
                 query_states = self.q_proj(hidden_states)
             if "k" in self.config.beacon_param:
-                ordinal_key_states = self.k_proj(ordinal_hidden_states)
-                beacon_key_states = self.beacon_k_proj(beacon_hidden_states)
-                key_states = beacon_key_states.new_zeros((ordinal_key_states.shape[0], cur_beacon_indices.shape[0], ordinal_key_states.shape[2]))
-                key_states[:, cur_beacon_indices == 0] = ordinal_key_states
-                key_states[:, cur_beacon_indices == 1] = beacon_key_states
-                # NOTE: replicate hidden states for beacon tokens in case of parallel windows
                 if (cur_beacon_indices == 2).any():
-                    key_states[:, cur_beacon_indices == 2] = beacon_key_states[:, :(cur_beacon_indices == 2).sum()]
             else:
                 key_states = self.k_proj(hidden_states)
             if "v" in self.config.beacon_param:
-                ordinal_value_states = self.v_proj(ordinal_hidden_states)
-                beacon_value_states = self.beacon_v_proj(beacon_hidden_states)
-                value_states = beacon_value_states.new_zeros((ordinal_value_states.shape[0], cur_beacon_indices.shape[0], ordinal_value_states.shape[2]))
-                value_states[:, cur_beacon_indices == 0] = ordinal_value_states
-                value_states[:, cur_beacon_indices == 1] = beacon_value_states
-                # NOTE: replicate hidden states for beacon tokens in case of parallel windows
                 if (cur_beacon_indices == 2).any():
-                    value_states[:, cur_beacon_indices == 2] = beacon_value_states[:, :(cur_beacon_indices == 2).sum()]
             else:
                 value_states = self.v_proj(hidden_states)
@@ -592,14 +313,9 @@ class Qwen2Attention(nn.Module):
             cur_beacon_indices = beacon_indices[-attn_output.shape[1]:]
             if "o" in self.config.beacon_param:
-                ordinal_attn_output = self.o_proj(attn_output[:, cur_beacon_indices == 0])
-                beacon_attn_output = self.beacon_o_proj(attn_output[:, cur_beacon_indices == 1])
-                attn_output = beacon_attn_output.new_zeros(attn_output.shape)
-                attn_output[:, cur_beacon_indices == 0] = ordinal_attn_output
-                attn_output[:, cur_beacon_indices == 1] = beacon_attn_output
-                # NOTE: replicate hidden states for beacon tokens in case of parallel windows
-                # if (cur_beacon_indices == 2).any():
-                #     attn_output[:, cur_beacon_indices == 2] = beacon_attn_output[:, :(cur_beacon_indices == 2).sum()]
             else:
                 attn_output = self.o_proj(attn_output)
         else:
@@ -1036,10 +752,6 @@ class Qwen2DecoderLayer(nn.Module):
                 (see `past_key_values`).
             past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
         """
-        # NOTE: get beacon_size in case the mlp is included in beacon_param
-        past_key, past_value, beacon_size, beacon_indices = past_key_value
         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)
@@ -1058,7 +770,7 @@ class Qwen2DecoderLayer(nn.Module):
         # Fully Connected
         residual = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)
-        hidden_states = self.mlp(hidden_states, beacon_size, beacon_indices)
         hidden_states = residual + hidden_states
         outputs = (hidden_states,)
@@ -1426,7 +1138,6 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
         # initialize weights of possible q,k,v,o,mlp
         for layer in model.model.layers:
             layer.self_attn._init_beacon_proj(missing_keys)
-            layer.mlp._init_beacon_proj(missing_keys)
         return model
@@ -1438,12 +1149,11 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
         past_key_values: Optional[List[torch.FloatTensor]] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         labels: Optional[torch.LongTensor] = None,
-        shift_labels: Optional[bool] = True,
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BeaconModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1474,19 +1184,19 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
         loss = None
         batch_loss = None
-        valid_token_num = None
         if labels is not None:
-            loss, batch_loss, valid_token_num = compute_loss(logits, labels, shift=shift_labels)
         if not return_dict:
             output = (logits,) + outputs[1:]
             return (loss,) + output if loss is not None else output
-        return BeaconModelOutput(
             loss=loss,
             batch_loss=batch_loss,
-            valid_token_num=valid_token_num,
             logits=logits,
             past_key_values=outputs.past_key_values,
             hidden_states=outputs.hidden_states,
@@ -1504,6 +1214,8 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ):
         # t1 = time.time()
@@ -1511,12 +1223,13 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
         self.memory.prepare(
             input_ids=input_ids,
             attention_mask=attention_mask,
-            labels=labels
         )
         # t2 = time.time()
-        # after the first window, one token at a time
         while not self.memory.finish:
             # t3 = time.time()
@@ -1536,8 +1249,6 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
                 output_hidden_states=output_hidden_states,
                 return_dict=return_dict,
                 labels=labels,
-                # NOTE: the labels have been shifted so that all tokens in the window have the proper loss
-                shift_labels=False,
             )
             # t5 = time.time()
@@ -1549,7 +1260,7 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
             if labels is not None:
                 # update loss
-                self.memory.update_loss(outputs.batch_loss, outputs.valid_token_num)
             # t7 = time.time()
@@ -1567,7 +1278,7 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
         # input()
         return outputs
     def forward(self, **kwargs):
         """Forward computation over a batch of sequences.
         """

 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import (
 from .configuration_qwen2 import Qwen2Config
 from .modeling_beacon import Memory
+from .modeling_utils import optional_grad_ctx, compute_loss, get_rope, ModelOutput
 logger = logging.get_logger(__name__)
         return self.weight * hidden_states.to(input_dtype)
 # Copied from transformers.models.mistral.modeling_mistral.Qwen2MLP with Qwen2->Qwen2
 class Qwen2MLP(nn.Module):
     def __init__(self, config):
         self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
         self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
         return down_proj
         self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+        self.rotary_emb = get_rope(self.head_dim, config.rope_theta, config.max_position_embeddings, getattr(config, "rope_scaling", None))
         # NOTE: add extra parameters for beacon tokens
         # skip post initialization to speed up loading
             self.beacon_o_proj.weight.data.zero_()
             self.beacon_o_proj._is_hf_initialized = True
     def _init_beacon_proj(self, missing_keys):
         """Initialize the beacon projection weight with that of the ordinal projection."""
         beacon_param = self.config.beacon_param
             # NOTE: when beacon_pos == "interleave", the beacon_indices points to all beacon tokens in the current window (cached activations + input_ids), so we shall slice out the part corresponding to the input_ids
             cur_beacon_indices = beacon_indices[-hidden_states.shape[1]:]
+            # NOTE: there is slight redundant computation because ordinal tokens should never be projected by beacon matrices, but we are doing this for efficiency
             if "q" in self.config.beacon_param:
+                ordinal_query_states = self.q_proj(hidden_states)
+                beacon_query_states = self.beacon_q_proj(hidden_states)
+                query_states = torch.where((cur_beacon_indices == 0)[:, None], ordinal_query_states, beacon_query_states)
                 if (cur_beacon_indices == 2).any():
+                    # beacon_indices == 2 means the beacon token is used to replicate the ones in previous window for parallel encoding
+                    # we should slice out all beacon tokens then copy them to the replicate beacon tokens
+                    query_states[:, cur_beacon_indices == 2] = beacon_query_states[:, cur_beacon_indices == 1][:, :(cur_beacon_indices == 2).sum()]
             else:
                 query_states = self.q_proj(hidden_states)
             if "k" in self.config.beacon_param:
+                ordinal_key_states = self.k_proj(hidden_states)
+                beacon_key_states = self.beacon_k_proj(hidden_states)
+                key_states = torch.where((cur_beacon_indices == 0)[:, None], ordinal_key_states, beacon_key_states)
                 if (cur_beacon_indices == 2).any():
+                    # beacon_indices == 2 means the beacon token is used to replicate the ones in previous window for parallel encoding
+                    # we should slice out all beacon tokens then copy them to the replicate beacon tokens
+                    key_states[:, cur_beacon_indices == 2] = beacon_key_states[:, cur_beacon_indices == 1][:, :(cur_beacon_indices == 2).sum()]
             else:
                 key_states = self.k_proj(hidden_states)
             if "v" in self.config.beacon_param:
+                ordinal_value_states = self.v_proj(hidden_states)
+                beacon_value_states = self.beacon_v_proj(hidden_states)
+                value_states = torch.where((cur_beacon_indices == 0)[:, None], ordinal_value_states, beacon_value_states)
                 if (cur_beacon_indices == 2).any():
+                    # beacon_indices == 2 means the beacon token is used to replicate the ones in previous window for parallel encoding
+                    # we should slice out all beacon tokens then copy them to the replicate beacon tokens
+                    value_states[:, cur_beacon_indices == 2] = beacon_value_states[:, cur_beacon_indices == 1][:, :(cur_beacon_indices == 2).sum()]
             else:
                 value_states = self.v_proj(hidden_states)
             cur_beacon_indices = beacon_indices[-attn_output.shape[1]:]
             if "o" in self.config.beacon_param:
+                ordinal_attn_output = self.o_proj(attn_output)
+                beacon_attn_output = self.beacon_o_proj(attn_output)
+                attn_output = torch.where((cur_beacon_indices == 0)[:, None], ordinal_attn_output, beacon_attn_output)
             else:
                 attn_output = self.o_proj(attn_output)
         else:
                 (see `past_key_values`).
             past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
         """
         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)
         # Fully Connected
         residual = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
         hidden_states = residual + hidden_states
         outputs = (hidden_states,)
         # initialize weights of possible q,k,v,o,mlp
         for layer in model.model.layers:
             layer.self_attn._init_beacon_proj(missing_keys)
         return model
         past_key_values: Optional[List[torch.FloatTensor]] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         labels: Optional[torch.LongTensor] = None,
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, ModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         loss = None
         batch_loss = None
+        token_loss = None
         if labels is not None:
+            loss, batch_loss, token_loss = compute_loss(logits, labels, shift=False)
         if not return_dict:
             output = (logits,) + outputs[1:]
             return (loss,) + output if loss is not None else output
+        return ModelOutput(
             loss=loss,
             batch_loss=batch_loss,
+            token_loss=token_loss,
             logits=logits,
             past_key_values=outputs.past_key_values,
             hidden_states=outputs.hidden_states,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        beacon_skip_first: Optional[int] = None,
+        beacon_skip_last: Optional[int] = None,
     ):
         # t1 = time.time()
         self.memory.prepare(
             input_ids=input_ids,
             attention_mask=attention_mask,
+            labels=labels,
+            skip_first=beacon_skip_first,
+            skip_last=beacon_skip_last,
         )
         # t2 = time.time()
         while not self.memory.finish:
             # t3 = time.time()
                 output_hidden_states=output_hidden_states,
                 return_dict=return_dict,
                 labels=labels,
             )
             # t5 = time.time()
             if labels is not None:
                 # update loss
+                self.memory.update_loss(outputs.batch_loss, (labels != -100).sum(-1))
             # t7 = time.time()
         # input()
         return outputs
     def forward(self, **kwargs):
         """Forward computation over a batch of sequences.
         """

modeling_utils.py CHANGED Viewed

@@ -29,14 +29,28 @@ def move_to_device(data, device):
     else:
         return data
 def compute_loss(logits, labels, shift=False):
     """
     Returns:
         token_loss: batch_size, seq_length
     """
     if shift:
-        logits = logits[:, :-1, :].contiguous()
-        labels = labels[:, 1:].contiguous()
     labels = labels.to(logits.device)
     batch_size = logits.shape[0]
@@ -63,7 +77,7 @@ def compute_loss(logits, labels, shift=False):
     if (valid_token_num == 0).any():
         batch_loss = batch_loss.masked_fill(valid_token_num == 0, 0.)
-    return loss, batch_loss, valid_token_num
 @torch.no_grad()
@@ -89,14 +103,15 @@ def evaluate_perplexity(model, dataloader, accelerator:Optional[Accelerator]=Non
         output = model(**x)
         # NOTE: we need the loss for each element in the batch for accurate computation, because the number of valid tokens may differ among elements
         if hasattr(output, "batch_loss"):
             # output from our model has batch_loss by default
             batch_loss = output.batch_loss
-            valid_token_num = output.valid_token_num
         else:
             # output from other models does not
-            loss, batch_loss, valid_token_num = compute_loss(output.logits, x["labels"], shift=True)
         index = index.tolist()
         batch_loss = batch_loss.tolist()
@@ -194,14 +209,15 @@ def evaluate_nll(model, dataloader, accelerator:Optional[Accelerator]=None):
         output = model(**x)
         # NOTE: we need the loss for each element in the batch for accurate computation, because the number of valid tokens may differ among elements
         if hasattr(output, "batch_loss"):
             # output from our model has batch_loss by default
             batch_loss = output.batch_loss
-            valid_token_num = output.valid_token_num
         else:
             # output from other models does not
-            loss, batch_loss, valid_token_num = compute_loss(output.logits, x["labels"], shift=True)
         if accelerator is not None and accelerator.num_processes > 1:
             # num_device * batch_size
@@ -216,13 +232,480 @@ def evaluate_nll(model, dataloader, accelerator:Optional[Accelerator]=None):
     return all_loss
 @dataclass
-class BeaconModelOutput(BaseModelOutputWithPast):
     loss: Optional[torch.FloatTensor] = None
     batch_loss: Optional[torch.FloatTensor] = None
-    valid_token_num: Optional[torch.LongTensor] = None
     logits: torch.FloatTensor = None
     past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None

     else:
         return data
def get_shifted_labels(input_ids):
    """Build next-token labels by shifting ``input_ids`` one step to the left.

    Position ``i`` of the result holds the token found at position ``i + 1``
    of the input, and the last position of every sequence is filled with
    ``-100`` (the value this file treats as "not a valid label" when it counts
    valid tokens). The input object is never modified.

    Args:
        input_ids: One of
            * a ``torch.Tensor`` indexed as ``[batch, position]``,
            * a flat ``list`` of ``int`` (a single sequence),
            * a ``list`` of ``list`` (a batch of sequences, possibly ragged).

    Returns:
        A new object of the same kind as ``input_ids`` holding the shifted
        labels. An empty list yields an empty list.

    Raises:
        NotImplementedError: If ``input_ids`` is none of the supported kinds.
    """
    if isinstance(input_ids, torch.Tensor):
        # `torch.cat` already allocates a fresh tensor, so no explicit clone is needed.
        tail = input_ids.new_full((input_ids.shape[0], 1), -100)
        return torch.cat([input_ids[:, 1:], tail], dim=-1)

    if isinstance(input_ids, list):
        if not input_ids:
            # Nothing to shift; probing input_ids[0] would raise IndexError here.
            return []
        if isinstance(input_ids[0], int):
            return input_ids[1:] + [-100]
        if isinstance(input_ids[0], list):
            # Build new rows so the caller's nested lists are left untouched.
            return [row[1:] + [-100] for row in input_ids]

    raise NotImplementedError
 def compute_loss(logits, labels, shift=False):
     """
     Returns:
         token_loss: batch_size, seq_length
     """
     if shift:
+        labels = get_shifted_labels(labels)
     labels = labels.to(logits.device)
     batch_size = logits.shape[0]
     if (valid_token_num == 0).any():
         batch_loss = batch_loss.masked_fill(valid_token_num == 0, 0.)
+    return loss, batch_loss, token_loss
 @torch.no_grad()
         output = model(**x)
+        valid_token_num = (x["labels"] != -100).sum(-1)
         # NOTE: we need the loss for each element in the batch for accurate computation, because the number of valid tokens may differ among elements
         if hasattr(output, "batch_loss"):
             # output from our model has batch_loss by default
             batch_loss = output.batch_loss
         else:
             # output from other models does not
+            loss, batch_loss, token_loss = compute_loss(output.logits, x["labels"], shift=True)
         index = index.tolist()
         batch_loss = batch_loss.tolist()
         output = model(**x)
+        valid_token_num = (x["labels"] != -100).sum()
         # NOTE: we need the loss for each element in the batch for accurate computation, because the number of valid tokens may differ among elements
         if hasattr(output, "batch_loss"):
             # output from our model has batch_loss by default
             batch_loss = output.batch_loss
         else:
             # output from other models does not
+            loss, batch_loss, token_loss = compute_loss(output.logits, x["labels"], shift=True)
         if accelerator is not None and accelerator.num_processes > 1:
             # num_device * batch_size
     return all_loss
 @dataclass
+class ModelOutput(BaseModelOutputWithPast):
     # Output container returned by the causal-LM forward pass in this repo.
     # It extends the base output with the three loss values produced by
     # `compute_loss` (loss, batch_loss, token_loss) plus the logits.

     # Overall loss as returned first by `compute_loss`; None when no labels were given.
     loss: Optional[torch.FloatTensor] = None
     # Loss kept separately per batch element — presumably shape (batch_size,); TODO confirm.
     batch_loss: Optional[torch.FloatTensor] = None
     # Per-token loss; `compute_loss` documents it as (batch_size, seq_length).
+    token_loss: Optional[torch.FloatTensor] = None
     # Language-model logits for the processed tokens.
     logits: torch.FloatTensor = None
     # Cached key/value states forwarded from the inner model's output.
     past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
     # Hidden states forwarded from the inner model's output.
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     # Attention weights forwarded from the inner model's output.
     attentions: Optional[Tuple[torch.FloatTensor]] = None
+########## Various RoPE Scaling Methods Below (wrap the encoding process within the module for convenience) ##########
def get_rope(head_dim, base, max_position_embeddings, rope_scaling=None):
    """Build the rotary position embedding module selected by ``rope_scaling``.

    Supported variants: native (no scaling), ``"linear"``, ``"dynamic"`` (NTK),
    ``"yarn"``, ``"yarn-t"``, ``"yarn-t-logn"`` and ``"llama3"``.

    Args:
        head_dim: Size of one attention head; passed to the module as ``dim``.
        base: RoPE base; passed to the module as ``base``.
        max_position_embeddings: Passed through to the module unchanged.
        rope_scaling: ``None`` for the native embedding, otherwise a mapping
            with a ``"factor"`` entry and the variant name under ``"type"``
            (or ``"rope_type"``, the alternative key some configs use).
            For ``"llama3"`` the optional entries
            ``"original_max_position_embeddings"`` (default 8192),
            ``"low_freq_factor"`` (default 1) and ``"high_freq_factor"``
            (default 4) are also read.

    Returns:
        The constructed rotary embedding module.

    Raises:
        KeyError: If ``rope_scaling`` names no variant or has no ``"factor"``.
        ValueError: If the variant name is not one of the supported ones.
    """
    # Arguments every variant receives.
    shared_kwargs = {
        "dim": head_dim,
        "base": base,
        "max_position_embeddings": max_position_embeddings,
    }
    if rope_scaling is None:
        return RotaryEmbedding(**shared_kwargs)

    # Accept both spellings of the variant key; "type" wins when both exist.
    scaling_type = rope_scaling.get("type") or rope_scaling.get("rope_type")
    if scaling_type is None:
        raise KeyError("rope_scaling must define 'type' (or 'rope_type')")
    scaling_factor = rope_scaling["factor"]

    # Class names are resolved lazily, branch by branch, so an unknown type is
    # reported as such instead of failing on an unrelated lookup.
    if scaling_type == "linear":
        rope_cls = LinearScalingRotaryEmbedding
    elif scaling_type == "dynamic":
        rope_cls = DynamicNTKScalingRotaryEmbedding
    elif scaling_type == "yarn":
        rope_cls = YarnRotaryEmbedding
    elif scaling_type == "yarn-t":
        rope_cls = YarnDynamicTemperatureRotaryEmbedding
    elif scaling_type == "yarn-t-logn":
        rope_cls = YarnDynamicTemperatureLogNRotaryEmbedding
    elif scaling_type == "llama3":
        # llama3 is the only variant with extra, optional hyper-parameters.
        return Llama3RotaryEmbedding(
            **shared_kwargs,
            scaling_factor=scaling_factor,
            original_max_position_embeddings=rope_scaling.get("original_max_position_embeddings", 8192),
            low_freq_factor=rope_scaling.get("low_freq_factor", 1),
            high_freq_factor=rope_scaling.get("high_freq_factor", 4),
        )
    else:
        raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    return rope_cls(**shared_kwargs, scaling_factor=scaling_factor)
def rotate_half(x):
    """Swap the two halves of the last dimension, negating the second half.

    For an input whose last dimension is ``[a, b]`` (two halves) the result is
    ``[-b, a]``. With an odd last dimension the first half is the shorter one.
    """
    split_at = x.shape[-1] // 2
    front = x[..., :split_at]
    back = x[..., split_at:]
    return torch.cat([back.neg(), front], dim=-1)
class RotaryEmbedding(torch.nn.Module):
    """Native rotary position embedding with a cached cos/sin lookup table.

    The table is built once for ``max_position_embeddings`` positions and is
    rebuilt for a longer range whenever ``forward`` sees a position beyond it.
    """

    def __init__(self, dim, max_position_embeddings=32768, base=10000, device=None):
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base

        # One inverse frequency per pair of channels.
        exponents = torch.arange(0, self.dim, 2, dtype=torch.float32).to(device) / self.dim
        self.register_buffer("inv_freq", 1.0 / (self.base ** exponents), persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings,
            device=self.inv_freq.device,
            dtype=torch.get_default_dtype(),
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the cos/sin tables for positions ``0 .. seq_len - 1``."""
        self.max_seq_len_cached = seq_len
        positions = torch.arange(seq_len, device=device, dtype=torch.float32)
        angles = torch.outer(positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        table = torch.cat((angles, angles), dim=-1)
        for buffer_name, values in (("cos_cached", table.cos()), ("sin_cached", table.sin())):
            self.register_buffer(buffer_name, values, persistent=False)

    def forward(self, q, k, position_ids):
        """Rotate ``q`` and ``k`` according to ``position_ids``.

        ``q`` and ``k`` are laid out as [bs, num_attention_heads, seq_len, head_size].
        ``position_ids`` indexes the cached tables for every key position; the
        queries reuse the entries of the trailing ``q.shape[2]`` positions.

        Returns:
            Tuple ``(q_embed, k_embed)`` with the rotated tensors.
        """
        required_len = max(position_ids.max().item() + 1, k.shape[2])
        if required_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=required_len, device=k.device, dtype=k.dtype)

        # batch_size, 1, key_len, head_dim
        k_cos = self.cos_cached[position_ids].to(dtype=k.dtype).unsqueeze(1)
        k_sin = self.sin_cached[position_ids].to(dtype=k.dtype).unsqueeze(1)

        num_queries = q.shape[2]
        q_cos = k_cos[..., -num_queries:, :]
        q_sin = k_sin[..., -num_queries:, :]

        q_embed = (q * q_cos) + (rotate_half(q) * q_sin)
        k_embed = (k * k_cos) + (rotate_half(k) * k_sin)
        return q_embed, k_embed
class LinearScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(self, dim, max_position_embeddings=32768, base=10000, device=None, scaling_factor=1.0):
        # Must be set before the parent constructor runs, because the parent
        # builds the cache through the overridden `_set_cos_sin_cache`.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Rebuild the tables with every position divided by ``scaling_factor``."""
        self.max_seq_len_cached = seq_len
        positions = torch.arange(seq_len, device=device, dtype=torch.float32)
        positions = positions / self.scaling_factor

        angles = torch.outer(positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        table = torch.cat((angles, angles), dim=-1)
        for buffer_name, values in (("cos_cached", table.cos()), ("sin_cached", table.sin())):
            self.register_buffer(buffer_name, values.to(dtype), persistent=False)
class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(self, dim, max_position_embeddings=32768, base=10000, device=None, scaling_factor=1.0):
        # Must be set before the parent constructor runs, because the parent
        # builds the cache through the overridden `_set_cos_sin_cache`.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Rebuild the tables, enlarging the base once ``seq_len`` exceeds the configured maximum."""
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            # The base grows with the requested length; inv_freq is recomputed from it.
            stretch = (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
            scaled_base = self.base * stretch ** (self.dim / (self.dim - 2))
            exponents = torch.arange(0, self.dim, 2, dtype=torch.float32).to(device) / self.dim
            self.register_buffer("inv_freq", 1.0 / (scaled_base ** exponents), persistent=False)

        positions = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
        angles = torch.outer(positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        table = torch.cat((angles, angles), dim=-1)
        for buffer_name, values in (("cos_cached", table.cos()), ("sin_cached", table.sin())):
            self.register_buffer(buffer_name, values.to(dtype), persistent=False)
class YarnRotaryEmbedding(torch.nn.Module):
    """Rotary embedding with YaRN-style frequency blending and a fixed attention temperature.

    Every frequency is a blend of the native one and the one divided by
    ``scaling_factor``; the blend weight comes from `_get_factor`. The cached
    cos/sin tables are multiplied by the scalar from `_get_temperature`.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, beta_slow=2, beta_fast=128):
        super().__init__()

        self.base = base
        self.dim = dim
        self.scaling_factor = scaling_factor
        self.beta_slow = beta_slow
        self.beta_fast = beta_fast
        self.max_position_embeddings = max_position_embeddings

        initial_len = math.ceil(max_position_embeddings * scaling_factor)
        self._set_cos_sin_cache(seq_len=initial_len, device=device, dtype=torch.get_default_dtype())

    def _get_factor(self):
        """Return, per frequency, the weight given to the native (unscaled) frequency."""
        half_dim = self.dim / 2
        log_base = math.log(self.base)

        def boundary(beta):
            # Fractional frequency index associated with the rotation threshold `beta`.
            return half_dim * (math.log(self.max_position_embeddings / (2 * math.pi * beta)) / log_base)

        # the dimension whose index is smaller than fast_dim rotates more than beta_fast
        fast_dim = max(math.floor(boundary(self.beta_fast)), 0)
        # the dimension whose index is bigger than slow_dim rotates less than beta_slow
        slow_dim = min(math.ceil(boundary(self.beta_slow)), self.dim - 1)
        if fast_dim == slow_dim:
            # Keep the ramp denominator non-zero.
            slow_dim += 0.001

        # NOTE: very important to use full precision here so that the factor is correct
        indices = torch.arange(0, self.dim // 2, dtype=torch.float32)
        ramp = torch.clamp((indices - fast_dim) / (slow_dim - fast_dim), 0, 1)

        # align with the paper notation
        return (1 - ramp)

    def _get_temperature(self):
        """Return the scalar applied to the cos/sin tables (1.0 when there is no extension)."""
        if self.scaling_factor > 1:
            return 0.07 * math.log(self.scaling_factor) + 1.0
        return 1.0

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build inv_freq and the temperature-scaled cos/sin tables for ``seq_len`` positions."""
        exponents = torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim
        # dim / 2
        theta = 1 / (self.base ** exponents)
        interleave_theta = theta / self.scaling_factor

        weight = self._get_factor().to(device)
        yarn_theta = weight * theta + (1 - weight) * interleave_theta
        self.register_buffer("inv_freq", yarn_theta, persistent=False)

        positions = torch.arange(seq_len, device=device, dtype=torch.float32)
        angles = torch.outer(positions, self.inv_freq)
        table = torch.cat((angles, angles), dim=-1)

        # get attention temperature
        temperature = self._get_temperature()
        self.register_buffer("cos_cached", table.cos() * temperature, persistent=False)
        self.register_buffer("sin_cached", table.sin() * temperature, persistent=False)
        self.max_seq_len_cached = seq_len

    def forward(self, q, k, position_ids):
        """Rotate ``q`` and ``k``; grows the cache (and the scaling factor) on demand.

        ``q`` and ``k`` are laid out as [bs, num_attention_heads, seq_len, head_size].

        Returns:
            Tuple ``(q_embed, k_embed)`` with the rotated tensors.
        """
        required_len = max(position_ids.max().item() + 1, k.shape[2])
        if required_len > self.max_seq_len_cached:
            # The scaling factor follows the longest range seen so far.
            self.scaling_factor = required_len / self.max_position_embeddings
            self._set_cos_sin_cache(seq_len=required_len, device=k.device, dtype=k.dtype)

        k_cos = self.cos_cached[position_ids].to(dtype=k.dtype).unsqueeze(1)
        k_sin = self.sin_cached[position_ids].to(dtype=k.dtype).unsqueeze(1)

        num_queries = q.shape[2]
        q_cos = k_cos[..., -num_queries:, :]
        q_sin = k_sin[..., -num_queries:, :]

        q_embed = (q * q_cos) + (rotate_half(q) * q_sin)
        k_embed = (k * k_cos) + (rotate_half(k) * k_sin)
        return q_embed, k_embed
class YarnDynamicTemperatureRotaryEmbedding(torch.nn.Module):
    """YaRN-style rotary embedding whose attention temperature depends on the query position.

    Frequencies are blended as in `_get_factor`. The cos/sin tables are stored
    unscaled; a per-position temperature table is applied to the query side
    only, and is 1 for the first ``max_position_embeddings`` positions.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, beta_slow=2, beta_fast=128):
        super().__init__()

        self.base = base
        self.dim = dim
        self.scaling_factor = scaling_factor
        self.beta_slow = beta_slow
        self.beta_fast = beta_fast
        self.max_position_embeddings = max_position_embeddings

        initial_len = math.ceil(max_position_embeddings * scaling_factor)
        self._set_cos_sin_cache(seq_len=initial_len, device=device, dtype=torch.get_default_dtype())

    def _get_factor(self):
        """Return, per frequency, the weight given to the native (unscaled) frequency."""
        half_dim = self.dim / 2
        log_base = math.log(self.base)

        def boundary(beta):
            # Fractional frequency index associated with the rotation threshold `beta`.
            return half_dim * (math.log(self.max_position_embeddings / (2 * math.pi * beta)) / log_base)

        # the dimension whose index is smaller than fast_dim rotates more than beta_fast
        fast_dim = max(math.floor(boundary(self.beta_fast)), 0)
        # the dimension whose index is bigger than slow_dim rotates less than beta_slow
        slow_dim = min(math.ceil(boundary(self.beta_slow)), self.dim - 1)
        if fast_dim == slow_dim:
            # Keep the ramp denominator non-zero.
            slow_dim += 0.001

        # NOTE: very important to use full precision here so that the factor is correct
        indices = torch.arange(0, self.dim // 2, dtype=torch.float32)
        ramp = torch.clamp((indices - fast_dim) / (slow_dim - fast_dim), 0, 1)

        # align with the paper notation
        return (1 - ramp)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build inv_freq, the cos/sin tables and the temperature table for ``seq_len`` positions."""
        exponents = torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim
        # dim / 2
        theta = 1 / (self.base ** exponents)
        interleave_theta = theta / self.scaling_factor

        weight = self._get_factor().to(device)
        yarn_theta = weight * theta + (1 - weight) * interleave_theta
        self.register_buffer("inv_freq", yarn_theta, persistent=False)

        positions = torch.arange(seq_len, device=device, dtype=torch.float32)
        angles = torch.outer(positions, self.inv_freq)
        table = torch.cat((angles, angles), dim=-1)

        # NOTE: get attention temperature that will be applied on the query vector
        relative_pos = (positions + 1) / self.max_position_embeddings
        temperature = (0.07 * torch.log(relative_pos) + 1) ** 2
        # Positions inside the original range keep a neutral temperature.
        temperature[:self.max_position_embeddings] = 1

        self.register_buffer("temperature", temperature.unsqueeze(1), persistent=False)
        self.register_buffer("cos_cached", table.cos(), persistent=False)
        self.register_buffer("sin_cached", table.sin(), persistent=False)
        self.max_seq_len_cached = seq_len

    def forward(self, q, k, position_ids):
        """Rotate ``q`` and ``k``; the query side is additionally scaled by its temperature.

        ``q`` and ``k`` are laid out as [bs, num_attention_heads, seq_len, head_size].

        Returns:
            Tuple ``(q_embed, k_embed)`` with the rotated tensors.
        """
        required_len = max(position_ids.max().item() + 1, k.shape[2])
        if required_len > self.max_seq_len_cached:
            # The scaling factor follows the longest range seen so far.
            self.scaling_factor = required_len / self.max_position_embeddings
            self._set_cos_sin_cache(seq_len=required_len, device=k.device, dtype=k.dtype)

        # batch_size, 1, key_len, head_dim
        k_cos = self.cos_cached[position_ids].to(dtype=k.dtype).unsqueeze(1)
        k_sin = self.sin_cached[position_ids].to(dtype=k.dtype).unsqueeze(1)

        num_queries = q.shape[2]
        q_cos = k_cos[..., -num_queries:, :]
        q_sin = k_sin[..., -num_queries:, :]

        # Temperature is looked up at the positions of the queries only.
        q_position_ids = position_ids[:, -num_queries:]
        q_temperature = self.temperature[q_position_ids].to(dtype=k.dtype).unsqueeze(1)
        q_cos = q_cos * q_temperature
        q_sin = q_sin * q_temperature

        q_embed = (q * q_cos) + (rotate_half(q) * q_sin)
        k_embed = (k * k_cos) + (rotate_half(k) * k_sin)
        return q_embed, k_embed
+class YarnDynamicTemperatureLogNRotaryEmbedding(torch.nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, beta_slow=2, beta_fast=128):
+        super().__init__()
+        self.base = base
+        self.dim = dim
+        self.scaling_factor = scaling_factor
+        self.beta_slow = beta_slow
+        self.beta_fast = beta_fast
+        self.max_position_embeddings = max_position_embeddings
+        self._set_cos_sin_cache(
+            seq_len=math.ceil(max_position_embeddings * scaling_factor), device=device, dtype=torch.get_default_dtype()
+        )
+    def _get_factor(self):
+        # the dimension whose index is smaller than fast_dim rotates more than beta_fast
+        fast_dim = self.dim / 2 * (math.log(self.max_position_embeddings / (2 * math.pi * self.beta_fast)) / math.log(self.base))
+        fast_dim = max(math.floor(fast_dim), 0)
+        # the dimension whose index is bigger than slow_dim rotates less than beta_slow
+        slow_dim = self.dim / 2 * (math.log(self.max_position_embeddings / (2 * math.pi * self.beta_slow)) / math.log(self.base))
+        slow_dim = min(math.ceil(slow_dim), self.dim - 1)
+        if fast_dim == slow_dim:
+            slow_dim += 0.001
+        # NOTE: very important to use full precision here so that the factor is correct
+        dim_arange = torch.arange(0, self.dim // 2, dtype=torch.float32)
+        dim_factor = (dim_arange - fast_dim) / (slow_dim - fast_dim)
+        dim_factor = torch.clamp(dim_factor, 0, 1)
+        # align with the paper notation
+        return (1 - dim_factor)
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        dim_arange = torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim
+        # dim / 2
+        freq = self.base ** dim_arange
+        theta = 1 / freq
+        interleave_theta = theta / self.scaling_factor
+        factor = self._get_factor().to(device)
+        yarn_theta = factor * theta + (1 - factor) * interleave_theta
+        self.register_buffer("inv_freq", yarn_theta, persistent=False)
+        positions = torch.arange(seq_len, device=device, dtype=torch.float32)
+        freqs = torch.outer(positions, self.inv_freq)
+        emb = torch.cat((freqs, freqs), dim=-1)
+        # NOTE: get attention temperature that will be applied on the query vector
+        temperature = torch.log(positions + 1) / math.log(self.max_position_embeddings)
+        # temperature = (0.07 * torch.log((positions + 1) / self.max_position_embeddings) + 1) ** 2
+        temperature[:self.max_position_embeddings] = 1
+        self.register_buffer("temperature", temperature.unsqueeze(1), persistent=False)
+        self.register_buffer("cos_cached", emb.cos(), persistent=False)
+        self.register_buffer("sin_cached", emb.sin(), persistent=False)
+        self.max_seq_len_cached = seq_len
+    def forward(self, q, k, position_ids):
+        seq_len = max(position_ids.max().item() + 1, k.shape[2])
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        if seq_len > self.max_seq_len_cached:
+            self.scaling_factor = seq_len / self.max_position_embeddings
+            self._set_cos_sin_cache(seq_len=seq_len, device=k.device, dtype=k.dtype)
+        # batch_size, 1, key_len, head_dim
+        k_cos = self.cos_cached[position_ids].to(dtype=k.dtype).unsqueeze(1)
+        k_sin = self.sin_cached[position_ids].to(dtype=k.dtype).unsqueeze(1)
+        q_cos = k_cos[..., -q.shape[2]:, :]
+        q_sin = k_sin[..., -q.shape[2]:, :]
+        q_position_ids = position_ids[:, -q.shape[2]:]
+        temperature = self.temperature[q_position_ids].to(dtype=k.dtype).unsqueeze(1)
+        q_cos = q_cos * temperature
+        q_sin = q_sin * temperature
+        q_embed = (q * q_cos) + (rotate_half(q) * q_sin)
+        k_embed = (k * k_cos) + (rotate_half(k) * k_sin)
+        return q_embed, k_embed
+class Llama3RotaryEmbedding(torch.nn.Module):
+    def __init__(self, dim, max_position_embeddings=8192, base=10000, device=None, scaling_factor=1.0, original_max_position_embeddings=8192, low_freq_factor=1, high_freq_factor=4):
+        super().__init__()
+        self.base = base
+        self.dim = dim
+        self.scaling_factor = scaling_factor
+        self.original_max_position_embeddings = original_max_position_embeddings
+        self.max_position_embeddings = max(max_position_embeddings, int(original_max_position_embeddings * scaling_factor))
+        self.low_freq_factor = low_freq_factor
+        self.high_freq_factor = high_freq_factor
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.float32).to(device) / self.dim))
+        low_freq_wavelen = self.original_max_position_embeddings / low_freq_factor
+        high_freq_wavelen = self.original_max_position_embeddings / high_freq_factor
+        new_freqs = []
+        for freq in inv_freq:
+            wavelen = 2 * math.pi / freq
+            if wavelen < high_freq_wavelen:
+                new_freqs.append(freq)
+            elif wavelen > low_freq_wavelen:
+                new_freqs.append(freq / scaling_factor)
+            else:
+                assert low_freq_wavelen != high_freq_wavelen
+                smooth = (self.original_max_position_embeddings / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                new_freqs.append((1 - smooth) * freq / scaling_factor + smooth * freq)
+        inv_freq = torch.tensor(new_freqs, dtype=inv_freq.dtype, device=inv_freq.device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self._set_cos_sin_cache(seq_len=self.max_position_embeddings, device=device)
+    def _set_cos_sin_cache(self, seq_len, device):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.float32)
+        freqs = torch.outer(t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos(), persistent=False)
+        self.register_buffer("sin_cached", emb.sin(), persistent=False)
+    def forward(self, q, k, position_ids):
+        seq_len = max(position_ids.max().item() + 1, k.shape[2])
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=k.device)
+        k_cos = self.cos_cached[position_ids].to(dtype=k.dtype).unsqueeze(1)
+        k_sin = self.sin_cached[position_ids].to(dtype=k.dtype).unsqueeze(1)
+        q_cos = k_cos[..., -q.shape[2]:, :]
+        q_sin = k_sin[..., -q.shape[2]:, :]
+        q_embed = (q * q_cos) + (rotate_half(q) * q_sin)
+        k_embed = (k * k_cos) + (rotate_half(k) * k_sin)
+        return q_embed, k_embed