Faisal AlKhateeb committed
Commit 2f32550
Parent(s): 68be314

added support for position interpolation

Files changed:
- README.md +29 -0
- configuration_btlm.py +36 -0
- modeling_btlm.py +14 -3
README.md
CHANGED
@@ -162,6 +162,35 @@ Ensure the following muP parameters are passed in your config, otherwise your mo
 - `mup_output_alpha: <float>`
 - `mup_scale_qk_dot_by_d: true`
 
+## To extend the context length with Position Interpolation
+
+### During inference (without fine-tuning):
+It's possible to extend the context length to 2x the training context length without degradation in performance using dynamic linear scaling. Dynamic linear scaling adjusts the slopes of ALiBi by a factor of `input_seq_len/train_seq_len` when `input_seq_len` is larger than `train_seq_len`. See the details in our paper [Position Interpolation Improves ALiBi Extrapolation](https://arxiv.org/abs/2310.13017). To enable dynamic linear scaling, update `config.json` as follows:
+```json
+# update `n_positions` with the maximum context length that will be
+# encountered during inference (e.g. 16384 tokens)
+"n_positions": 16384,
+
+# specify `train_seq_len` in the `alibi_scaling` parameter
+"alibi_scaling": {
+    "type": "linear",
+    "train_seq_len": 8192
+}
+```
+
+### Using fine-tuning + position interpolation:
+Performing fine-tuning with position interpolation can help achieve greater extrapolation lengths. The scaling factor should be fixed to `finetuning_seq_len/train_seq_len`. To enable fixed linear scaling, update `config.json` as follows:
+```json
+# update `n_positions` with the fine-tuning context length (e.g. 32768 tokens)
+"n_positions": 32768,
+
+# specify the scaling `factor` in the `alibi_scaling` parameter
+"alibi_scaling": {
+    "type": "linear",
+    "factor": 4.0
+}
+```
+
 ## Uses and Limitations
 
 ### Intended Use
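
The README additions above describe editing `config.json` by hand. The same position-interpolation settings can also be applied at load time by overriding the config in code. The sketch below is illustrative and not part of this commit; it assumes the Hugging Face `transformers` library and uses `cerebras/btlm-3b-8k-base` as a placeholder repo id.

```python
# Illustrative sketch (not part of this commit): enable dynamic linear ALiBi
# scaling at load time instead of editing config.json by hand.
from transformers import AutoConfig, AutoModelForCausalLM

model_id = "cerebras/btlm-3b-8k-base"  # placeholder repo id

config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
config.n_positions = 16384  # longest sequence expected at inference
config.alibi_scaling = {"type": "linear", "train_seq_len": 8192}  # dynamic scaling past 8192 tokens

model = AutoModelForCausalLM.from_pretrained(model_id, config=config, trust_remote_code=True)

# For fine-tuning at 32768 tokens, fix the factor instead:
# finetuning_seq_len / train_seq_len = 32768 / 8192 = 4.0
# config.alibi_scaling = {"type": "linear", "factor": 4.0}
```

With the dynamic setting, sequences at or below 8192 tokens keep a scale of 1.0, so short-context behavior is unchanged.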
configuration_btlm.py
CHANGED
@@ -84,6 +84,12 @@ class BTLMConfig(PretrainedConfig):
         mup_scale_qk_dot_by_d (`bool`, *optional*, defaults to `False`):
             Scale attention weights by dividing by hidden_size instead of sqrt(hidden_size). Need to set
             scale_attn_weights to `True` as well.
+        alibi_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for ALiBi embeddings. Currently only supports linear
+            scaling strategy. Can specify either the scaling `factor` (must be a float greater than 1) for fixed scaling
+            or `train_seq_len` for dynamic scaling on input samples with sequence length > `train_seq_len`. The expected
+            formats are `{"type": strategy name, "factor": scaling factor}` or
+            `{"type": strategy name, "train_seq_len": training sequence length}`.
 
     Example:
 
@@ -134,6 +140,7 @@ class BTLMConfig(PretrainedConfig):
         mup_embeddings_scale=1.0,
         mup_output_alpha=1.0,
         mup_scale_qk_dot_by_d=False,
+        alibi_scaling=None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -162,4 +169,33 @@ class BTLMConfig(PretrainedConfig):
         self.mup_output_alpha = mup_output_alpha
         self.mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d
 
+        self.alibi_scaling = alibi_scaling
+        self._alibi_scaling_validation()
+
         super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+    def _alibi_scaling_validation(self):
+        """
+        Validate the `alibi_scaling` configuration.
+        """
+        if self.alibi_scaling is None:
+            return
+
+        if not isinstance(self.alibi_scaling, dict) or len(self.alibi_scaling) != 2:
+            raise ValueError(
+                "`alibi_scaling` must be a dictionary with two fields, `type` and `factor` or `type` and `train_seq_len`, "
+                f"got {self.alibi_scaling}"
+            )
+        alibi_scaling_type = self.alibi_scaling.get("type", None)
+        alibi_scaling_factor = self.alibi_scaling.get("factor", None)
+        alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
+        if alibi_scaling_type is None or alibi_scaling_type != "linear":
+            raise ValueError(
+                f"`alibi_scaling`'s type field must be 'linear', got {alibi_scaling_type}"
+            )
+        if alibi_scaling_factor is not None:
+            if not isinstance(alibi_scaling_factor, float) or alibi_scaling_factor <= 1.0:
+                raise ValueError(f"`alibi_scaling`'s factor field must be a float > 1.0, got {alibi_scaling_factor}")
+        if alibi_dynamic_scaling is not None:
+            if not isinstance(alibi_dynamic_scaling, int) or alibi_dynamic_scaling <= 1:
+                raise ValueError(f"`alibi_scaling`'s `train_seq_len` field must be an integer > 1, got {alibi_dynamic_scaling}")
modeling_btlm.py
CHANGED
@@ -63,10 +63,11 @@ class SwiGLUActivation(nn.Module):
 
 
 class AlibiPositionEmbeddingLayer(nn.Module):
-    def __init__(self, num_heads):
+    def __init__(self, num_heads, alibi_scaling=None):
         super(AlibiPositionEmbeddingLayer, self).__init__()
 
         self.num_heads = num_heads
+        self.alibi_scaling = alibi_scaling
         slopes = torch.tensor(AlibiPositionEmbeddingLayer._get_alibi_slopes(num_heads)).unsqueeze(-1)
         self.slopes = nn.parameter.Parameter(slopes, requires_grad=False)
 
@@ -84,7 +85,17 @@ class AlibiPositionEmbeddingLayer(nn.Module):
         )[None, :]
         relative_position = memory_position - context_position
         relative_position = torch.abs(relative_position).unsqueeze(0).expand(self.num_heads, -1, -1)
-        alibi = (self.slopes * -1.0).unsqueeze(1) * relative_position
+
+        if self.alibi_scaling is None:
+            scale = 1.0
+        elif self.alibi_scaling.get("factor") is not None:
+            scale = self.alibi_scaling["factor"]
+        elif relative_position.shape[-1] > self.alibi_scaling["train_seq_len"]:
+            scale = relative_position.shape[-1] / self.alibi_scaling["train_seq_len"]
+        else:
+            scale = 1.0
+
+        alibi = (self.slopes / -scale).unsqueeze(1) * relative_position
         return alibi
 
     @staticmethod
@@ -766,7 +777,7 @@ class BTLMModel(BTLMPreTrainedModel):
         self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
 
         self.relative_pe = (
-            AlibiPositionEmbeddingLayer(config.num_attention_heads)
+            AlibiPositionEmbeddingLayer(config.num_attention_heads, config.alibi_scaling)
             if config.position_embedding_type == "alibi"
             else None
         )