gagan3012
/

batch_invariant_kernel

kernel

Model card Files Files and versions

xet

Community

gagan3012 committed on Sep 11

Commit

b1fc84a

verified ·

1 Parent(s): 9eaa1e0

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

flake.lock +168 -0
torch-ext/batch_invariant/__init__.py +168 -0

flake.lock ADDED Viewed

	@@ -0,0 +1,168 @@

+{
+  "nodes": {
+    "flake-compat": {
+      "locked": {
+        "lastModified": 1747046372,
+        "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
+        "type": "github"
+      },
+      "original": {
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "type": "github"
+      }
+    },
+    "flake-compat_2": {
+      "locked": {
+        "lastModified": 1747046372,
+        "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
+        "type": "github"
+      },
+      "original": {
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "type": "github"
+      }
+    },
+    "flake-utils": {
+      "inputs": {
+        "systems": "systems"
+      },
+      "locked": {
+        "lastModified": 1731533236,
+        "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "flake-utils_2": {
+      "inputs": {
+        "systems": "systems_2"
+      },
+      "locked": {
+        "lastModified": 1731533236,
+        "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "hf-nix": {
+      "inputs": {
+        "flake-compat": "flake-compat_2",
+        "flake-utils": "flake-utils_2",
+        "nixpkgs": "nixpkgs"
+      },
+      "locked": {
+        "lastModified": 1756316789,
+        "narHash": "sha256-DJvw0l+PXeFq963L3sbqAQKjIwGPae+yWpZHraFES28=",
+        "owner": "huggingface",
+        "repo": "hf-nix",
+        "rev": "57ea72ac74c89331005c47bb082b28cef653bed8",
+        "type": "github"
+      },
+      "original": {
+        "owner": "huggingface",
+        "repo": "hf-nix",
+        "type": "github"
+      }
+    },
+    "kernel-builder": {
+      "inputs": {
+        "flake-compat": "flake-compat",
+        "flake-utils": "flake-utils",
+        "hf-nix": "hf-nix",
+        "nixpkgs": [
+          "kernel-builder",
+          "hf-nix",
+          "nixpkgs"
+        ]
+      },
+      "locked": {
+        "lastModified": 1757060761,
+        "narHash": "sha256-aKGP9jgV6N8aRF7jR3OnYSBmOa6C6u4ULRpvcThgFck=",
+        "owner": "huggingface",
+        "repo": "kernel-builder",
+        "rev": "08fcbf386981dc0fb7e47679d3ba86d77a33721b",
+        "type": "github"
+      },
+      "original": {
+        "owner": "huggingface",
+        "repo": "kernel-builder",
+        "type": "github"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1755963616,
+        "narHash": "sha256-6yD0ww/S8n+U2uPYcJZ3DRURP8Kx036GRpR2uPNZroE=",
+        "owner": "nixos",
+        "repo": "nixpkgs",
+        "rev": "73e96df7cff5783f45e21342a75a1540c4eddce4",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nixos",
+        "ref": "nixos-unstable-small",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "kernel-builder": "kernel-builder"
+      }
+    },
+    "systems": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
+    },
+    "systems_2": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}

torch-ext/batch_invariant/__init__.py CHANGED Viewed

@@ -1,4 +1,6 @@
 import torch
 from ._ops import ops
@@ -118,3 +120,169 @@ def mean_batch_invariant(input, dim, keepdim=False, dtype: torch.dtype = None):
         for d in dim:
             n_elems *= input.shape[d]
         return torch.sum(input, dim=dim, keepdim=keepdim, dtype=torch.float32) / n_elems

 import torch
+import torch.nn as nn
+import math
 from ._ops import ops
         for d in dim:
             n_elems *= input.shape[d]
         return torch.sum(input, dim=dim, keepdim=keepdim, dtype=torch.float32) / n_elems
+class BatchInvariantAttention(nn.Module):
+    """
+    Batch invariant multi-head attention implementation.
+    Compatible with transformers library integration.
+    """
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.max_position_embeddings = getattr(config, "max_position_embeddings", 2048)
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        # Linear projections
+        self.q_proj = nn.Linear(
+            self.hidden_size, self.num_heads * self.head_dim, bias=False
+        )
+        self.k_proj = nn.Linear(
+            self.hidden_size, self.num_heads * self.head_dim, bias=False
+        )
+        self.v_proj = nn.Linear(
+            self.hidden_size, self.num_heads * self.head_dim, bias=False
+        )
+        self.o_proj = nn.Linear(
+            self.num_heads * self.head_dim, self.hidden_size, bias=False
+        )
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor = None,
+        position_ids: torch.Tensor = None,
+        past_key_value=None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: torch.Tensor = None,
+        **kwargs,
+    ):
+        batch_size, seq_len, _ = hidden_states.size()
+        # Project to Q, K, V using batch invariant matrix multiplication
+        query_states = self._batch_invariant_linear(hidden_states, self.q_proj.weight)
+        key_states = self._batch_invariant_linear(hidden_states, self.k_proj.weight)
+        value_states = self._batch_invariant_linear(hidden_states, self.v_proj.weight)
+        # Reshape for multi-head attention
+        query_states = query_states.view(
+            batch_size, seq_len, self.num_heads, self.head_dim
+        ).transpose(1, 2)
+        key_states = key_states.view(
+            batch_size, seq_len, self.num_heads, self.head_dim
+        ).transpose(1, 2)
+        value_states = value_states.view(
+            batch_size, seq_len, self.num_heads, self.head_dim
+        ).transpose(1, 2)
+        # Compute attention scores
+        attn_weights = torch.matmul(
+            query_states, key_states.transpose(2, 3)
+        ) / math.sqrt(self.head_dim)
+        # Apply attention mask if provided
+        if attention_mask is not None:
+            attn_weights = attn_weights + attention_mask
+        # Apply softmax using batch invariant log_softmax
+        attn_weights_log = log_softmax(attn_weights, dim=-1)
+        attn_weights = torch.exp(attn_weights_log)
+        # Apply attention to values
+        attn_output = torch.matmul(attn_weights, value_states)
+        # Reshape and apply output projection
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(batch_size, seq_len, self.hidden_size)
+        attn_output = self._batch_invariant_linear(attn_output, self.o_proj.weight)
+        outputs = (attn_output,)
+        if output_attentions:
+            outputs += (attn_weights,)
+        if use_cache:
+            outputs += (past_key_value,)
+        return outputs
+    def _batch_invariant_linear(
+        self, input_tensor: torch.Tensor, weight: torch.Tensor
+    ) -> torch.Tensor:
+        """Apply linear transformation using batch invariant matrix multiplication"""
+        original_shape = input_tensor.shape
+        input_2d = input_tensor.view(-1, original_shape[-1])
+        output_2d = matmul_persistent(input_2d, weight.t())
+        return output_2d.view(*original_shape[:-1], -1)
+class BatchInvariantMLP(nn.Module):
+    """
+    Batch invariant MLP implementation.
+    """
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = (
+            nn.SiLU()
+        )  # or whatever activation function is specified in config
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Use batch invariant matrix multiplication for projections
+        gate = self._batch_invariant_linear(x, self.gate_proj.weight)
+        up = self._batch_invariant_linear(x, self.up_proj.weight)
+        # Apply activation
+        intermediate = self.act_fn(gate) * up
+        # Down projection
+        output = self._batch_invariant_linear(intermediate, self.down_proj.weight)
+        return output
+    def _batch_invariant_linear(
+        self, input_tensor: torch.Tensor, weight: torch.Tensor
+    ) -> torch.Tensor:
+        """Apply linear transformation using batch invariant matrix multiplication"""
+        original_shape = input_tensor.shape
+        input_2d = input_tensor.view(-1, original_shape[-1])
+        output_2d = matmul_persistent(input_2d, weight.t())
+        return output_2d.view(*original_shape[:-1], -1)
+class BatchInvariantRMSNorm(nn.Module):
+    """
+    Batch invariant RMS normalization implementation.
+    """
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        # Compute mean square using batch invariant mean
+        variance = mean_dim(hidden_states.pow(2), dim=-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+# Add the layer classes to the module's public API.
+# NOTE(review): this in-place extension relies on `__all__` already being
+# defined earlier in the module - confirm.
+__all__ += ["BatchInvariantAttention", "BatchInvariantMLP", "BatchInvariantRMSNorm"]