iamwyldecat committed
Commit f3b99fb · 1 Parent(s): d14fd4d

feat(rms-norm): Impl fused RMSNorm

Files changed (26)
  1. README.md +1 -0
  2. activation/block_reduce.h +20 -0
  3. activation/{activation_kernels.cu → poly_norm.cu} +9 -20
  4. activation/rms_norm.cu +168 -0
  5. build.toml +3 -1
  6. build/flake.lock +0 -168
  7. build/flake.nix +0 -11
  8. build/torch26-cxx11-rocm62-x86_64-linux/activation/__init__.py +9 -0
  9. build/{torch27-cxx11-rocm63-x86_64-linux/activation/_activation_704692b_dirty.abi3.so → torch26-cxx11-rocm62-x86_64-linux/activation/_activation_d14fd4d_dirty.abi3.so} +2 -2
  10. build/torch26-cxx11-rocm62-x86_64-linux/activation/_ops.py +3 -3
  11. build/torch26-cxx11-rocm62-x86_64-linux/activation/layers.py +17 -3
  12. build/torch26-cxx11-rocm62-x86_64-linux/activation/rms_norm.py +34 -0
  13. build/torch27-cxx11-rocm63-x86_64-linux/activation/__init__.py +9 -0
  14. build/{torch26-cxx11-rocm62-x86_64-linux/activation/_activation_704692b_dirty.abi3.so → torch27-cxx11-rocm63-x86_64-linux/activation/_activation_d14fd4d_dirty.abi3.so} +2 -2
  15. build/torch27-cxx11-rocm63-x86_64-linux/activation/_ops.py +3 -3
  16. build/torch27-cxx11-rocm63-x86_64-linux/activation/layers.py +17 -3
  17. build/torch27-cxx11-rocm63-x86_64-linux/activation/rms_norm.py +34 -0
  18. tests/conftest.py +1 -1
  19. tests/kernels/{test_activation.py → test_poly_norm.py} +0 -0
  20. tests/kernels/{test_perf.py → test_poly_norm_perf.py} +1 -1
  21. tests/kernels/test_rms_norm.py +72 -0
  22. torch-ext/activation/__init__.py +9 -0
  23. torch-ext/activation/layers.py +17 -3
  24. torch-ext/activation/rms_norm.py +34 -0
  25. torch-ext/torch_binding.cpp +8 -0
  26. torch-ext/torch_binding.h +3 -0
README.md CHANGED
@@ -9,6 +9,7 @@ Activation is a python package that contains custom CUDA-based activation kernel
 
  - Currently implemented
  - [PolyNorm](https://arxiv.org/html/2411.03884v1)
+ - [RMSNorm](https://docs.pytorch.org/docs/stable/generated/torch.nn.RMSNorm.html)
 
  ## Usage
 
activation/block_reduce.h ADDED
@@ -0,0 +1,20 @@
+ namespace motif {
+
+ template <typename acc_t, int BLOCK_SIZE>
+ __device__ acc_t _block_reduce_sum(acc_t* shared, const float val, const int d) {
+   // TODO: Optimize with warp-level primitives
+   __syncthreads();
+
+   shared[threadIdx.x] = threadIdx.x < d ? val : 0.0f;
+   __syncthreads();
+   for (int stride = BLOCK_SIZE / 2; stride > 0; stride /= 2) {
+     if (threadIdx.x < stride) {
+       shared[threadIdx.x] += shared[threadIdx.x + stride];
+     }
+     __syncthreads();
+   }
+
+   return shared[0];
+ }
+
+ } // motif
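A quick note on the helper that moved into this header: `_block_reduce_sum` is a shared-memory tree reduction over one thread block. Each thread contributes one partial value, lanes with `threadIdx.x >= d` contribute zero, and every thread receives the block-wide sum from `shared[0]`. A minimal PyTorch sketch of the same semantics (the function name below is illustrative, not part of the package):

```python
import torch

def block_reduce_sum_reference(partials: torch.Tensor, d: int) -> torch.Tensor:
    """Emulates _block_reduce_sum: sum one partial value per thread,
    counting only the first d lanes, and return the block-wide total."""
    lane = torch.arange(partials.numel(), device=partials.device)
    return torch.where(lane < d, partials, torch.zeros_like(partials)).sum()
```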
activation/{activation_kernels.cu → poly_norm.cu} RENAMED
@@ -9,26 +9,10 @@
  #include "dispatch_utils.h"
  #include "assert_utils.h"
  #include "atomic_utils.h"
+ #include "block_reduce.h"
 
  namespace motif {
 
- template <typename acc_t, int BLOCK_SIZE>
- __device__ acc_t _block_reduce_sum(acc_t* shared, const float val, const int d) {
-   // TODO: Optimize with warp-level primitives
-   __syncthreads();
-
-   shared[threadIdx.x] = threadIdx.x < d ? val : 0.0f;
-   __syncthreads();
-   for (int stride = BLOCK_SIZE / 2; stride > 0; stride /= 2) {
-     if (threadIdx.x < stride) {
-       shared[threadIdx.x] += shared[threadIdx.x + stride];
-     }
-     __syncthreads();
-   }
-
-   return shared[0];
- }
-
  template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
  __global__ void poly_norm_kernel(
      scalar_t* __restrict__ out, // [..., d]
@@ -251,7 +235,12 @@ void poly_norm_backward(
      }
    );
 
-   at::sum_out(bias_grad, output_grad);
-   at::sum_out(weight_grad, temp_weight_grad, {0});
-   bias_grad.resize_({1});
+   if (bias_grad.defined()) {
+     at::sum_out(bias_grad, output_grad);
+     bias_grad.resize_({1});
+   }
+
+   if (weight_grad.defined()) {
+     at::sum_out(weight_grad, temp_weight_grad, {0});
+   }
  }
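The new `.defined()` guards make `bias_grad` and `weight_grad` optional: when the Python side passes `None` for a gradient it does not need, the C++ op receives an undefined tensor and the corresponding `at::sum_out` is skipped. A hedged sketch of a hypothetical caller (the helper name below is illustrative):

```python
import torch

def call_poly_norm_backward(ops, needs_grad, output_grad, x, weight, bias, eps):
    # Allocate gradient buffers only for the inputs that actually need them;
    # None reaches the C++ op as an undefined tensor and is skipped by .defined().
    input_grad = torch.empty_like(x) if needs_grad[0] else None
    weight_grad = torch.empty_like(weight) if needs_grad[1] else None
    bias_grad = torch.empty_like(bias) if needs_grad[2] else None
    ops.poly_norm_backward(input_grad, weight_grad, bias_grad,
                           output_grad, x, weight, eps)
    return input_grad, weight_grad, bias_grad
```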
activation/rms_norm.cu ADDED
@@ -0,0 +1,168 @@
+ #include <ATen/cuda/CUDAContext.h>
+ #include <ATen/Functions.h>
+ #include <torch/all.h>
+ #include <c10/cuda/CUDAGuard.h>
+
+ #include <cmath>
+
+ #include "cuda_compat.h"
+ #include "dispatch_utils.h"
+ #include "assert_utils.h"
+ #include "atomic_utils.h"
+ #include "block_reduce.h"
+
+ namespace motif {
+
+ template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
+ __global__ void rms_norm_kernel(
+     scalar_t* __restrict__ out,          // [..., d]
+     const scalar_t* __restrict__ input,  // [..., d]
+     const scalar_t* __restrict__ weight, // [d]
+     const float eps,
+     const int d
+ ) {
+
+   const int64_t token_idx = blockIdx.x;
+   const int64_t vec_idx = threadIdx.x;
+   acc_t sum_square = 0.0f;
+
+   for (int64_t idx = vec_idx; idx < d; idx += blockDim.x) {
+     acc_t x = input[token_idx * d + idx];
+     sum_square += x * x;
+   }
+
+   __shared__ acc_t shared[BLOCK_SIZE];
+
+   acc_t variance =
+       _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_square, d) / d;
+   acc_t scale = rsqrt(variance + eps);
+   for (int64_t idx = vec_idx; idx < d; idx += blockDim.x) {
+     acc_t x = input[token_idx * d + idx];
+     acc_t w = weight[idx];
+     out[token_idx * d + idx] = w * x * scale;
+   }
+ }
+
+ template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
+ __global__ void rms_norm_backward_kernel(
+     scalar_t* __restrict__ input_grad,        // [..., d]
+     acc_t* __restrict__ temp_weight_grad,     // [..., d]
+     const scalar_t* __restrict__ output_grad, // [..., d]
+     const scalar_t* __restrict__ input,       // [..., d]
+     const scalar_t* __restrict__ weight,      // [d]
+     const float eps,
+     const int d
+ ) {
+   const int64_t token_idx = blockIdx.x;
+   const int64_t vec_idx = threadIdx.x;
+   acc_t d_sum = 0.0f;
+   acc_t sum_square = 0.0f;
+
+   for (int64_t idx = vec_idx; idx < d; idx += blockDim.x) {
+     acc_t x = input[token_idx * d + idx];
+     acc_t dy = output_grad[token_idx * d + idx];
+     acc_t w = weight[idx];
+     d_sum += dy * x * w;
+     sum_square += x * x;
+   }
+
+   __shared__ acc_t shared[BLOCK_SIZE];
+
+   d_sum = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, d_sum, d);
+   acc_t variance =
+       _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_square, d) / d;
+   acc_t scale = rsqrt(variance + eps);
+   acc_t scale_cubed = scale * scale * scale;
+   acc_t dxx = d_sum * scale_cubed / d;
+
+   for (int64_t idx = vec_idx; idx < d; idx += blockDim.x) {
+     acc_t x = input[token_idx * d + idx];
+     acc_t dy = output_grad[token_idx * d + idx];
+     acc_t w = weight[idx];
+
+     input_grad[token_idx * d + idx] =
+         scale * dy * w - dxx * x;
+
+     if (temp_weight_grad) {
+       temp_weight_grad[token_idx * d + idx] = dy * x * scale;
+     }
+   }
+ }
+
+ } // namespace motif
+
+
+ void rms_norm(torch::Tensor& out,          // [..., d]
+               const torch::Tensor& input,  // [..., d]
+               const torch::Tensor& weight, // [d]
+               double eps)
+ {
+   AssertTensorShapeEqual(input, out, "input", "out");
+   AssertTensorNotNull(weight, "weight");
+   // TODO shape check
+
+   constexpr int BLOCK_SIZE = 256;
+
+   int d = input.size(-1);
+   int64_t num_tokens = input.numel() / input.size(-1);
+   dim3 grid(num_tokens);
+   dim3 block(BLOCK_SIZE);
+
+   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+   MOTIF_DISPATCH_FLOATING_TYPES(
+     input.scalar_type(), "rms_norm_kernel", [&] {
+       motif::rms_norm_kernel<scalar_t, float, BLOCK_SIZE>
+         <<<grid, block, 0, stream>>>(
+           out.data_ptr<scalar_t>(),
+           input.data_ptr<scalar_t>(),
+           weight.data_ptr<scalar_t>(),
+           eps, d);
+     }
+   );
+ }
+
+ void rms_norm_backward(
+     torch::Tensor& input_grad,        // [..., d]
+     torch::Tensor& weight_grad,       // [..., d]
+     const torch::Tensor& output_grad, // [d]
+     const torch::Tensor& input,       // [d]
+     const torch::Tensor& weight,      // [d]
+     double eps) {
+   AssertTensorShapeEqual(input, input_grad, "input", "input_grad");
+   AssertTensorShapeEqual(input, output_grad, "input", "output_grad");
+   AssertTensorNotNull(weight, "weight");
+   // TODO shape check
+   // weight_grad, input_grad can be nullable
+
+   constexpr int BLOCK_SIZE = 256;
+
+   int d = input.size(-1);
+   int64_t num_tokens = input.numel() / input.size(-1);
+   dim3 grid(num_tokens);
+   dim3 block(BLOCK_SIZE);
+
+   torch::Tensor temp_weight_grad =
+       torch::empty({num_tokens, d},
+                    input.options().dtype(torch::kFloat));
+
+   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+   MOTIF_DISPATCH_FLOATING_TYPES(
+     input.scalar_type(), "rms_norm_backward_kernel", [&] {
+       motif::rms_norm_backward_kernel<scalar_t, float, BLOCK_SIZE>
+         <<<grid, block, 0, stream>>>(
+           input_grad.data_ptr<scalar_t>(),
+           temp_weight_grad.data_ptr<float>(),
+           output_grad.data_ptr<scalar_t>(),
+           input.data_ptr<scalar_t>(),
+           weight.data_ptr<scalar_t>(),
+           eps, d);
+     }
+   );
+
+   if (weight_grad.defined()) {
+     at::sum_out(weight_grad, temp_weight_grad, {0});
+   }
+ }
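For readers checking the kernel math, here is a hedged PyTorch reference that mirrors what the two kernels above compute: float32 accumulation (the kernels are instantiated with `acc_t = float`), a per-token scale `rsqrt(mean(x^2) + eps)`, and the same backward formula, with the weight gradient reduced over tokens exactly as `at::sum_out(weight_grad, temp_weight_grad, {0})` does. The function names are illustrative only, not part of the package:

```python
import torch

def rms_norm_ref(x: torch.Tensor, w: torch.Tensor, eps: float) -> torch.Tensor:
    # x: [..., d], w: [d]; accumulate in float32 like the kernel's acc_t.
    xf, wf = x.float(), w.float()
    scale = torch.rsqrt(xf.pow(2).mean(dim=-1, keepdim=True) + eps)
    return (wf * xf * scale).to(x.dtype)

def rms_norm_backward_ref(dy: torch.Tensor, x: torch.Tensor, w: torch.Tensor, eps: float):
    # Mirrors rms_norm_backward_kernel: dx = scale*dy*w - (sum(dy*x*w)*scale^3/d)*x,
    # dw = sum over tokens of dy*x*scale (the reduction over temp_weight_grad).
    xf, dyf, wf = x.float(), dy.float(), w.float()
    d = x.shape[-1]
    scale = torch.rsqrt(xf.pow(2).mean(dim=-1, keepdim=True) + eps)
    d_sum = (dyf * xf * wf).sum(dim=-1, keepdim=True)
    dx = scale * dyf * wf - (d_sum * scale.pow(3) / d) * xf
    dw = (dyf * xf * scale).reshape(-1, d).sum(dim=0)
    return dx.to(x.dtype), dw
```

Fusing these steps into one kernel launch per pass avoids materializing the normalized intermediate and the extra global-memory round trips a composed eager-mode implementation would incur.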
build.toml CHANGED
@@ -12,8 +12,10 @@ src = [
  backend = "rocm"
  rocm-archs = [ "gfx90a" ]
  src = [
-   "activation/activation_kernels.cu",
+   "activation/poly_norm.cu",
+   "activation/rms_norm.cu",
    "activation/cuda_compat.h",
+   "activation/block_reduce.h",
    "activation/dispatch_utils.h",
    "activation/assert_utils.h",
    "activation/atomic_utils.h",
build/flake.lock DELETED
@@ -1,168 +0,0 @@
- {
-   "nodes": {
-     "flake-compat": {
-       "locked": {
-         "lastModified": 1747046372,
-         "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
-         "owner": "edolstra",
-         "repo": "flake-compat",
-         "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
-         "type": "github"
-       },
-       "original": {
-         "owner": "edolstra",
-         "repo": "flake-compat",
-         "type": "github"
-       }
-     },
-     "flake-compat_2": {
-       "locked": {
-         "lastModified": 1733328505,
-         "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=",
-         "owner": "edolstra",
-         "repo": "flake-compat",
-         "rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec",
-         "type": "github"
-       },
-       "original": {
-         "owner": "edolstra",
-         "repo": "flake-compat",
-         "type": "github"
-       }
-     },
-     "flake-utils": {
-       "inputs": {
-         "systems": "systems"
-       },
-       "locked": {
-         "lastModified": 1731533236,
-         "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
-         "owner": "numtide",
-         "repo": "flake-utils",
-         "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
-         "type": "github"
-       },
-       "original": {
-         "owner": "numtide",
-         "repo": "flake-utils",
-         "type": "github"
-       }
-     },
-     "flake-utils_2": {
-       "inputs": {
-         "systems": "systems_2"
-       },
-       "locked": {
-         "lastModified": 1731533236,
-         "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
-         "owner": "numtide",
-         "repo": "flake-utils",
-         "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
-         "type": "github"
-       },
-       "original": {
-         "owner": "numtide",
-         "repo": "flake-utils",
-         "type": "github"
-       }
-     },
-     "hf-nix": {
-       "inputs": {
-         "flake-compat": "flake-compat_2",
-         "flake-utils": "flake-utils_2",
-         "nixpkgs": "nixpkgs"
-       },
-       "locked": {
-         "lastModified": 1747919133,
-         "narHash": "sha256-VvF1naQOvv7yulQ5/cDiaxkNxlh1Y84QMZnderv1szk=",
-         "owner": "huggingface",
-         "repo": "hf-nix",
-         "rev": "9c71e026d6c7c8588ef85a5f7c77f57d598e038c",
-         "type": "github"
-       },
-       "original": {
-         "owner": "huggingface",
-         "repo": "hf-nix",
-         "type": "github"
-       }
-     },
-     "kernel-builder": {
-       "inputs": {
-         "flake-compat": "flake-compat",
-         "flake-utils": "flake-utils",
-         "hf-nix": "hf-nix",
-         "nixpkgs": [
-           "kernel-builder",
-           "hf-nix",
-           "nixpkgs"
-         ]
-       },
-       "locked": {
-         "lastModified": 1748620233,
-         "narHash": "sha256-VULm9HgGXvo3pyfsPy3SOhoqgkuqbGSaSemvzNUbdIU=",
-         "owner": "huggingface",
-         "repo": "kernel-builder",
-         "rev": "da3340e5b3cbb6086600420f4814b033395788d1",
-         "type": "github"
-       },
-       "original": {
-         "owner": "huggingface",
-         "repo": "kernel-builder",
-         "type": "github"
-       }
-     },
-     "nixpkgs": {
-       "locked": {
-         "lastModified": 1747820358,
-         "narHash": "sha256-fTqsZsUX6M3yeEvgyQvXcbGmT2CaRVyVwsi8eK29Oj4=",
-         "owner": "danieldk",
-         "repo": "nixpkgs",
-         "rev": "d3c1681180717528068082103bf323147de6ab0b",
-         "type": "github"
-       },
-       "original": {
-         "owner": "danieldk",
-         "ref": "cudatoolkit-12.9-kernel-builder",
-         "repo": "nixpkgs",
-         "type": "github"
-       }
-     },
-     "root": {
-       "inputs": {
-         "kernel-builder": "kernel-builder"
-       }
-     },
-     "systems": {
-       "locked": {
-         "lastModified": 1681028828,
-         "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
-         "owner": "nix-systems",
-         "repo": "default",
-         "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
-         "type": "github"
-       },
-       "original": {
-         "owner": "nix-systems",
-         "repo": "default",
-         "type": "github"
-       }
-     },
-     "systems_2": {
-       "locked": {
-         "lastModified": 1681028828,
-         "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
-         "owner": "nix-systems",
-         "repo": "default",
-         "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
-         "type": "github"
-       },
-       "original": {
-         "owner": "nix-systems",
-         "repo": "default",
-         "type": "github"
-       }
-     }
-   },
-   "root": "root",
-   "version": 7
- }
build/flake.nix DELETED
@@ -1,11 +0,0 @@
- {
-   description = "Flake for Torch kernel extension";
-   inputs = {
-     kernel-builder.url = "github:huggingface/kernel-builder";
-   };
-   outputs = { self, kernel-builder, }:
-     kernel-builder.lib.genFlakeOutputs {
-       path = ./.;
-       rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
-     };
- }
build/torch26-cxx11-rocm62-x86_64-linux/activation/__init__.py CHANGED
@@ -3,6 +3,7 @@ import torch
  from . import layers
  from ._ops import ops
  from .poly_norm import PolyNormFunction
+ from .rms_norm import RMSNormFunction
 
 
  def poly_norm(
@@ -14,6 +15,14 @@ def poly_norm(
      return PolyNormFunction.apply(x, weight, bias, eps)
 
 
+ def rms_norm(
+     x: torch.Tensor,
+     weight: torch.Tensor,
+     eps: float = 1e-6,
+ ) -> None:
+     return RMSNormFunction.apply(x, weight, eps)
+
+
  __all__ = [
      "poly_norm",
      "layers",
build/{torch27-cxx11-rocm63-x86_64-linux/activation/_activation_704692b_dirty.abi3.so → torch26-cxx11-rocm62-x86_64-linux/activation/_activation_d14fd4d_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6fe6163d88e95c0d6847b3fe993cd80de677f89cfde7fc4d5c3ec2d0d96c9de8
- size 2395176
+ oid sha256:179bfe6bd5484e81b1d8fa6cc3e2596837946a17f0761b0bb2521fd162669046
+ size 2656296
build/torch26-cxx11-rocm62-x86_64-linux/activation/_ops.py CHANGED
@@ -1,9 +1,9 @@
  import torch
- from . import _activation_704692b_dirty
- ops = torch.ops._activation_704692b_dirty
+ from . import _activation_d14fd4d_dirty
+ ops = torch.ops._activation_d14fd4d_dirty
 
  def add_op_namespace_prefix(op_name: str):
      """
      Prefix op by namespace.
      """
-     return f"_activation_704692b_dirty::{op_name}"
+     return f"_activation_d14fd4d_dirty::{op_name}"
build/torch26-cxx11-rocm62-x86_64-linux/activation/layers.py CHANGED
@@ -2,13 +2,14 @@ import torch
  import torch.nn as nn
 
  from .poly_norm import PolyNormFunction
+ from .rms_norm import RMSNormFunction
 
 
  class PolyNorm(nn.Module):
-     def __init__(self, eps=1e-6):
+     def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
          super().__init__()
-         self.weight = torch.nn.Parameter(torch.ones(3) / 3)
-         self.bias = torch.nn.Parameter(torch.zeros(1))
+         self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
+         self.bias = torch.nn.Parameter(torch.zeros(1, dtype=dtype))
          self.eps = eps
 
      def forward(
@@ -16,3 +17,16 @@ class PolyNorm(nn.Module):
          x: torch.Tensor,
      ):
          return PolyNormFunction.apply(x, self.weight, self.bias, self.eps)
+
+
+ class RMSNorm(nn.Module):
+     def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
+         super().__init__()
+         self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))
+         self.eps = eps
+
+     def forward(
+         self,
+         x: torch.Tensor,
+     ):
+         return RMSNormFunction.apply(x, self.weight, self.eps)
build/torch26-cxx11-rocm62-x86_64-linux/activation/rms_norm.py ADDED
@@ -0,0 +1,34 @@
+ import torch
+
+ from ._ops import ops
+
+
+ # Inherit from Function
+ class RMSNormFunction(torch.autograd.Function):
+     # Note that forward, setup_context, and backward are @staticmethods
+     @staticmethod
+     def forward(input, weight, eps):
+         output = torch.empty_like(input)
+         ops.rms_norm(output, input, weight, eps)
+         return output
+
+     @staticmethod
+     # inputs is a Tuple of all of the inputs passed to forward.
+     # output is the output of the forward().
+     def setup_context(ctx, inputs, output):
+         input, weight, eps = inputs
+         ctx.save_for_backward(input, weight)
+         ctx.eps = eps
+
+     # This function has only a single output, so it gets only one gradient
+     @staticmethod
+     def backward(ctx, output_grad):
+         input, weight = ctx.saved_tensors
+         eps = ctx.eps
+
+         input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
+         weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
+
+         ops.rms_norm_backward(input_grad, weight_grad, output_grad, input, weight, eps)
+
+         return input_grad, weight_grad, None
build/torch27-cxx11-rocm63-x86_64-linux/activation/__init__.py CHANGED
@@ -3,6 +3,7 @@ import torch
  from . import layers
  from ._ops import ops
  from .poly_norm import PolyNormFunction
+ from .rms_norm import RMSNormFunction
 
 
  def poly_norm(
@@ -14,6 +15,14 @@ def poly_norm(
      return PolyNormFunction.apply(x, weight, bias, eps)
 
 
+ def rms_norm(
+     x: torch.Tensor,
+     weight: torch.Tensor,
+     eps: float = 1e-6,
+ ) -> None:
+     return RMSNormFunction.apply(x, weight, eps)
+
+
  __all__ = [
      "poly_norm",
      "layers",
build/{torch26-cxx11-rocm62-x86_64-linux/activation/_activation_704692b_dirty.abi3.so → torch27-cxx11-rocm63-x86_64-linux/activation/_activation_d14fd4d_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:417cf142fb8234b05f7e5b0be321d3a95ceafd7c0b3e5d3469579a52d78ddb1e
- size 2401160
+ oid sha256:94debfd52e15f782eb9dd328d9311080d803276745e440b176b20a7031299e3f
+ size 2642736
build/torch27-cxx11-rocm63-x86_64-linux/activation/_ops.py CHANGED
@@ -1,9 +1,9 @@
  import torch
- from . import _activation_704692b_dirty
- ops = torch.ops._activation_704692b_dirty
+ from . import _activation_d14fd4d_dirty
+ ops = torch.ops._activation_d14fd4d_dirty
 
  def add_op_namespace_prefix(op_name: str):
      """
      Prefix op by namespace.
      """
-     return f"_activation_704692b_dirty::{op_name}"
+     return f"_activation_d14fd4d_dirty::{op_name}"
build/torch27-cxx11-rocm63-x86_64-linux/activation/layers.py CHANGED
@@ -2,13 +2,14 @@ import torch
  import torch.nn as nn
 
  from .poly_norm import PolyNormFunction
+ from .rms_norm import RMSNormFunction
 
 
  class PolyNorm(nn.Module):
-     def __init__(self, eps=1e-6):
+     def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
          super().__init__()
-         self.weight = torch.nn.Parameter(torch.ones(3) / 3)
-         self.bias = torch.nn.Parameter(torch.zeros(1))
+         self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
+         self.bias = torch.nn.Parameter(torch.zeros(1, dtype=dtype))
          self.eps = eps
 
      def forward(
@@ -16,3 +17,16 @@ class PolyNorm(nn.Module):
          x: torch.Tensor,
      ):
          return PolyNormFunction.apply(x, self.weight, self.bias, self.eps)
+
+
+ class RMSNorm(nn.Module):
+     def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
+         super().__init__()
+         self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))
+         self.eps = eps
+
+     def forward(
+         self,
+         x: torch.Tensor,
+     ):
+         return RMSNormFunction.apply(x, self.weight, self.eps)
build/torch27-cxx11-rocm63-x86_64-linux/activation/rms_norm.py ADDED
@@ -0,0 +1,34 @@
+ import torch
+
+ from ._ops import ops
+
+
+ # Inherit from Function
+ class RMSNormFunction(torch.autograd.Function):
+     # Note that forward, setup_context, and backward are @staticmethods
+     @staticmethod
+     def forward(input, weight, eps):
+         output = torch.empty_like(input)
+         ops.rms_norm(output, input, weight, eps)
+         return output
+
+     @staticmethod
+     # inputs is a Tuple of all of the inputs passed to forward.
+     # output is the output of the forward().
+     def setup_context(ctx, inputs, output):
+         input, weight, eps = inputs
+         ctx.save_for_backward(input, weight)
+         ctx.eps = eps
+
+     # This function has only a single output, so it gets only one gradient
+     @staticmethod
+     def backward(ctx, output_grad):
+         input, weight = ctx.saved_tensors
+         eps = ctx.eps
+
+         input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
+         weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
+
+         ops.rms_norm_backward(input_grad, weight_grad, output_grad, input, weight, eps)
+
+         return input_grad, weight_grad, None
tests/conftest.py CHANGED
@@ -4,7 +4,7 @@ import numpy as np
  import plotly.graph_objects as go
  import pytest
 
- from .kernels.test_perf import PERF_RESULTS, PerfResult
+ from .kernels.test_poly_norm_perf import PERF_RESULTS, PerfResult
 
  logger = logging.getLogger(__name__)
  DO_PLOT = False
tests/kernels/{test_activation.py → test_poly_norm.py} RENAMED
File without changes
tests/kernels/{test_perf.py → test_poly_norm_perf.py} RENAMED
@@ -6,7 +6,7 @@ import torch
 
  import activation
 
- from .test_activation import poly_norm
+ from .test_poly_norm import poly_norm
  from .utils import assert_close
 
  CASES = [
tests/kernels/test_rms_norm.py ADDED
@@ -0,0 +1,72 @@
+ import random
+
+ import pytest
+ import torch
+
+ import activation
+
+ from .utils import assert_close, opcheck
+
+ DTYPES = [torch.float, torch.bfloat16, torch.half]
+ # NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
+ # D = [512, 13824]  # Arbitrary values for testing
+ NUM_TOKENS = [7, 13]  # Arbitrary values for testing
+ D = [513]  # Arbitrary values for testing
+ SEEDS = [0]
+ CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+
+
+ @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+ @pytest.mark.parametrize("d", D)
+ @pytest.mark.parametrize("dtype", DTYPES)
+ @pytest.mark.parametrize("seed", SEEDS)
+ @pytest.mark.parametrize("device", CUDA_DEVICES)
+ def test_rms_norm(
+     num_tokens: int,
+     d: int,
+     dtype: torch.dtype,
+     seed: int,
+     device: str,
+ ) -> None:
+     random.seed(seed)
+     torch.manual_seed(seed)
+     torch.set_default_device(device)
+
+     x = torch.randn(num_tokens, d, dtype=dtype, requires_grad=True)
+     weight = torch.randn(d, dtype=dtype, requires_grad=True)
+     eps = 1e-05
+
+     x.retain_grad()
+     weight.retain_grad()
+     # To separate gradient computation, clone the inputs
+
+     x_ref = x.detach().clone().requires_grad_(True)
+     weight_ref = weight.detach().clone().requires_grad_(True)
+
+     torch_layer = torch.nn.RMSNorm(d, eps=eps, dtype=dtype)
+     torch_layer.weight = torch.nn.Parameter(weight_ref)
+
+     op = activation.ops.rms_norm
+     fn = activation.rms_norm
+     layer = activation.layers.RMSNorm(d, eps=eps, dtype=dtype)
+     layer.weight = torch.nn.Parameter(weight)
+
+     out = torch.empty(x.shape, dtype=x.dtype, device=x.device)
+     opcheck(op, (out, x, weight, eps))
+
+     out = fn(x, weight, eps)
+     mod_out = layer(x)
+     ref_out = torch_layer(x_ref)
+
+     assert_close(out, ref_out)
+     assert_close(mod_out, out, atol=0.0, rtol=0.0)
+
+     # test backward pass
+     out_grad = torch.randn_like(out)
+     out_grad = out_grad / out_grad.norm()
+
+     ref_out.backward(out_grad)
+     mod_out.backward(out_grad)
+
+     assert_close(x.grad, x_ref.grad)
+     assert_close(layer.weight.grad, torch_layer.weight.grad, rtol=0.05)
torch-ext/activation/__init__.py CHANGED
@@ -3,6 +3,7 @@ import torch
  from . import layers
  from ._ops import ops
  from .poly_norm import PolyNormFunction
+ from .rms_norm import RMSNormFunction
 
 
  def poly_norm(
@@ -14,6 +15,14 @@ def poly_norm(
      return PolyNormFunction.apply(x, weight, bias, eps)
 
 
+ def rms_norm(
+     x: torch.Tensor,
+     weight: torch.Tensor,
+     eps: float = 1e-6,
+ ) -> None:
+     return RMSNormFunction.apply(x, weight, eps)
+
+
  __all__ = [
      "poly_norm",
      "layers",
torch-ext/activation/layers.py CHANGED
@@ -2,13 +2,14 @@ import torch
  import torch.nn as nn
 
  from .poly_norm import PolyNormFunction
+ from .rms_norm import RMSNormFunction
 
 
  class PolyNorm(nn.Module):
-     def __init__(self, eps=1e-6):
+     def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
          super().__init__()
-         self.weight = torch.nn.Parameter(torch.ones(3) / 3)
-         self.bias = torch.nn.Parameter(torch.zeros(1))
+         self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
+         self.bias = torch.nn.Parameter(torch.zeros(1, dtype=dtype))
          self.eps = eps
 
      def forward(
@@ -16,3 +17,16 @@ class PolyNorm(nn.Module):
          x: torch.Tensor,
      ):
          return PolyNormFunction.apply(x, self.weight, self.bias, self.eps)
+
+
+ class RMSNorm(nn.Module):
+     def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
+         super().__init__()
+         self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))
+         self.eps = eps
+
+     def forward(
+         self,
+         x: torch.Tensor,
+     ):
+         return RMSNormFunction.apply(x, self.weight, self.eps)
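And a matching sketch for the module interface added above (sizes and dtype are again just examples):

```python
import torch
from activation.layers import RMSNorm

layer = RMSNorm(512, eps=1e-6, dtype=torch.bfloat16).to("cuda")
x = torch.randn(8, 512, device="cuda", dtype=torch.bfloat16)
y = layer(x)  # forwards to RMSNormFunction.apply(x, layer.weight, layer.eps)
```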
torch-ext/activation/rms_norm.py ADDED
@@ -0,0 +1,34 @@
+ import torch
+
+ from ._ops import ops
+
+
+ # Inherit from Function
+ class RMSNormFunction(torch.autograd.Function):
+     # Note that forward, setup_context, and backward are @staticmethods
+     @staticmethod
+     def forward(input, weight, eps):
+         output = torch.empty_like(input)
+         ops.rms_norm(output, input, weight, eps)
+         return output
+
+     @staticmethod
+     # inputs is a Tuple of all of the inputs passed to forward.
+     # output is the output of the forward().
+     def setup_context(ctx, inputs, output):
+         input, weight, eps = inputs
+         ctx.save_for_backward(input, weight)
+         ctx.eps = eps
+
+     # This function has only a single output, so it gets only one gradient
+     @staticmethod
+     def backward(ctx, output_grad):
+         input, weight = ctx.saved_tensors
+         eps = ctx.eps
+
+         input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
+         weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
+
+         ops.rms_norm_backward(input_grad, weight_grad, output_grad, input, weight, eps)
+
+         return input_grad, weight_grad, None
torch-ext/torch_binding.cpp CHANGED
@@ -9,6 +9,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
    ops.def("poly_norm_backward(Tensor! input_grad, Tensor! weight_grad, Tensor! bias_grad, Tensor output_grad, Tensor input, Tensor weight, float eps) -> ()");
    ops.impl("poly_norm", torch::kCUDA, &poly_norm);
    ops.impl("poly_norm_backward", torch::kCUDA, &poly_norm_backward);
+
+   // Activation ops
+   ops.def("rms_norm(Tensor! out, Tensor input, Tensor weight, float eps) -> ()");
+   ops.def("rms_norm_backward(Tensor! input_grad, Tensor! weight_grad, Tensor output_grad, Tensor input, Tensor weight, float eps) -> ()");
+   ops.impl("rms_norm", torch::kCUDA, &rms_norm);
+   ops.impl("rms_norm_backward", torch::kCUDA, &rms_norm_backward);
+
+
  }
 
  REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
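As registered above, `rms_norm` follows an out-parameter convention: it writes into `out` and returns nothing. A small sketch of calling the raw op through the handle the package re-exports (tensor sizes here are illustrative):

```python
import torch
import activation

x = torch.randn(4, 128, device="cuda", dtype=torch.float16)
w = torch.ones(128, device="cuda", dtype=torch.float16)
out = torch.empty_like(x)

# Schema: "rms_norm(Tensor! out, Tensor input, Tensor weight, float eps) -> ()"
activation.ops.rms_norm(out, x, w, 1e-6)
```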
torch-ext/torch_binding.h CHANGED
@@ -4,3 +4,6 @@
 
  void poly_norm(torch::Tensor &out, const torch::Tensor &input, const torch::Tensor &weights, const torch::Tensor &bias, double eps);
  void poly_norm_backward(torch::Tensor& input_grad, torch::Tensor& weight_grad, torch::Tensor& bias_grad, const torch::Tensor& output_grad, const torch::Tensor& input, const torch::Tensor& weight, double eps);
+
+ void rms_norm(torch::Tensor &out, const torch::Tensor &input, const torch::Tensor &weights, double eps);
+ void rms_norm_backward(torch::Tensor& input_grad, torch::Tensor& weight_grad, const torch::Tensor& output_grad, const torch::Tensor& input, const torch::Tensor& weight, double eps);