Kernels

Commit 44e9845 · 1 Parent(s): 7a7d761 · committed by iamwyldecat

feat(poly-norm): Add PolyNorm
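For reference, the operation added by this commit (as implemented in activation/activation_kernels.cu and mirrored by the Python reference in tests/kernels/test_activation.py) is, per token of width d with element-wise powers:

```latex
\mathrm{PolyNorm}(x) = w_2\,\frac{x}{\mathrm{rms}(x)}
                     + w_1\,\frac{x^2}{\mathrm{rms}(x^2)}
                     + w_0\,\frac{x^3}{\mathrm{rms}(x^3)} + b,
\qquad
\mathrm{rms}(v) = \sqrt{\tfrac{1}{d}\textstyle\sum_{i=1}^{d} v_i^2 + \varepsilon}
```

where w is the 3-element weight, b the scalar bias, and eps the stabilizing constant.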
.gitignore ADDED
@@ -0,0 +1,193 @@
+ # Created by https://www.toptal.com/developers/gitignore/api/vim,python
+ # Edit at https://www.toptal.com/developers/gitignore?templates=vim,python
+
+ ### Python ###
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # Distribution / packaging
+ .Python
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ ### Python Patch ###
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+ poetry.toml
+
+ # ruff
+ .ruff_cache/
+
+ # LSP config files
+ pyrightconfig.json
+
+ ### Vim ###
+ # Swap
+ [._]*.s[a-v][a-z]
+ !*.svg # comment out if you don't need vector files
+ [._]*.sw[a-p]
+ [._]s[a-rt-v][a-z]
+ [._]ss[a-gi-z]
+ [._]sw[a-p]
+
+ # Session
+ Session.vim
+ Sessionx.vim
+
+ # Temporary
+ .netrwhist
+ *~
+ # Auto-generated tag files
+ tags
+ # Persistent undo
+ [._]*.un~
+
+ # End of https://www.toptal.com/developers/gitignore/api/vim,python
README.md ADDED
@@ -0,0 +1,4 @@
+ ---
+ tags:
+ - kernel
+ ---
activation/activation_kernels.cu ADDED
@@ -0,0 +1,267 @@
+ #include <ATen/cuda/CUDAContext.h>
+ #include <torch/all.h>
+ #include <c10/cuda/CUDAGuard.h>
+
+ #include <cmath>
+
+ #include "cuda_compat.h"
+ #include "dispatch_utils.h"
+ #include "assert_utils.h"
+ #include "atomic_utils.h"
+
+ namespace motif {
+
+ template <typename acc_t, int BLOCK_SIZE>
+ __device__ acc_t _block_reduce_sum(volatile acc_t* shared, const float val, const int d) {
+   // TODO: Optimize with warp-level primitives
+   shared[threadIdx.x] = threadIdx.x < d ? val : 0.0f;
+   __syncthreads();
+   for (int stride = BLOCK_SIZE / 2; stride > 0; stride /= 2) {
+     if (threadIdx.x < stride) {
+       shared[threadIdx.x] += shared[threadIdx.x + stride];
+     }
+     __syncthreads();
+   }
+
+   return shared[0];
+ }
+
+ template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
+ __global__ void poly_norm_kernel(
+     scalar_t* __restrict__ out,          // [..., d]
+     const scalar_t* __restrict__ input,  // [..., d]
+     const scalar_t* __restrict__ weight, // [3]
+     const scalar_t* __restrict__ bias,   // [1]
+     const float eps,
+     const int d
+ ) {
+   const int64_t token_idx = blockIdx.x;
+
+   acc_t sum = 0.0f;
+   acc_t sum_square = 0.0f;
+   acc_t sum_cube = 0.0f;
+
+   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+     acc_t x = input[token_idx * d + idx];
+     sum += pow(x, 2.0f);
+     sum_square += pow(x, 4.0f);
+     sum_cube += pow(x, 6.0f);
+   }
+
+   __shared__ acc_t shared[BLOCK_SIZE];
+
+   acc_t mean = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum, d) / d;
+   acc_t mean_square = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_square, d) / d;
+   acc_t mean_cube = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_cube, d) / d;
+
+   acc_t w0 = weight[0];
+   acc_t w1 = weight[1];
+   acc_t w2 = weight[2];
+   acc_t b = bias[0];
+
+   acc_t divisor = sqrt(mean + eps);
+   acc_t divisor_square = sqrt(mean_square + eps);
+   acc_t divisor_cube = sqrt(mean_cube + eps);
+
+   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+     acc_t x = input[token_idx * d + idx];
+     acc_t x_square = pow(x, 2.0f);
+     acc_t x_cube = pow(x, 3.0f);
+     out[token_idx * d + idx] = w2 * x / divisor +
+                                w1 * x_square / divisor_square +
+                                w0 * x_cube / divisor_cube + b;
+   }
+ }
+
+ template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
+ __global__ void poly_norm_backward_kernel(
+     scalar_t* __restrict__ input_grad,        // [..., d]
+     scalar_t* __restrict__ weight_grad,       // [3]
+     scalar_t* __restrict__ bias_grad,         // [1]
+     const scalar_t* __restrict__ output_grad, // [..., d]
+     const scalar_t* __restrict__ input,       // [..., d]
+     const scalar_t* __restrict__ weight,      // [3]
+     const float eps,
+     const int d
+ ) {
+   const int64_t token_idx = blockIdx.x;
+
+   acc_t w0 = weight[0];
+   acc_t w1 = weight[1];
+   acc_t w2 = weight[2];
+
+   acc_t sum_2 = 0.0f;
+   acc_t sum_4 = 0.0f;
+   acc_t sum_6 = 0.0f;
+
+   acc_t sum_dx_1 = 0.0f;
+   acc_t sum_dx_2 = 0.0f;
+   acc_t sum_dx_3 = 0.0f;
+
+   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+     acc_t dy = output_grad[token_idx * d + idx];
+
+     acc_t x_1 = input[token_idx * d + idx];
+     acc_t x_2 = x_1 * x_1;
+     acc_t x_3 = x_2 * x_1;
+     acc_t x_4 = x_3 * x_1;
+     acc_t x_6 = x_4 * x_2;
+
+     sum_2 += x_2;
+     sum_4 += x_4;
+     sum_6 += x_6;
+
+     sum_dx_1 += w2 * dy * x_1;
+     sum_dx_2 += w1 * dy * x_2;
+     sum_dx_3 += w0 * dy * x_3;
+   }
+
+   __shared__ acc_t shared[BLOCK_SIZE];
+
+   acc_t mean_2 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_2, d) / d + eps;
+   acc_t mean_4 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_4, d) / d + eps;
+   acc_t mean_6 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_6, d) / d + eps;
+
+   sum_dx_1 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dx_1, d);
+   sum_dx_2 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dx_2, d);
+   sum_dx_3 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dx_3, d);
+
+   acc_t sq_mean_2 = rsqrtf(mean_2) * mean_2;
+   acc_t sq_mean_4 = rsqrtf(mean_4) * mean_4;
+   acc_t sq_mean_6 = rsqrtf(mean_6) * mean_6;
+
+   acc_t denom_2 = mean_2 * sq_mean_2;
+   acc_t denom_4 = mean_4 * sq_mean_4;
+   acc_t denom_6 = mean_6 * sq_mean_6;
+
+   acc_t sum_dw0 = 0;
+   acc_t sum_dw1 = 0;
+   acc_t sum_dw2 = 0;
+   acc_t sum_db = 0;
+
+   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+     acc_t dy = output_grad[token_idx * d + idx];
+     acc_t x_1 = input[token_idx * d + idx];
+     acc_t x_2 = x_1 * x_1;
+     acc_t x_3 = x_2 * x_1;
+
+     acc_t _dx_1 = w2 * dy;
+     acc_t _dx_2 = w1 * dy;
+     acc_t _dx_3 = w0 * dy;
+
+     acc_t dx_3 =
+         3 * x_2 * (_dx_3 / sq_mean_6 - x_3 * sum_dx_3 / (d * denom_6));
+     acc_t dx_2 =
+         2 * x_1 * (_dx_2 / sq_mean_4 - x_2 * sum_dx_2 / (d * denom_4));
+     acc_t dx_1 =
+         _dx_1 / sq_mean_2 - x_1 * sum_dx_1 / (d * denom_2);
+
+     if (input_grad) {
+       input_grad[token_idx * d + idx] = dx_1 + dx_2 + dx_3;
+     }
+
+     sum_dw0 += dy * (x_3 / sq_mean_6);
+     sum_dw1 += dy * (x_2 / sq_mean_4);
+     sum_dw2 += dy * (x_1 / sq_mean_2);
+     sum_db += dy;
+   }
+
+   if (weight_grad) {
+     sum_dw0 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dw0, d);
+     sum_dw1 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dw1, d);
+     sum_dw2 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dw2, d);
+
+     if (threadIdx.x == 0) {
+       atomic_add(&weight_grad[0], sum_dw0);
+       atomic_add(&weight_grad[1], sum_dw1);
+       atomic_add(&weight_grad[2], sum_dw2);
+     }
+   }
+
+   if (bias_grad) {
+     sum_db = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_db, d);
+     if (threadIdx.x == 0) {
+       atomic_add(&bias_grad[0], sum_db);
+     }
+   }
+ }
+
+ } // namespace motif
+
+
+ void poly_norm(torch::Tensor& out,    // [..., d]
+                torch::Tensor& input,  // [..., d]
+                torch::Tensor& weight, // [3]
+                torch::Tensor& bias,   // [1]
+                double eps)
+ {
+   AssertTensorShapeEqual(input, out, "input", "out");
+   AssertTensorNotNull(weight, "weight");
+   AssertTensorNotNull(bias, "bias");
+   // TODO shape check
+
+   constexpr int BLOCK_SIZE = 256;
+
+   int d = input.size(-1);
+   int64_t num_tokens = input.numel() / input.size(-1);
+   dim3 grid(num_tokens);
+   dim3 block(BLOCK_SIZE);
+
+   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+   MOTIF_DISPATCH_FLOATING_TYPES(
+     input.scalar_type(), "poly_norm_kernel", [&] {
+       motif::poly_norm_kernel<scalar_t, float, BLOCK_SIZE>
+         <<<grid, block, 0, stream>>>(
+           out.data_ptr<scalar_t>(),
+           input.data_ptr<scalar_t>(),
+           weight.data_ptr<scalar_t>(),
+           bias.data_ptr<scalar_t>(), eps, d);
+     }
+   );
+ }
+
+ void poly_norm_backward(
+     torch::Tensor& input_grad,  // [..., d]
+     torch::Tensor& weight_grad, // [3]
+     torch::Tensor& bias_grad,   // [1]
+     torch::Tensor& output_grad, // [..., d]
+     torch::Tensor& input,       // [..., d]
+     torch::Tensor& weight,      // [3]
+     double eps) {
+   AssertTensorShapeEqual(input, input_grad, "input", "input_grad");
+   AssertTensorShapeEqual(input, output_grad, "input", "output_grad");
+   AssertTensorNotNull(weight, "weight");
+   // TODO shape check
+   // weight_grad, bias_grad and input_grad can be nullable
+
+   constexpr int BLOCK_SIZE = 256;
+
+   int d = input.size(-1);
+   int64_t num_tokens = input.numel() / input.size(-1);
+   dim3 grid(num_tokens);
+   dim3 block(BLOCK_SIZE);
+
+   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+   if (weight_grad.defined())
+     cudaMemsetAsync(weight_grad.data_ptr(), 0, weight_grad.numel() * weight_grad.element_size(), stream);
+   if (bias_grad.defined()) {
+     cudaMemsetAsync(bias_grad.data_ptr(), 0, bias_grad.numel() * bias_grad.element_size(), stream);
+   }
+
+   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+   MOTIF_DISPATCH_FLOATING_TYPES(
+     input.scalar_type(), "poly_norm_backward_kernel", [&] {
+       motif::poly_norm_backward_kernel<scalar_t, float, BLOCK_SIZE>
+         <<<grid, block, 0, stream>>>(
+           input_grad.data_ptr<scalar_t>(),
+           weight_grad.data_ptr<scalar_t>(),
+           bias_grad.data_ptr<scalar_t>(),
+           output_grad.data_ptr<scalar_t>(),
+           input.data_ptr<scalar_t>(),
+           weight.data_ptr<scalar_t>(),
+           eps, d);
+     }
+   );
+ }
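The kernel above is easiest to check against a plain PyTorch transcription. The sketch below is not part of the commit; it only mirrors the per-token math of poly_norm_kernel (the test suite ships an equivalent reference of its own):

```python
# Reference sketch (not part of the commit): the per-token forward pass that
# poly_norm_kernel computes, written in plain PyTorch for verification.
# `weight` has 3 elements and `bias` 1, as in the kernel.
import torch


def poly_norm_reference(x: torch.Tensor, weight: torch.Tensor,
                        bias: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Means of x^2, x^4, x^6 over the last dimension (the kernel's block reductions).
    mean = (x ** 2).mean(-1, keepdim=True)
    mean_square = (x ** 4).mean(-1, keepdim=True)
    mean_cube = (x ** 6).mean(-1, keepdim=True)

    # Divisors: sqrt(mean + eps), i.e. the RMS of x, x^2, x^3 respectively.
    divisor = torch.sqrt(mean + eps)
    divisor_square = torch.sqrt(mean_square + eps)
    divisor_cube = torch.sqrt(mean_cube + eps)

    return (weight[2] * x / divisor
            + weight[1] * x ** 2 / divisor_square
            + weight[0] * x ** 3 / divisor_cube
            + bias)
```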
activation/assert_utils.h ADDED
@@ -0,0 +1,22 @@
+ #pragma once
+
+ #include <ATen/cuda/CUDAContext.h>
+ #include <torch/all.h>
+
+ inline void AssertTensorNotNull(const torch::Tensor &tensor, const std::string &name) {
+   TORCH_INTERNAL_ASSERT(tensor.defined(), name + " tensor should not be null.");
+ }
+
+ inline void AssertTensorShapeEqual(const torch::Tensor &tensor_a, const torch::Tensor &tensor_b,
+                                    const std::string &name_a, const std::string &name_b) {
+
+   AssertTensorNotNull(tensor_a, name_a);
+   AssertTensorNotNull(tensor_b, name_b);
+
+   auto tensor_shape_a = tensor_a.sizes();
+   auto tensor_shape_b = tensor_b.sizes();
+
+   TORCH_INTERNAL_ASSERT(tensor_shape_a.equals(tensor_shape_b),
+       name_a, " tensor shape should be equal to ", name_b,
+       " tensor shape. (actual: ", tensor_shape_a, ", expected: ", tensor_shape_b, ")");
+ }
activation/atomic_utils.h ADDED
@@ -0,0 +1,73 @@
+ #pragma once
+
+ #include <cuda.h>
+ #include <c10/util/BFloat16.h>
+ #include <c10/util/Half.h>
+
+ namespace motif {
+ template<typename scalar_t, typename acc_t>
+ __device__ inline void atomic_add(scalar_t* address, acc_t value) {
+   // TODO: change assert to a static_assert if possible
+   assert(false && "Unsupported type for atomic_add");
+ }
+
+ template<>
+ __device__ inline void atomic_add<float, float>(float* address, float value) {
+   atomicAdd(address, value);
+ }
+
+ template<>
+ __device__ inline void atomic_add<double, double>(double* address, double value) {
+   atomicAdd(address, value);
+ }
+
+ template<>
+ __device__ inline void atomic_add<c10::BFloat16, float>(c10::BFloat16* _address, float value) {
+   volatile c10::BFloat16* address = const_cast<volatile c10::BFloat16*>(_address);
+
+   size_t offset = (size_t)address & 0x2;
+   volatile uint16_t* address_as_short =
+       reinterpret_cast<volatile uint16_t*>(reinterpret_cast<volatile char*>(address));
+   volatile uint32_t* address_as_uint =
+       reinterpret_cast<volatile uint*>(reinterpret_cast<volatile char*>(address) - offset);
+   bool is_32bit_aligned = offset == 0;
+
+   uint32_t current = address_as_uint[0];
+   uint32_t expected;
+
+   do {
+     expected = current;
+     c10::BFloat16 current_bf16(address_as_short[0], c10::BFloat16::from_bits());
+     c10::BFloat16 next_bf16 = current_bf16 + value;
+     uint32_t next = is_32bit_aligned ? (current & 0xffff0000) | next_bf16.x
+                                      : (current & 0x0000ffff) | (next_bf16.x << 16);
+     current = atomicCAS(const_cast<uint32_t*>(address_as_uint), expected, next);
+   } while (current != expected);
+ }
+
+ template<>
+ __device__ inline void atomic_add<c10::Half, float>(c10::Half* _address, float value) {
+   volatile c10::Half* address = const_cast<volatile c10::Half*>(_address);
+
+   size_t offset = (size_t)address & 0x2;
+   volatile uint16_t* address_as_short =
+       reinterpret_cast<volatile uint16_t*>(reinterpret_cast<volatile char*>(address));
+   volatile uint32_t* address_as_uint =
+       reinterpret_cast<volatile uint*>(reinterpret_cast<volatile char*>(address) - offset);
+   bool is_32bit_aligned = offset == 0;
+
+   uint32_t current = address_as_uint[0];
+   uint32_t expected;
+
+   do {
+     expected = current;
+     c10::Half current_half(address_as_short[0], c10::Half::from_bits());
+     c10::Half next_half = current_half + value;
+     uint32_t next = is_32bit_aligned ? (current & 0xffff0000) | next_half.x
+                                      : (current & 0x0000ffff) | (next_half.x << 16);
+     current = atomicCAS(const_cast<uint32_t*>(address_as_uint), expected, next);
+   } while (current != expected);
+
+ }
+
+ } // namespace motif
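The 16-bit specializations above emulate a bf16/fp16 atomicAdd with a compare-and-swap loop on the enclosing aligned 32-bit word, rewriting only the half that holds the target value. The Python sketch below is illustrative only (it truncates rather than rounds when converting a float to bf16 bits) and demonstrates the same word-packing expression used in the CAS loop:

```python
# Illustrative sketch (not part of the commit) of the 16-bit-in-32-bit packing
# used by atomic_add<c10::BFloat16, float> above.
import struct


def bf16_bits_to_float(bits: int) -> float:
    # A bf16 value is the top 16 bits of an IEEE-754 float32.
    return struct.unpack("<f", struct.pack("<I", bits << 16))[0]


def float_to_bf16_bits(value: float) -> int:
    # Truncating conversion (the kernel uses c10's rounding conversion).
    return struct.unpack("<I", struct.pack("<f", value))[0] >> 16


def pack_half(word: int, new_bits: int, is_32bit_aligned: bool) -> int:
    # Mirrors: is_32bit_aligned ? (current & 0xffff0000) | next.x
    #                           : (current & 0x0000ffff) | (next.x << 16)
    if is_32bit_aligned:
        return (word & 0xFFFF0000) | new_bits
    return (word & 0x0000FFFF) | (new_bits << 16)


# Example: add 0.5 to the bf16 stored in the low half of a 32-bit word.
word = pack_half(0xDEAD0000, float_to_bf16_bits(1.0), is_32bit_aligned=True)
old = bf16_bits_to_float(word & 0xFFFF)
word = pack_half(word, float_to_bf16_bits(old + 0.5), is_32bit_aligned=True)
print(hex(word), bf16_bits_to_float(word & 0xFFFF))  # high half 0xDEAD is untouched
```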
activation/cuda_compat.h ADDED
@@ -0,0 +1,18 @@
+ #pragma once
+
+ #ifdef USE_ROCM
+   #include <hip/hip_runtime.h>
+ #endif
+
+ #ifndef USE_ROCM
+   #define WARP_SIZE 32
+ #else
+   #define WARP_SIZE warpSize
+ #endif
+
+ #ifndef USE_ROCM
+   #define VLLM_LDG(arg) __ldg(arg)
+ #else
+   #define VLLM_LDG(arg) *(arg)
+ #endif
+
activation/dispatch_utils.h ADDED
@@ -0,0 +1,15 @@
+ /*
+  * Adapted from
+  * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h
+  */
+ #pragma once
+
+ #include <torch/all.h>
+
+ #define MOTIF_DISPATCH_CASE_FLOATING_TYPES(...)           \
+   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)    \
+   AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)     \
+   AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
+
+ #define MOTIF_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)    \
+   AT_DISPATCH_SWITCH(TYPE, NAME, MOTIF_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
build.toml ADDED
@@ -0,0 +1,20 @@
+ [general]
+ name = "activation"
+
+ [torch]
+ src = [
+   "torch-ext/torch_binding.cpp",
+   "torch-ext/torch_binding.h"
+ ]
+
+ [kernel.activation]
+ language = "cuda-hipify"
+ rocm-archs = [ "gfx90a" ]
+ src = [
+   "activation/activation_kernels.cu",
+   "activation/cuda_compat.h",
+   "activation/dispatch_utils.h",
+   "activation/assert_utils.h",
+   "activation/atomic_utils.h",
+ ]
+ depends = [ "torch" ]
flake.lock ADDED
@@ -0,0 +1,169 @@
+ {
+   "nodes": {
+     "flake-compat": {
+       "locked": {
+         "lastModified": 1733328505,
+         "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=",
+         "owner": "edolstra",
+         "repo": "flake-compat",
+         "rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec",
+         "type": "github"
+       },
+       "original": {
+         "owner": "edolstra",
+         "repo": "flake-compat",
+         "type": "github"
+       }
+     },
+     "flake-compat_2": {
+       "locked": {
+         "lastModified": 1733328505,
+         "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=",
+         "owner": "edolstra",
+         "repo": "flake-compat",
+         "rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec",
+         "type": "github"
+       },
+       "original": {
+         "owner": "edolstra",
+         "repo": "flake-compat",
+         "type": "github"
+       }
+     },
+     "flake-utils": {
+       "inputs": {
+         "systems": "systems"
+       },
+       "locked": {
+         "lastModified": 1731533236,
+         "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+         "owner": "numtide",
+         "repo": "flake-utils",
+         "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+         "type": "github"
+       },
+       "original": {
+         "owner": "numtide",
+         "repo": "flake-utils",
+         "type": "github"
+       }
+     },
+     "flake-utils_2": {
+       "inputs": {
+         "systems": "systems_2"
+       },
+       "locked": {
+         "lastModified": 1731533236,
+         "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+         "owner": "numtide",
+         "repo": "flake-utils",
+         "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+         "type": "github"
+       },
+       "original": {
+         "owner": "numtide",
+         "repo": "flake-utils",
+         "type": "github"
+       }
+     },
+     "hf-nix": {
+       "inputs": {
+         "flake-compat": "flake-compat_2",
+         "flake-utils": "flake-utils_2",
+         "nixpkgs": "nixpkgs"
+       },
+       "locked": {
+         "lastModified": 1747919133,
+         "narHash": "sha256-VvF1naQOvv7yulQ5/cDiaxkNxlh1Y84QMZnderv1szk=",
+         "owner": "huggingface",
+         "repo": "hf-nix",
+         "rev": "9c71e026d6c7c8588ef85a5f7c77f57d598e038c",
+         "type": "github"
+       },
+       "original": {
+         "owner": "huggingface",
+         "repo": "hf-nix",
+         "type": "github"
+       }
+     },
+     "kernel-builder": {
+       "inputs": {
+         "flake-compat": "flake-compat",
+         "flake-utils": "flake-utils",
+         "hf-nix": "hf-nix",
+         "nixpkgs": [
+           "kernel-builder",
+           "hf-nix",
+           "nixpkgs"
+         ]
+       },
+       "locked": {
+         "lastModified": 1747925434,
+         "narHash": "sha256-yjtdRMyPIFcSF1PkDwU5Rl0bmIpJ5joad5VOt/+1ZLY=",
+         "ref": "refs/heads/main",
+         "rev": "fd0376ff1fec423c91589075fb9042767558c635",
+         "shallow": true,
+         "type": "git",
+         "url": "file:///home/nixuser/kernel-builder"
+       },
+       "original": {
+         "shallow": true,
+         "type": "git",
+         "url": "file:///home/nixuser/kernel-builder"
+       }
+     },
+     "nixpkgs": {
+       "locked": {
+         "lastModified": 1747820358,
+         "narHash": "sha256-fTqsZsUX6M3yeEvgyQvXcbGmT2CaRVyVwsi8eK29Oj4=",
+         "owner": "danieldk",
+         "repo": "nixpkgs",
+         "rev": "d3c1681180717528068082103bf323147de6ab0b",
+         "type": "github"
+       },
+       "original": {
+         "owner": "danieldk",
+         "ref": "cudatoolkit-12.9-kernel-builder",
+         "repo": "nixpkgs",
+         "type": "github"
+       }
+     },
+     "root": {
+       "inputs": {
+         "kernel-builder": "kernel-builder"
+       }
+     },
+     "systems": {
+       "locked": {
+         "lastModified": 1681028828,
+         "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+         "owner": "nix-systems",
+         "repo": "default",
+         "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+         "type": "github"
+       },
+       "original": {
+         "owner": "nix-systems",
+         "repo": "default",
+         "type": "github"
+       }
+     },
+     "systems_2": {
+       "locked": {
+         "lastModified": 1681028828,
+         "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+         "owner": "nix-systems",
+         "repo": "default",
+         "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+         "type": "github"
+       },
+       "original": {
+         "owner": "nix-systems",
+         "repo": "default",
+         "type": "github"
+       }
+     }
+   },
+   "root": "root",
+   "version": 7
+ }
flake.nix ADDED
@@ -0,0 +1,13 @@
+ {
+   description = "Flake for Torch kernel extension";
+
+   inputs = {
+     kernel-builder.url = "/home/nixuser/kernel-builder";
+   };
+
+   outputs = { self, kernel-builder, }:
+     kernel-builder.lib.genFlakeOutputs {
+       path = ./.;
+       rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
+     };
+ }
tests/__init__.py ADDED
File without changes
tests/kernels/__init__.py ADDED
File without changes
tests/kernels/allclose_default.py ADDED
@@ -0,0 +1,14 @@
+ import torch
+
+ # Reference default values of atol and rtol are from
+ # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67
+ default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5}
+ default_rtol = {torch.float16: 1e-3, torch.bfloat16: 1.6e-2, torch.float: 1.3e-6}
+
+
+ def get_default_atol(output) -> float:
+     return default_atol[output.dtype]
+
+
+ def get_default_rtol(output) -> float:
+     return default_rtol[output.dtype]
tests/kernels/test_activation.py ADDED
@@ -0,0 +1,91 @@
+ import random
+
+ import pytest
+ import torch
+
+ import activation
+
+ from .utils import assert_close, opcheck
+
+ DTYPES = [torch.float, torch.bfloat16, torch.half]
+ # NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
+ # D = [512, 13824]  # Arbitrary values for testing
+ NUM_TOKENS = [7, 13]  # Arbitrary values for testing
+ D = [513]  # Arbitrary values for testing
+ SEEDS = [0]
+ CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+
+
+ def norm(x, eps: float) -> torch.Tensor:
+     return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
+
+
+ def poly_norm(
+     x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, eps: float
+ ) -> torch.Tensor:
+     x = x.float()
+     return (
+         weight[0] * norm(x**3, eps)
+         + weight[1] * norm(x**2, eps)
+         + weight[2] * norm(x, eps)
+         + bias
+     ).to(weight.dtype)
+
+
+ @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+ @pytest.mark.parametrize("d", D)
+ @pytest.mark.parametrize("dtype", DTYPES)
+ @pytest.mark.parametrize("seed", SEEDS)
+ @pytest.mark.parametrize("device", CUDA_DEVICES)
+ def test_poly_norm(
+     num_tokens: int,
+     d: int,
+     dtype: torch.dtype,
+     seed: int,
+     device: str,
+ ) -> None:
+     random.seed(seed)
+     torch.manual_seed(seed)
+     torch.set_default_device(device)
+
+     x = torch.randn(num_tokens, d, dtype=dtype, requires_grad=True)
+     weight = torch.randn(3, dtype=dtype, requires_grad=True)
+     bias = torch.randn(1, dtype=dtype, requires_grad=True)
+     eps = 1e-05
+
+     x.retain_grad()
+     weight.retain_grad()
+     bias.retain_grad()
+     # To separate gradient computation, clone the inputs
+
+     x_ref = x.detach().clone().requires_grad_(True)
+     weight_ref = weight.detach().clone().requires_grad_(True)
+     bias_ref = bias.detach().clone().requires_grad_(True)
+
+     torch_fn = poly_norm
+     op = activation.ops.poly_norm
+     fn = activation.poly_norm
+     layer = activation.layers.PolyNorm(eps)
+     layer.weight = torch.nn.Parameter(weight)
+     layer.bias = torch.nn.Parameter(bias)
+
+     out = torch.empty(x.shape, dtype=x.dtype, device=x.device)
+     opcheck(op, (out, x, weight, bias, eps))
+
+     out = fn(x, weight, bias, eps)
+     mod_out = layer(x)
+     ref_out = torch_fn(x_ref, weight_ref, bias_ref, eps)
+
+     assert_close(out, ref_out)
+     assert_close(mod_out, out, atol=0.0, rtol=0.0)
+
+     # test backward pass
+     out_grad = torch.randn_like(out)
+     out_grad = out_grad / out_grad.norm()
+
+     ref_out.backward(out_grad)
+     mod_out.backward(out_grad)
+
+     assert_close(x.grad, x_ref.grad)
+     assert_close(layer.bias.grad, bias_ref.grad, rtol=0.05)
+     assert_close(layer.weight.grad, weight_ref.grad, rtol=0.05)
tests/kernels/utils.py ADDED
@@ -0,0 +1,82 @@
+ """Kernel test utils"""
+
+ import unittest
+ from typing import Any, Dict, Optional, Sequence, Tuple, Union
+
+ import torch
+ from torch._prims_common import TensorLikeType
+
+ from .allclose_default import get_default_atol, get_default_rtol
+
+ # For now, disable "test_aot_dispatch_dynamic" since there are some
+ # bugs related to this test in PyTorch 2.4.
+ DEFAULT_OPCHECK_TEST_UTILS: Tuple[str, ...] = (
+     "test_schema",
+     "test_autograd_registration",
+     "test_faketensor",
+ )
+
+ ALL_OPCHECK_TEST_UTILS: Tuple[str, ...] = (
+     "test_schema",
+     "test_autograd_registration",
+     "test_faketensor",
+     "test_aot_dispatch_dynamic",
+ )
+
+
+ def assert_close(
+     a: TensorLikeType,
+     b: TensorLikeType,
+     atol: float | None = None,
+     rtol: float | None = None,
+ ) -> None:
+     atol = atol if atol is not None else get_default_atol(a)
+     rtol = rtol if rtol is not None else get_default_rtol(a)
+     torch.testing.assert_close(a, b, atol=atol, rtol=rtol)
+
+
+ # Copied/modified from torch._refs.__init__.py
+ def fp8_allclose(
+     a: TensorLikeType,
+     b: TensorLikeType,
+     rtol: float = 1e-05,
+     atol: float = 1e-08,
+     equal_nan: bool = False,
+ ) -> bool:
+     """
+     Reference implementation of torch.allclose
+     """
+     torch._refs._check_close_args(name="torch.allclose", a=a, b=b, rtol=rtol, atol=atol)
+
+     return bool(
+         torch.all(
+             torch.isclose(
+                 a.double(), b.double(), rtol=rtol, atol=atol, equal_nan=equal_nan
+             )
+         ).item()
+     )
+
+
+ # A special version of op check that has a restricted default set of test_utils
+ # and a patched version of allclose that supports fp8 types.
+ def opcheck(
+     op: Union[
+         torch._ops.OpOverload,
+         torch._ops.OpOverloadPacket,
+         torch._library.custom_ops.CustomOpDef,
+     ],
+     args: Tuple[Any, ...],
+     kwargs: Optional[Dict[str, Any]] = None,
+     *,
+     test_utils: Union[str, Sequence[str]] = ALL_OPCHECK_TEST_UTILS,
+     raise_exception: bool = True,
+     cond: bool = True,
+ ) -> Dict[str, str]:
+     with unittest.mock.patch("torch.allclose", new=fp8_allclose):
+         return (
+             torch.library.opcheck(
+                 op, args, kwargs, test_utils=test_utils, raise_exception=raise_exception
+             )
+             if cond
+             else {}
+         )
tests/test.py ADDED
@@ -0,0 +1,38 @@
+ import activation
+ import torch
+
+
+ def norm(x, eps: float) -> torch.Tensor:
+     return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
+
+
+ def poly_norm(
+     x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, eps: float
+ ) -> torch.Tensor:
+     x = x.float()
+     return (
+         weight[0] * norm(x**3, eps)
+         + weight[1] * norm(x**2, eps)
+         + weight[2] * norm(x, eps)
+         + bias
+     ).to(weight.dtype)
+
+
+ dtype = torch.bfloat16
+ torch.set_default_device("cuda:0")
+ a = torch.randn(3, 3, dtype=dtype, requires_grad=True)
+ w = torch.randn(3, dtype=dtype, requires_grad=True)
+ b = torch.randn(1, dtype=dtype, requires_grad=True)
+
+ a.retain_grad()
+ w.retain_grad()
+ b.retain_grad()
+
+ out = activation.poly_norm(a, w, b, 1e-6)
+ # out = poly_norm(a, w, b, 1e-6)
+
+ out.backward(torch.ones_like(out))
+
+ print(a.grad)
+ print(w.grad)
+ print(b.grad)
torch-ext/activation/__init__.py ADDED
@@ -0,0 +1,21 @@
+ import torch
+
+ from . import layers
+ from ._ops import ops
+ from .poly_norm import PolyNormFunction
+
+
+ def poly_norm(
+     x: torch.Tensor,
+     weight: torch.Tensor,
+     bias: torch.Tensor,
+     eps: float = 1e-6,
+ ) -> torch.Tensor:
+     return PolyNormFunction.apply(x, weight, bias, eps)
+
+
+ __all__ = [
+     "poly_norm",
+     "layers",
+     "ops",
+ ]
torch-ext/activation/layers.py ADDED
@@ -0,0 +1,18 @@
+ import torch
+ import torch.nn as nn
+
+ from .poly_norm import PolyNormFunction
+
+
+ class PolyNorm(nn.Module):
+     def __init__(self, eps):
+         super().__init__()
+         self.weight = torch.nn.Parameter(torch.ones(3) / 3)
+         self.bias = torch.nn.Parameter(torch.zeros(1))
+         self.eps = eps
+
+     def forward(
+         self,
+         x: torch.Tensor,
+     ):
+         return PolyNormFunction.apply(x, self.weight, self.bias, self.eps)
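A usage sketch for the module (illustrative, not part of the commit; the batch size, hidden size, dtype, and device are arbitrary assumptions):

```python
# Illustrative sketch: drop-in use of the PolyNorm module defined above.
import torch
import activation

layer = activation.layers.PolyNorm(eps=1e-6).to("cuda", torch.bfloat16)
x = torch.randn(8, 4096, device="cuda", dtype=torch.bfloat16, requires_grad=True)

y = layer(x)        # forward runs the fused CUDA kernel
y.sum().backward()  # backward fills x.grad, layer.weight.grad, layer.bias.grad
```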
torch-ext/activation/poly_norm.py ADDED
@@ -0,0 +1,41 @@
+ import torch
+
+ from ._ops import ops
+
+
+ # Inherit from Function
+ class PolyNormFunction(torch.autograd.Function):
+     # Note that forward, setup_context, and backward are @staticmethods
+     @staticmethod
+     def forward(input, weight, bias, eps):
+         output = torch.empty_like(input)
+         ops.poly_norm(output, input, weight, bias, eps)
+         return output
+
+     @staticmethod
+     # inputs is a Tuple of all of the inputs passed to forward.
+     # output is the output of the forward().
+     def setup_context(ctx, inputs, output):
+         input, weight, bias, eps = inputs
+         ctx.save_for_backward(input, weight)
+         ctx.eps = eps
+
+     # This function has only a single output, so it gets only one gradient
+     @staticmethod
+     def backward(ctx, output_grad):
+         input, weight = ctx.saved_tensors
+         eps = ctx.eps
+
+         input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
+         weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
+         bias_grad = (
+             torch.empty(1, dtype=weight.dtype, device=weight.device)
+             if ctx.needs_input_grad[2]
+             else None
+         )
+
+         ops.poly_norm_backward(
+             input_grad, weight_grad, bias_grad, output_grad, input, weight, eps
+         )
+
+         return input_grad, weight_grad, bias_grad, None
torch-ext/torch_binding.cpp ADDED
@@ -0,0 +1,14 @@
+ #include <torch/library.h>
+
+ #include "registration.h"
+ #include "torch_binding.h"
+
+ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
+   // Activation ops
+   ops.def("poly_norm(Tensor! out, Tensor input, Tensor weight, Tensor bias, float eps) -> ()");
+   ops.def("poly_norm_backward(Tensor! input_grad, Tensor! weight_grad, Tensor! bias_grad, Tensor output_grad, Tensor input, Tensor weight, float eps) -> ()");
+   ops.impl("poly_norm", torch::kCUDA, &poly_norm);
+   ops.impl("poly_norm_backward", torch::kCUDA, &poly_norm_backward);
+ }
+
+ REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
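Both ops are registered as out-variants returning `()`, so the caller allocates the outputs and the op writes into them, which is exactly what the autograd wrapper in torch-ext/activation/poly_norm.py does. A minimal sketch of a direct call (illustrative; tensor shapes and dtype are arbitrary):

```python
# Sketch mirroring torch-ext/activation/poly_norm.py and the opcheck call in the tests.
import torch
from activation._ops import ops

x = torch.randn(4, 128, device="cuda", dtype=torch.float16)
weight = torch.randn(3, device="cuda", dtype=torch.float16)
bias = torch.zeros(1, device="cuda", dtype=torch.float16)

out = torch.empty_like(x)
ops.poly_norm(out, x, weight, bias, 1e-6)  # matches "poly_norm(Tensor! out, ...) -> ()"
```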
torch-ext/torch_binding.h ADDED
@@ -0,0 +1,6 @@
+ #pragma once
+
+ #include <torch/torch.h>
+
+ void poly_norm(torch::Tensor &out, torch::Tensor &input, torch::Tensor &weights, torch::Tensor &bias, double eps);
+ void poly_norm_backward(torch::Tensor& input_grad, torch::Tensor& weight_grad, torch::Tensor& bias_grad, torch::Tensor& output_grad, torch::Tensor& input, torch::Tensor& weight, double eps);