Optimize kernel

by TaehyunKimMotif - opened 27 days ago

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+955

-517

This view is limited to 50 files because it contains too many changes. See the raw diff here.

Files changed (50) hide show

.pre-commit-config.yaml +36 -0
README.md +45 -0
activation/assert_utils.h +9 -5
activation/atomic_utils.h +38 -31
activation/block_reduce.h +3 -2
activation/cuda_compat.h +9 -7
activation/dispatch_utils.h +6 -5
activation/poly_norm.cu +465 -156
activation/rms_norm.cu +42 -59
build/torch27-cxx11-cu118-x86_64-linux/activation/{_activation_cf68df1_dirty.abi3.so → _activation_f517c97_dirty.abi3.so} +2 -2
build/torch27-cxx11-cu118-x86_64-linux/activation/_ops.py +3 -3
build/torch27-cxx11-cu118-x86_64-linux/activation/layers.py +2 -0
build/torch27-cxx11-cu118-x86_64-linux/activation/poly_norm.py +9 -11
build/torch27-cxx11-cu118-x86_64-linux/activation/rms_norm.py +6 -3
build/torch27-cxx11-cu126-x86_64-linux/activation/{_activation_cf68df1_dirty.abi3.so → _activation_f517c97_dirty.abi3.so} +2 -2
build/torch27-cxx11-cu126-x86_64-linux/activation/_ops.py +3 -3
build/torch27-cxx11-cu126-x86_64-linux/activation/layers.py +2 -0
build/torch27-cxx11-cu126-x86_64-linux/activation/poly_norm.py +9 -11
build/torch27-cxx11-cu126-x86_64-linux/activation/rms_norm.py +6 -3
build/torch27-cxx11-cu128-x86_64-linux/activation/{_activation_cf68df1_dirty.abi3.so → _activation_f517c97_dirty.abi3.so} +2 -2
build/torch27-cxx11-cu128-x86_64-linux/activation/_ops.py +3 -3
build/torch27-cxx11-cu128-x86_64-linux/activation/layers.py +2 -0
build/torch27-cxx11-cu128-x86_64-linux/activation/poly_norm.py +9 -11
build/torch27-cxx11-cu128-x86_64-linux/activation/rms_norm.py +6 -3
build/torch27-cxx11-rocm63-x86_64-linux/activation/{_activation_cf68df1_dirty.abi3.so → _activation_f517c97_dirty.abi3.so} +2 -2
build/torch27-cxx11-rocm63-x86_64-linux/activation/_ops.py +3 -3
build/torch27-cxx11-rocm63-x86_64-linux/activation/layers.py +2 -0
build/torch27-cxx11-rocm63-x86_64-linux/activation/poly_norm.py +9 -11
build/torch27-cxx11-rocm63-x86_64-linux/activation/rms_norm.py +6 -3
build/torch28-cxx11-cu126-x86_64-linux/activation/_activation_cf68df1_dirty.abi3.so +0 -3
build/torch28-cxx11-cu126-x86_64-linux/activation/_activation_f517c97_dirty.abi3.so +3 -0
build/torch28-cxx11-cu126-x86_64-linux/activation/_ops.py +3 -3
build/torch28-cxx11-cu126-x86_64-linux/activation/layers.py +2 -0
build/torch28-cxx11-cu126-x86_64-linux/activation/poly_norm.py +9 -11
build/torch28-cxx11-cu126-x86_64-linux/activation/rms_norm.py +6 -3
build/torch28-cxx11-cu128-x86_64-linux/activation/_activation_cf68df1_dirty.abi3.so +0 -3
build/torch28-cxx11-cu128-x86_64-linux/activation/_activation_f517c97_dirty.abi3.so +3 -0
build/torch28-cxx11-cu128-x86_64-linux/activation/_ops.py +3 -3
build/torch28-cxx11-cu128-x86_64-linux/activation/layers.py +2 -0
build/torch28-cxx11-cu128-x86_64-linux/activation/poly_norm.py +9 -11
build/torch28-cxx11-cu128-x86_64-linux/activation/rms_norm.py +6 -3
build/torch28-cxx11-cu129-x86_64-linux/activation/_activation_cf68df1_dirty.abi3.so +0 -3
build/torch28-cxx11-cu129-x86_64-linux/activation/_activation_f517c97_dirty.abi3.so +3 -0
build/torch28-cxx11-cu129-x86_64-linux/activation/_ops.py +3 -3
build/torch28-cxx11-cu129-x86_64-linux/activation/layers.py +2 -0
build/torch28-cxx11-cu129-x86_64-linux/activation/poly_norm.py +9 -11
build/torch28-cxx11-cu129-x86_64-linux/activation/rms_norm.py +6 -3
build/torch28-cxx11-rocm63-x86_64-linux/activation/_activation_cf68df1_dirty.abi3.so +0 -3
build/torch28-cxx11-rocm63-x86_64-linux/activation/_activation_f517c97_dirty.abi3.so +3 -0
build/torch28-cxx11-rocm63-x86_64-linux/activation/_ops.py +3 -3

.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,36 @@

+default_install_hook_types:
+  - pre-commit
+  - commit-msg
+default_stages:
+  - pre-commit # Run locally
+  - manual # Run in CI
+exclude: '(build|result)/.*'
+repos:
+- repo: https://github.com/google/yapf
+  rev: v0.43.0
+  hooks:
+  - id: yapf
+    args: [--in-place, --verbose]
+- repo: https://github.com/crate-ci/typos
+  rev: v1.34.0
+  hooks:
+  - id: typos
+- repo: https://github.com/PyCQA/isort
+  rev: 6.0.1
+  hooks:
+  - id: isort
+- repo: https://github.com/pre-commit/mirrors-clang-format
+  rev: v20.1.3
+  hooks:
+  - id: clang-format
+    types_or: [c++, cuda]
+    args: [--style=file, --verbose]
+- repo: https://github.com/jackdewinter/pymarkdown
+  rev: v0.9.29
+  hooks:
+  - id: pymarkdown
+    args: [fix]
+- repo: https://github.com/rhysd/actionlint
+  rev: v1.7.7
+  hooks:
+  - id: actionlint

README.md CHANGED Viewed

@@ -32,6 +32,7 @@ print(poly_norm(x))
 - Test cases are from the Motif LLM
 - You can reproduce the results with:
 ```bash
 cd tests
 pytest --run-perf --do-plot
@@ -39,3 +40,47 @@ pytest --run-perf --do-plot
 ![PolyNorm Performance](./tests/perf.png)

 - Test cases are from the Motif LLM
 - You can reproduce the results with:
 ```bash
 cd tests
 pytest --run-perf --do-plot
 ![PolyNorm Performance](./tests/perf.png)
+## Pre-commit Hooks
+This project uses [pre-commit](https://pre-commit.com/) to automatically check and format code before commits.
+### Setup
+1. Install pre-commit:
+   ```bash
+   pip install pre-commit
+   ```
+2. Install the git hooks:
+   ```bash
+   pre-commit install
+   ```
+Once installed, the configured hooks will run automatically on each commit.
+### Included Hooks
+The following tools are run via pre-commit:
+- **[yapf](https://github.com/google/yapf)** – Python code formatter
+- **[typos](https://github.com/crate-ci/typos)** – Spell checker for common typos
+- **[isort](https://github.com/PyCQA/isort)** – Organizes and sorts Python imports
+- **[clang-format](https://clang.llvm.org/docs/ClangFormat.html)** – Formats C++/CUDA code (`--style=file`)
+- **[pymarkdown](https://github.com/jackdewinter/pymarkdown)** – Lints and auto-fixes Markdown files
+- **[actionlint](https://github.com/rhysd/actionlint)** – Validates GitHub Actions workflows
+### Usage
+- Run all checks on the entire codebase:
+   ```bash
+   pre-commit run --all-files
+   ```
+- Run a specific hook (example: isort):
+   ```bash
+   pre-commit run isort --all-files
+   ```

activation/assert_utils.h CHANGED Viewed

@@ -3,12 +3,15 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <torch/all.h>
-inline void AssertTensorNotNull(const torch::Tensor &tensor, const std::string &name) {
   TORCH_INTERNAL_ASSERT(tensor.defined(), name + " tensor should not be null.");
 }
-inline void AssertTensorShapeEqual(const torch::Tensor &tensor_a, const torch::Tensor &tensor_b,
-  const std::string &name_a, const std::string &name_b) {
   AssertTensorNotNull(tensor_a, name_a);
   AssertTensorNotNull(tensor_b, name_b);
@@ -17,6 +20,7 @@ inline void AssertTensorShapeEqual(const torch::Tensor &tensor_a, const torch::T
   auto tensor_shape_b = tensor_b.sizes();
   TORCH_INTERNAL_ASSERT(tensor_shape_a.equals(tensor_shape_b),
-    "{} tensor shape should be equal to {} tensor shape. (actual: {}, expected: {})",
-    name_a, name_b, tensor_shape_a, tensor_shape_b);
 }

 #include <ATen/cuda/CUDAContext.h>
 #include <torch/all.h>
+inline void AssertTensorNotNull(const torch::Tensor &tensor,
+                                const std::string &name) {
   TORCH_INTERNAL_ASSERT(tensor.defined(), name + " tensor should not be null.");
 }
+inline void AssertTensorShapeEqual(const torch::Tensor &tensor_a,
+                                   const torch::Tensor &tensor_b,
+                                   const std::string &name_a,
+                                   const std::string &name_b) {
   AssertTensorNotNull(tensor_a, name_a);
   AssertTensorNotNull(tensor_b, name_b);
   auto tensor_shape_b = tensor_b.sizes();
   TORCH_INTERNAL_ASSERT(tensor_shape_a.equals(tensor_shape_b),
+                        "{} tensor shape should be equal to {} tensor shape. "
+                        "(actual: {}, expected: {})",
+                        name_a, name_b, tensor_shape_a, tensor_shape_b);
 }

activation/atomic_utils.h CHANGED Viewed

@@ -1,35 +1,38 @@
 #pragma once
-#include <cuda.h>
 #include <c10/util/BFloat16.h>
 #include <c10/util/Half.h>
 namespace motif {
-template<typename scalar_t, typename acc_t>
-__device__ inline void atomic_add(scalar_t* address, acc_t value) {
   // TODO: change assert to a static_assert if possible
-	assert(false && "Unsupported type for atomic_add");
 }
-template<>
-__device__ inline void atomic_add<float, float>(float* address, float value) {
-	atomicAdd(address, value);
 }
-template<>
-__device__ inline void atomic_add<double, double>(double* address, double value) {
-	atomicAdd(address, value);
 }
-template<>
-__device__ inline void atomic_add<c10::BFloat16, float>(c10::BFloat16* _address, float value) {
-  volatile c10::BFloat16* address = const_cast<volatile c10::BFloat16*>(_address);
   size_t offset = (size_t)address & 0x2;
-  volatile uint16_t* address_as_short =
-      reinterpret_cast<volatile uint16_t*>(reinterpret_cast<volatile char*>(address));
-  volatile uint32_t* address_as_uint =
-      reinterpret_cast<volatile uint*>(reinterpret_cast<volatile char*>(address) - offset);
   bool is_32bit_aligned = offset == 0;
   uint32_t current = address_as_uint[0];
@@ -39,21 +42,24 @@ __device__ inline void atomic_add<c10::BFloat16, float>(c10::BFloat16* _address,
     expected = current;
     c10::BFloat16 current_bf16(address_as_short[0], c10::BFloat16::from_bits());
     c10::BFloat16 next_bf16 = current_bf16 + value;
-    uint32_t next = is_32bit_aligned ? (current & 0xffff0000) | next_bf16.x
-                                     : (current & 0x0000ffff) | (next_bf16.x << 16);
-    current = atomicCAS(const_cast<uint32_t*>(address_as_uint), expected, next);
   } while (current != expected);
 }
-template<>
-__device__ inline void atomic_add<c10::Half, float>(c10::Half* _address, float value) {
-  volatile c10::Half* address = const_cast<volatile c10::Half*>(_address);
   size_t offset = (size_t)address & 0x2;
-  volatile uint16_t* address_as_short =
-      reinterpret_cast<volatile uint16_t*>(reinterpret_cast<volatile char*>(address));
-  volatile uint32_t* address_as_uint =
-      reinterpret_cast<volatile uint*>(reinterpret_cast<volatile char*>(address) - offset);
   bool is_32bit_aligned = offset == 0;
   uint32_t current = address_as_uint[0];
@@ -63,11 +69,12 @@ __device__ inline void atomic_add<c10::Half, float>(c10::Half* _address, float v
     expected = current;
     c10::Half current_half(address_as_short[0], c10::Half::from_bits());
     c10::Half next_half = current_half + value;
-    uint32_t next = is_32bit_aligned ? (current & 0xffff0000) | next_half.x
-                                     : (current & 0x0000ffff) | (next_half.x << 16);
-    current = atomicCAS(const_cast<uint32_t*>(address_as_uint), expected, next);
   } while (current != expected);
 }
 } // namespace motif

 #pragma once
 #include <c10/util/BFloat16.h>
 #include <c10/util/Half.h>
+#include <cuda.h>
 namespace motif {
// Primary (unspecialized) atomic_add template. It performs no addition: its
// body is a single assert(false), so reaching it at run time reports
// "Unsupported type for atomic_add". The supported (scalar_t, acc_t) pairs
// are provided by the explicit specializations that follow.
// NOTE(review): the assert only fires where device-side asserts are compiled
// in; otherwise this fallback is a silent no-op - confirm build flags.
+template <typename scalar_t, typename acc_t>
+__device__ inline void atomic_add(scalar_t *address, acc_t value) {
   // TODO: change assert to a static_assert if possible
+  assert(false && "Unsupported type for atomic_add");
 }
// float specialization: forwards directly to the built-in atomicAdd on
// `address`; the value returned by atomicAdd is discarded.
+template <>
+__device__ inline void atomic_add<float, float>(float *address, float value) {
+  atomicAdd(address, value);
 }
// double specialization: forwards directly to the built-in atomicAdd on
// `address`; the value returned by atomicAdd is discarded.
// NOTE(review): a native double atomicAdd is presumably required from the
// target architecture - confirm against the supported build targets.
+template <>
+__device__ inline void atomic_add<double, double>(double *address,
+                                                  double value) {
+  atomicAdd(address, value);
 }
+template <>
+__device__ inline void atomic_add<c10::BFloat16, float>(c10::BFloat16 *_address,
+                                                        float value) {
+  volatile c10::BFloat16 *address =
+      const_cast<volatile c10::BFloat16 *>(_address);
   size_t offset = (size_t)address & 0x2;
+  volatile uint16_t *address_as_short = reinterpret_cast<volatile uint16_t *>(
+      reinterpret_cast<volatile char *>(address));
+  volatile uint32_t *address_as_uint = reinterpret_cast<volatile uint *>(
+      reinterpret_cast<volatile char *>(address) - offset);
   bool is_32bit_aligned = offset == 0;
   uint32_t current = address_as_uint[0];
     expected = current;
     c10::BFloat16 current_bf16(address_as_short[0], c10::BFloat16::from_bits());
     c10::BFloat16 next_bf16 = current_bf16 + value;
+    uint32_t next = is_32bit_aligned
+                        ? (current & 0xffff0000) | next_bf16.x
+                        : (current & 0x0000ffff) | (next_bf16.x << 16);
+    current =
+        atomicCAS(const_cast<uint32_t *>(address_as_uint), expected, next);
   } while (current != expected);
 }
+template <>
+__device__ inline void atomic_add<c10::Half, float>(c10::Half *_address,
+                                                    float value) {
+  volatile c10::Half *address = const_cast<volatile c10::Half *>(_address);
   size_t offset = (size_t)address & 0x2;
+  volatile uint16_t *address_as_short = reinterpret_cast<volatile uint16_t *>(
+      reinterpret_cast<volatile char *>(address));
+  volatile uint32_t *address_as_uint = reinterpret_cast<volatile uint *>(
+      reinterpret_cast<volatile char *>(address) - offset);
   bool is_32bit_aligned = offset == 0;
   uint32_t current = address_as_uint[0];
     expected = current;
     c10::Half current_half(address_as_short[0], c10::Half::from_bits());
     c10::Half next_half = current_half + value;
+    uint32_t next = is_32bit_aligned
+                        ? (current & 0xffff0000) | next_half.x
+                        : (current & 0x0000ffff) | (next_half.x << 16);
+    current =
+        atomicCAS(const_cast<uint32_t *>(address_as_uint), expected, next);
   } while (current != expected);
 }
 } // namespace motif

activation/block_reduce.h CHANGED Viewed

@@ -1,7 +1,8 @@
 namespace motif {
 template <typename acc_t, int BLOCK_SIZE>
-__device__ acc_t _block_reduce_sum(acc_t* shared, const float val, const int d) {
   // TODO: Optimize with warp-level primitives
   __syncthreads();
@@ -17,4 +18,4 @@ __device__ acc_t _block_reduce_sum(acc_t* shared, const float val, const int d)
   return shared[0];
 }
-} // motif

 namespace motif {
 template <typename acc_t, int BLOCK_SIZE>
+__device__ acc_t _block_reduce_sum(acc_t *shared, const float val,
+                                   const int d) {
   // TODO: Optimize with warp-level primitives
   __syncthreads();
   return shared[0];
 }
+} // namespace motif

activation/cuda_compat.h CHANGED Viewed

@@ -1,18 +1,20 @@
 #pragma once
-#ifdef USE_ROCM
-  #include <hip/hip_runtime.h>
 #endif
 #ifndef USE_ROCM
-  #define WARP_SIZE 32
 #else
-  #define WARP_SIZE warpSize
 #endif
 #ifndef USE_ROCM
-  #define VLLM_LDG(arg) __ldg(arg)
 #else
-  #define VLLM_LDG(arg) *(arg)
 #endif

 #pragma once
+#ifndef USE_ROCM
+#include <cub/cub.cuh>
+#else
+#include <hip/hip_runtime.h>
+#include <hipcub/hipcub.hpp>
 #endif
 #ifndef USE_ROCM
+#define WARP_SIZE 32
 #else
+#define WARP_SIZE warpSize
 #endif
 #ifndef USE_ROCM
+#define VLLM_LDG(arg) __ldg(arg)
 #else
+#define VLLM_LDG(arg) *(arg)
 #endif

activation/dispatch_utils.h CHANGED Viewed

@@ -6,10 +6,11 @@
 #include <torch/all.h>
-#define MOTIF_DISPATCH_CASE_FLOATING_TYPES(...)         \
-  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
-  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)  \
   AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
-#define MOTIF_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
-  AT_DISPATCH_SWITCH(TYPE, NAME, MOTIF_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))

 #include <torch/all.h>
+#define MOTIF_DISPATCH_CASE_FLOATING_TYPES(...)                                \
+  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)                         \
+  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)                          \
   AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
+#define MOTIF_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)                         \
+  AT_DISPATCH_SWITCH(TYPE, NAME,                                               \
+                     MOTIF_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))

activation/poly_norm.cu CHANGED Viewed

@@ -1,246 +1,555 @@
-#include <ATen/cuda/CUDAContext.h>
 #include <ATen/Functions.h>
-#include <torch/all.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <cmath>
-#include "cuda_compat.h"
-#include "dispatch_utils.h"
 #include "assert_utils.h"
 #include "atomic_utils.h"
 #include "block_reduce.h"
 namespace motif {
-template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
-__global__ void poly_norm_kernel(
-    scalar_t* __restrict__ out,          // [..., d]
-    const scalar_t* __restrict__ input,  // [..., d]
-    const scalar_t* __restrict__ weight, // [3]
-    const scalar_t* __restrict__ bias,   // [1]
-    const float eps,
-    const int d
-    ) {
   const int64_t token_idx = blockIdx.x;
-  acc_t sum = 0.0f;
-  acc_t sum_square = 0.0f;
-  acc_t sum_cube = 0.0f;
   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    acc_t x = input[token_idx * d + idx];
-    sum += pow(x, 2.0f);
-    sum_square += pow(x, 4.0f);
-    sum_cube += pow(x, 6.0f);
   }
-  __shared__ acc_t shared[BLOCK_SIZE];
-  acc_t mean = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum, d) / d;
-  acc_t mean_square = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_square, d) / d;
-  acc_t mean_cube = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_cube, d) / d;
-  acc_t w0 = weight[0];
-  acc_t w1 = weight[1];
-  acc_t w2 = weight[2];
-  acc_t b = bias[0];
-  acc_t divisor = sqrt(mean + eps);
-  acc_t divisor_square = sqrt(mean_square + eps);
-  acc_t divisor_cube  = sqrt(mean_cube + eps);
   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    acc_t x = input[token_idx * d + idx];
-    acc_t x_square = pow(x, 2.0f);
-    acc_t x_cube = pow(x, 3.0f);
-    out[token_idx * d + idx] = w2 * x / divisor +
-                               w1 * x_square / divisor_square +
-                               w0 * x_cube / divisor_cube + b;
   }
 }
-template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
-__global__ void poly_norm_backward_kernel(
-    scalar_t* __restrict__ input_grad,         // [..., d]
-    acc_t* __restrict__ temp_weight_grad,      // [..., 3]
-    const scalar_t* __restrict__ output_grad,  // [..., d]
-    const scalar_t* __restrict__ input,        // [..., d]
-    const scalar_t* __restrict__ weight,       // [3]
-    const float eps,
-    const int d
-    ) {
-  const int64_t token_idx = blockIdx.x;
   acc_t w0 = weight[0];
   acc_t w1 = weight[1];
   acc_t w2 = weight[2];
-  acc_t sum_2 = 0.0f;
-  acc_t sum_4 = 0.0f;
-  acc_t sum_6 = 0.0f;
-  acc_t sum_dx_1 = 0.0f;
-  acc_t sum_dx_2 = 0.0f;
-  acc_t sum_dx_3 = 0.0f;
   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
     acc_t dy = output_grad[token_idx * d + idx];
-    acc_t x_1 = input[token_idx * d + idx];
-    acc_t x_2 = x_1 * x_1;
-    acc_t x_3 = x_2 * x_1;
-    acc_t x_4 = x_2 * x_2;
-    acc_t x_6 = x_3 * x_3;
-    sum_2 += x_2;
-    sum_4 += x_4;
-    sum_6 += x_6;
-    sum_dx_1 += dy * x_1;
-    sum_dx_2 += dy * x_2;
-    sum_dx_3 += dy * x_3;
   }
-  __shared__ acc_t shared[BLOCK_SIZE];
-  acc_t mean_2 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_2, d) / d + eps;
-  acc_t mean_4 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_4, d) / d + eps;
-  acc_t mean_6 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_6, d) / d + eps;
-  sum_dx_1 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dx_1, d);
-  sum_dx_2 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dx_2, d);
-  sum_dx_3 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dx_3, d);
-  acc_t _mean_2 = powf(mean_2, -1.5);
-  acc_t _mean_4 = powf(mean_4, -1.5);
-  acc_t _mean_6 = powf(mean_6, -1.5);
-  acc_t sq_mean_2 = sqrtf(mean_2);
-  acc_t sq_mean_4 = sqrtf(mean_4);
-  acc_t sq_mean_6 = sqrtf(mean_6);
   acc_t sum_dw0 = 0;
   acc_t sum_dw1 = 0;
   acc_t sum_dw2 = 0;
   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
     acc_t dy = output_grad[token_idx * d + idx];
-    acc_t x_1 = input[token_idx * d + idx];
-    acc_t x_2 = x_1 * x_1;
-    acc_t x_3 = x_2 * x_1;
-    acc_t dx_3 =
-      _mean_6 * 3 * x_2 * (dy * mean_6 - x_3 * sum_dx_3 / d) * w0;
-    acc_t dx_2 =
-      _mean_4 * 2 * x_1 * (dy * mean_4 - x_2 * sum_dx_2 / d) * w1;
-    acc_t dx_1 =
-      _mean_2 * (dy * mean_2 - x_1 * sum_dx_1 / d) * w2;
     if (input_grad) {
-      input_grad[token_idx * d + idx] = dx_1 + dx_2 + dx_3;
     }
-    sum_dw0 += dy * (x_3 / sq_mean_6);
-    sum_dw1 += dy * (x_2 / sq_mean_4);
-    sum_dw2 += dy * (x_1 / sq_mean_2);
   }
-  if (temp_weight_grad) {
-    sum_dw0 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dw0, d);
-    sum_dw1 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dw1, d);
-    sum_dw2 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dw2, d);
-    if (threadIdx.x == 0) {
-      temp_weight_grad[token_idx * 3 + 0] = sum_dw0;
-      temp_weight_grad[token_idx * 3 + 1] = sum_dw1;
-      temp_weight_grad[token_idx * 3 + 2] = sum_dw2;
-    }
   }
 }
-}  // namespace motif
-void poly_norm(torch::Tensor& out,          // [..., d]
-               const torch::Tensor& input,  // [..., d]
-               const torch::Tensor& weight, // [3]
-               const torch::Tensor& bias,   // [1]
-               double eps)
-{
   AssertTensorShapeEqual(input, out, "input", "out");
   AssertTensorNotNull(weight, "weight");
   AssertTensorNotNull(bias, "bias");
   // TODO shape check
-  constexpr int BLOCK_SIZE = 256;
   int d = input.size(-1);
-  int64_t num_tokens = input.numel() / input.size(-1);
   dim3 grid(num_tokens);
-  dim3 block(BLOCK_SIZE);
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  MOTIF_DISPATCH_FLOATING_TYPES(
-    input.scalar_type(), "poly_norm_kernel", [&] {
-      motif::poly_norm_kernel<scalar_t, float, BLOCK_SIZE>
-        <<<grid, block, 0, stream>>>(
-          out.data_ptr<scalar_t>(),
-          input.data_ptr<scalar_t>(),
-          weight.data_ptr<scalar_t>(),
-          bias.data_ptr<scalar_t>(), eps, d);
-    }
-  );
 }
-void poly_norm_backward(
-  torch::Tensor& input_grad,        // [..., d]
-  torch::Tensor& weight_grad,       // [..., d]
-  torch::Tensor& bias_grad,         // [..., d]
-  const torch::Tensor& output_grad, // [3]
-  const torch::Tensor& input,       // [3]
-  const torch::Tensor& weight,      // [3]
-  double eps) {
   AssertTensorShapeEqual(input, input_grad, "input", "input_grad");
   AssertTensorShapeEqual(input, output_grad, "input", "output_grad");
   AssertTensorNotNull(weight, "weight");
   // TODO shape check
   // weight_grad, bias_grad and input_grad can be nullable
-  constexpr int BLOCK_SIZE = 256;
   int d = input.size(-1);
-  int64_t num_tokens = input.numel() / input.size(-1);
   dim3 grid(num_tokens);
-  dim3 block(BLOCK_SIZE);
   torch::Tensor temp_weight_grad =
-    torch::empty({num_tokens, 3},
-    input.options().dtype(torch::kFloat));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
-  MOTIF_DISPATCH_FLOATING_TYPES(
-    input.scalar_type(), "poly_norm_backward_kernel", [&] {
-      motif::poly_norm_backward_kernel<scalar_t, float, BLOCK_SIZE>
-        <<<grid, block, 0, stream>>>(
-          input_grad.data_ptr<scalar_t>(),
-          temp_weight_grad.data_ptr<float>(),
-          output_grad.data_ptr<scalar_t>(),
-          input.data_ptr<scalar_t>(),
-          weight.data_ptr<scalar_t>(),
-          eps, d);
-    }
-  );
   if (bias_grad.defined()) {
-    at::sum_out(bias_grad, output_grad);
-    bias_grad.resize_({1});
   }
   if (weight_grad.defined()) {
-    at::sum_out(weight_grad, temp_weight_grad, {0});
   }
 }

 #include <ATen/Functions.h>
+#include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
 #include <cmath>
 #include "assert_utils.h"
 #include "atomic_utils.h"
 #include "block_reduce.h"
+#include "cuda_compat.h"
+#include "dispatch_utils.h"
 namespace motif {
// Pack of N values of `type`, aligned to the pack's full byte size
// (sizeof(type) * N). The vectorized kernels in this file reinterpret scalar
// pointers as pointers to this struct so that N elements are moved per
// load/store.
// NOTE(review): that reinterpretation assumes the underlying buffers are
// aligned to sizeof(type) * N - confirm at the launch site.
+template <typename type, int N> struct alignas(sizeof(type) * N) type_vec_t {
+  type data[N];
+};
+// Vectorized PolyNorm forward kernel (selected when width > 0).
+//
+// One thread block processes one row of `d` elements; the row index is
+// blockIdx.x. Elements are read and written `width` at a time through
+// type_vec_t<scalar_t, width>. For every element x of the row it writes
+//
+//   out = w2 * x   * rsqrt(mean(x^2) + eps)
+//       + w1 * x^2 * rsqrt(mean(x^4) + eps)
+//       + w0 * x^3 * rsqrt(mean(x^6) + eps) + bias
+//
+// where the means are taken over the row and accumulated in acc_t.
+//
+// Parameters:
+//   out    - output buffer, same layout as `input`
+//   input  - input rows, `d` contiguous elements per row
+//   weight - three coefficients w0, w1, w2 (cubic, square, linear term)
+//   bias   - single additive bias
+//   eps    - added to each mean before the reciprocal square root
+//   d      - row length in scalar elements
+//
+// NOTE(review): only vec_d * width elements per row are touched, so this
+// presumably relies on the launcher choosing width > 0 only when d is a
+// multiple of width and the buffers are aligned for vec_t - confirm.
+// NOTE(review): the block reduction is instantiated for 1024 threads and
+// is passed blockDim.x as the number of valid inputs, so blockDim.x is
+// assumed to be <= 1024 - confirm against the launch configuration.
+template <typename scalar_t, typename acc_t, int width>
+__global__ std::enable_if_t<(width > 0)>
+poly_norm_kernel(scalar_t *__restrict__ out,          // [..., d]
+                 const scalar_t *__restrict__ input,  // [..., d]
+                 const scalar_t *__restrict__ weight, // [3]
+                 const scalar_t *__restrict__ bias,   // [1]
+                 const float eps, const int d) {
+  using vec_t = type_vec_t<scalar_t, width>;
+  const int vec_d = d / width;
+  // Widen the row index before multiplying: blockIdx.x is an unsigned
+  // 32-bit value, so `blockIdx.x * vec_d` would wrap at 2^32 before being
+  // stored in the 64-bit offset. The scalar kernel already indexes with a
+  // 64-bit row index.
+  const int64_t vec_offset = static_cast<int64_t>(blockIdx.x) * vec_d;
+  const vec_t *__restrict__ input_vec = reinterpret_cast<const vec_t *>(input);
+  // Pass 1: per-thread partial sums of x^2, x^4 and x^6 over the row.
+  acc_t sum2 = 0.0f;
+  acc_t sum4 = 0.0f;
+  acc_t sum6 = 0.0f;
+  for (int64_t idx = threadIdx.x; idx < vec_d; idx += blockDim.x) {
+    vec_t x_vec = input_vec[vec_offset + idx];
+#pragma unroll
+    for (int i = 0; i < width; ++i) {
+      acc_t x1 = static_cast<acc_t>(x_vec.data[i]);
+      acc_t x2 = x1 * x1;
+      acc_t x4 = x2 * x2;
+      acc_t x6 = x4 * x2;
+      sum2 += x2;
+      sum4 += x4;
+      sum6 += x6;
+    }
+  }
+  // Block-wide sums. The same temp storage is reused for every reduction,
+  // hence the barrier between consecutive calls. Only thread 0 consumes the
+  // reduced values below.
+  using BlockReduce = cub::BlockReduce<float, 1024>;
+  __shared__ typename BlockReduce::TempStorage reduceStore;
+  sum2 = BlockReduce(reduceStore).Sum(sum2, blockDim.x);
+  __syncthreads();
+  sum4 = BlockReduce(reduceStore).Sum(sum4, blockDim.x);
+  __syncthreads();
+  sum6 = BlockReduce(reduceStore).Sum(sum6, blockDim.x);
+  // Thread 0 folds each weight into its reciprocal standard deviation and
+  // publishes the per-row scale factors (and the bias) via shared memory.
+  __shared__ acc_t s_bias;
+  __shared__ acc_t s_w2_inv_std1;
+  __shared__ acc_t s_w1_inv_std2;
+  __shared__ acc_t s_w0_inv_std3;
+  if (threadIdx.x == 0) {
+    acc_t w0 = weight[0];
+    acc_t w1 = weight[1];
+    acc_t w2 = weight[2];
+    s_bias = bias[0];
+    s_w2_inv_std1 = rsqrtf(sum2 / d + eps) * w2;
+    s_w1_inv_std2 = rsqrtf(sum4 / d + eps) * w1;
+    s_w0_inv_std3 = rsqrtf(sum6 / d + eps) * w0;
+  }
+  __syncthreads();
+  acc_t w2_inv_std1 = s_w2_inv_std1;
+  acc_t w1_inv_std2 = s_w1_inv_std2;
+  acc_t w0_inv_std3 = s_w0_inv_std3;
+  acc_t bias_reg = s_bias;
+  // Pass 2: re-read the row and write the normalized polynomial.
+  vec_t *__restrict__ output_vec = reinterpret_cast<vec_t *>(out);
+  for (int64_t idx = threadIdx.x; idx < vec_d; idx += blockDim.x) {
+    vec_t x_vec = input_vec[vec_offset + idx];
+    vec_t y_vec;
+#pragma unroll
+    for (int i = 0; i < width; ++i) {
+      acc_t x1 = static_cast<acc_t>(x_vec.data[i]);
+      acc_t x2 = x1 * x1;
+      acc_t x3 = x2 * x1;
+      acc_t y =
+          x1 * w2_inv_std1 + x2 * w1_inv_std2 + x3 * w0_inv_std3 + bias_reg;
+      y_vec.data[i] = static_cast<scalar_t>(y);
+    }
+    output_vec[vec_offset + idx] = y_vec;
+  }
+}
// Scalar PolyNorm forward kernel (selected when width == 0).
//
// One thread block processes one row of `d` elements (row index =
// blockIdx.x), one element per thread per loop iteration. For every element
// x of the row it writes
//   out = w2 * x   * rsqrt(mean(x^2) + eps)
//       + w1 * x^2 * rsqrt(mean(x^4) + eps)
//       + w0 * x^3 * rsqrt(mean(x^6) + eps) + bias
// with the means taken over the row.
//
// Parameters: out/input are row buffers of `d` elements per row, weight
// holds w0..w2, bias holds one value, eps is added to each mean before the
// reciprocal square root.
// NOTE(review): the block reduction is instantiated for 1024 threads and is
// given blockDim.x as the number of valid inputs, so blockDim.x is assumed
// to be <= 1024 - confirm against the launch configuration.
+template <typename scalar_t, typename acc_t, int width>
+__global__ std::enable_if_t<(width == 0)>
+poly_norm_kernel(scalar_t *__restrict__ out,          // [..., d]
+                 const scalar_t *__restrict__ input,  // [..., d]
+                 const scalar_t *__restrict__ weight, // [3]
+                 const scalar_t *__restrict__ bias,   // [1]
+                 const float eps, const int d) {
   const int64_t token_idx = blockIdx.x; // 64-bit row index keeps token_idx * d from wrapping
+  acc_t sum2 = 0.0f; // per-thread partial sum of x^2
+  acc_t sum4 = 0.0f; // per-thread partial sum of x^4
+  acc_t sum6 = 0.0f; // per-thread partial sum of x^6
   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+    acc_t x1 = input[token_idx * d + idx];
+    acc_t x2 = x1 * x1;
+    acc_t x4 = x2 * x2;
+    acc_t x6 = x4 * x2;
+    sum2 += x2;
+    sum4 += x4;
+    sum6 += x6;
   }
+  using BlockReduce = cub::BlockReduce<float, 1024>;
+  __shared__ typename BlockReduce::TempStorage reduceStore;
+  sum2 = BlockReduce(reduceStore).Sum(sum2, blockDim.x);
+  __syncthreads(); // reduceStore is reused by the next reduction
+  sum4 = BlockReduce(reduceStore).Sum(sum4, blockDim.x);
+  __syncthreads(); // reduceStore is reused by the next reduction
+  sum6 = BlockReduce(reduceStore).Sum(sum6, blockDim.x);
+  __shared__ acc_t s_bias;
+  __shared__ acc_t s_w2_inv_std1;
+  __shared__ acc_t s_w1_inv_std2;
+  __shared__ acc_t s_w0_inv_std3;
+  if (threadIdx.x == 0) { // thread 0 publishes the per-row scale factors
+    acc_t w0 = weight[0];
+    acc_t w1 = weight[1];
+    acc_t w2 = weight[2];
+    s_bias = bias[0];
+    s_w2_inv_std1 = rsqrtf(sum2 / d + eps) * w2;
+    s_w1_inv_std2 = rsqrtf(sum4 / d + eps) * w1;
+    s_w0_inv_std3 = rsqrtf(sum6 / d + eps) * w0;
+  }
+  __syncthreads(); // make the shared scale factors visible to every thread
+  acc_t w2_inv_std1 = s_w2_inv_std1;
+  acc_t w1_inv_std2 = s_w1_inv_std2;
+  acc_t w0_inv_std3 = s_w0_inv_std3;
+  acc_t bias_reg = s_bias;
   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+    acc_t x1 = input[token_idx * d + idx];
+    acc_t x2 = x1 * x1;
+    acc_t x3 = x2 * x1;
+    out[token_idx * d + idx] =
+        x1 * w2_inv_std1 + x2 * w1_inv_std2 + x3 * w0_inv_std3 + bias_reg;
   }
 }
+// Vectorized PolyNorm backward kernel (selected when width > 0).
+//
+// One thread block processes one row of `d` elements; the row index is
+// blockIdx.x. Elements are read and written `width` at a time through
+// type_vec_t<scalar_t, width>. Per row it produces:
+//   input_grad[row, :]          - gradient w.r.t. the input (skipped when
+//                                 input_grad is null)
+//   temp_weight_grad[row*3 + k] - this row's contribution to d(loss)/d(w_k)
+//   temp_bias_grad[row]         - this row's sum of output_grad
+//
+// Parameters:
+//   output_grad - upstream gradient, same layout as `input`
+//   input       - forward input rows, `d` contiguous elements per row
+//   weight      - three coefficients w0, w1, w2 (cubic, square, linear term)
+//   eps         - added to each mean of x^2 / x^4 / x^6
+//   d           - row length in scalar elements
+//
+// NOTE(review): temp_weight_grad and temp_bias_grad are written
+// unconditionally, so callers must always pass valid buffers - confirm.
+// NOTE(review): only vec_d * width elements per row are touched, so this
+// presumably relies on the launcher choosing width > 0 only when d is a
+// multiple of width and the buffers are aligned for vec_t - confirm.
+// NOTE(review): the block reduction is instantiated for 1024 threads and
+// is passed blockDim.x as the number of valid inputs, so blockDim.x is
+// assumed to be <= 1024 - confirm against the launch configuration.
+template <typename scalar_t, typename acc_t, int width>
+__global__ std::enable_if_t<(width > 0)>
+poly_norm_backward_kernel(scalar_t *__restrict__ input_grad,        // [..., d]
+                          acc_t *__restrict__ temp_weight_grad,     // [..., 3]
+                          acc_t *__restrict__ temp_bias_grad,       // [..., 1]
+                          const scalar_t *__restrict__ output_grad, // [..., d]
+                          const scalar_t *__restrict__ input,       // [..., d]
+                          const scalar_t *__restrict__ weight,      // [3]
+                          const float eps, const int d) {
+  using vec_t = type_vec_t<scalar_t, width>;
+  const int vec_d = d / width;
+  // Widen the row index before multiplying: blockIdx.x is an unsigned
+  // 32-bit value, so `blockIdx.x * vec_d` would wrap at 2^32 before being
+  // stored in the 64-bit offset.
+  const int64_t vec_offset = static_cast<int64_t>(blockIdx.x) * vec_d;
+  const vec_t *__restrict__ input_vec = reinterpret_cast<const vec_t *>(input);
+  const vec_t *__restrict__ output_grad_vec =
+      reinterpret_cast<const vec_t *>(output_grad);
+  // Pass 1: per-thread partial sums of x^2, x^4, x^6 and of dy*x, dy*x^2,
+  // dy*x^3 over the row.
+  acc_t sum2 = 0.0f;
+  acc_t sum4 = 0.0f;
+  acc_t sum6 = 0.0f;
+  acc_t sum_dx1 = 0.0f;
+  acc_t sum_dx2 = 0.0f;
+  acc_t sum_dx3 = 0.0f;
+  for (int64_t idx = threadIdx.x; idx < vec_d; idx += blockDim.x) {
+    vec_t x_vec = input_vec[vec_offset + idx];
+    vec_t dy_vec = output_grad_vec[vec_offset + idx];
+#pragma unroll
+    for (int i = 0; i < width; ++i) {
+      acc_t x1 = static_cast<acc_t>(x_vec.data[i]);
+      acc_t x2 = x1 * x1;
+      acc_t x3 = x2 * x1;
+      acc_t x4 = x2 * x2;
+      acc_t x6 = x3 * x3;
+      sum2 += x2;
+      sum4 += x4;
+      sum6 += x6;
+      acc_t dy = static_cast<acc_t>(dy_vec.data[i]);
+      sum_dx1 += dy * x1;
+      sum_dx2 += dy * x2;
+      sum_dx3 += dy * x3;
+    }
+  }
+  // Block-wide sums. The same temp storage is reused for every reduction,
+  // hence the barrier between consecutive calls. Only thread 0 consumes the
+  // reduced values below.
+  using BlockReduce = cub::BlockReduce<float, 1024>;
+  __shared__ typename BlockReduce::TempStorage reduceStore;
+  __syncthreads();
+  sum2 = BlockReduce(reduceStore).Sum(sum2, blockDim.x);
+  __syncthreads();
+  sum4 = BlockReduce(reduceStore).Sum(sum4, blockDim.x);
+  __syncthreads();
+  sum6 = BlockReduce(reduceStore).Sum(sum6, blockDim.x);
+  __syncthreads();
+  sum_dx1 = BlockReduce(reduceStore).Sum(sum_dx1, blockDim.x);
+  __syncthreads();
+  sum_dx2 = BlockReduce(reduceStore).Sum(sum_dx2, blockDim.x);
+  __syncthreads();
+  sum_dx3 = BlockReduce(reduceStore).Sum(sum_dx3, blockDim.x);
+  // Thread 0 converts the sums to per-row means (with eps already added to
+  // the power means) and publishes them through shared memory.
+  __shared__ acc_t s_mean2;
+  __shared__ acc_t s_mean4;
+  __shared__ acc_t s_mean6;
+  __shared__ acc_t s_sdx1;
+  __shared__ acc_t s_sdx2;
+  __shared__ acc_t s_sdx3;
+  const acc_t inv_d = acc_t(1) / d;
+  if (threadIdx.x == 0) {
+    s_mean2 = sum2 * inv_d + eps;
+    s_mean4 = sum4 * inv_d + eps;
+    s_mean6 = sum6 * inv_d + eps;
+    s_sdx1 = sum_dx1 * inv_d;
+    s_sdx2 = sum_dx2 * inv_d;
+    s_sdx3 = sum_dx3 * inv_d;
+  }
+  __syncthreads();
   acc_t w0 = weight[0];
   acc_t w1 = weight[1];
   acc_t w2 = weight[2];
+  acc_t mean2 = s_mean2;
+  acc_t mean4 = s_mean4;
+  acc_t mean6 = s_mean6;
+  acc_t sdx1 = s_sdx1;
+  acc_t sdx2 = s_sdx2;
+  acc_t sdx3 = s_sdx3;
+  acc_t inv_std1 = rsqrtf(mean2);
+  acc_t inv_std2 = rsqrtf(mean4);
+  acc_t inv_std3 = rsqrtf(mean6);
+  // inv_std / mean == powf(mean, -1.5)
+  acc_t c1 = w2 * inv_std1 / mean2;
+  acc_t c2 = acc_t(2) * w1 * inv_std2 / mean4;
+  acc_t c3 = acc_t(3) * w0 * inv_std3 / mean6;
+  // Pass 2: write the input gradient and accumulate this thread's share of
+  // the bias gradient (sum of dy) and the three weight gradients.
+  acc_t sum_dy = 0;
+  acc_t sum_dw0 = 0;
+  acc_t sum_dw1 = 0;
+  acc_t sum_dw2 = 0;
+  vec_t *__restrict__ input_grad_vec = reinterpret_cast<vec_t *>(input_grad);
+  for (int64_t idx = threadIdx.x; idx < vec_d; idx += blockDim.x) {
+    vec_t x_vec = input_vec[vec_offset + idx];
+    vec_t dy_vec = output_grad_vec[vec_offset + idx];
+    vec_t dx_vec;
+#pragma unroll
+    for (int i = 0; i < width; ++i) {
+      acc_t x1 = static_cast<acc_t>(x_vec.data[i]);
+      acc_t x2 = x1 * x1;
+      acc_t x3 = x2 * x1;
+      acc_t dy = static_cast<acc_t>(dy_vec.data[i]);
+      if (input_grad) {
+        acc_t dx3 = c3 * x2 * (dy * mean6 - x3 * sdx3);
+        acc_t dx2 = c2 * x1 * (dy * mean4 - x2 * sdx2);
+        acc_t dx1 = c1 * (dy * mean2 - x1 * sdx1);
+        dx_vec.data[i] = static_cast<scalar_t>(dx1 + dx2 + dx3);
+      }
+      sum_dy += dy;
+      sum_dw0 += dy * (x3 * inv_std3);
+      sum_dw1 += dy * (x2 * inv_std2);
+      sum_dw2 += dy * (x1 * inv_std1);
+    }
+    // dx_vec is only filled in when input_grad is non-null, so it is only
+    // stored under the same condition.
+    if (input_grad) {
+      input_grad_vec[vec_offset + idx] = dx_vec;
+    }
+  }
+  sum_dy = BlockReduce(reduceStore).Sum(sum_dy, blockDim.x);
+  __syncthreads();
+  sum_dw0 = BlockReduce(reduceStore).Sum(sum_dw0, blockDim.x);
+  __syncthreads();
+  sum_dw1 = BlockReduce(reduceStore).Sum(sum_dw1, blockDim.x);
+  __syncthreads();
+  sum_dw2 = BlockReduce(reduceStore).Sum(sum_dw2, blockDim.x);
+  // Thread 0 stores this row's partial bias and weight gradients.
+  if (threadIdx.x == 0) {
+    temp_bias_grad[blockIdx.x] = sum_dy;
+    temp_weight_grad[blockIdx.x * 3 + 0] = sum_dw0;
+    temp_weight_grad[blockIdx.x * 3 + 1] = sum_dw1;
+    temp_weight_grad[blockIdx.x * 3 + 2] = sum_dw2;
+  }
+}
+template <typename scalar_t, typename acc_t, int width>
 // Scalar (element-at-a-time) backward kernel, enabled only when width == 0.
 // Each block handles the row selected by blockIdx.x; its threads stride over
 // the d elements of that row in steps of blockDim.x.
 //
 // Pass 1 accumulates per-row sums of x^2, x^4, x^6 and dy*x, dy*x^2, dy*x^3.
 // Pass 2 writes input_grad (when the pointer is non-null) and accumulates the
 // per-row partial sums that are stored into temp_weight_grad / temp_bias_grad.
 //
 // NOTE(review): BlockReduce is instantiated for float and rsqrtf is used, so
 // acc_t is presumably always float -- confirm before using another acc_t.
 // NOTE(review): BlockReduce is sized for 1024 threads and is called with
 // blockDim.x as the number of valid items; this assumes blockDim.x <= 1024.
+__global__ std::enable_if_t<(width == 0)>
+poly_norm_backward_kernel(scalar_t *__restrict__ input_grad,        // [..., d]
+                          acc_t *__restrict__ temp_weight_grad,     // [..., 3]
+                          acc_t *__restrict__ temp_bias_grad,       // [..., 1]
+                          const scalar_t *__restrict__ output_grad, // [..., d]
+                          const scalar_t *__restrict__ input,       // [..., d]
+                          const scalar_t *__restrict__ weight,      // [3]
+                          const float eps, const int d) {
+  const int64_t token_idx = blockIdx.x;
+  acc_t sum2 = 0.0f;
+  acc_t sum4 = 0.0f;
+  acc_t sum6 = 0.0f;
+  acc_t sum_dx1 = 0.0f;
+  acc_t sum_dx2 = 0.0f;
+  acc_t sum_dx3 = 0.0f;
   // Pass 1: per-thread partial sums over this row.
   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
     acc_t dy = output_grad[token_idx * d + idx];
+    acc_t x1 = input[token_idx * d + idx];
+    acc_t x2 = x1 * x1;
+    acc_t x3 = x2 * x1;
+    acc_t x4 = x2 * x2;
+    acc_t x6 = x3 * x3;
+    sum2 += x2;
+    sum4 += x4;
+    sum6 += x6;
+    sum_dx1 += dy * x1;
+    sum_dx2 += dy * x2;
+    sum_dx3 += dy * x3;
   }
+  using BlockReduce = cub::BlockReduce<float, 1024>;
+  __shared__ typename BlockReduce::TempStorage reduceStore;
   // The same reduceStore is reused by every reduction below, so a barrier
   // separates consecutive reductions.
+  __syncthreads();
+  sum2 = BlockReduce(reduceStore).Sum(sum2, blockDim.x);
+  __syncthreads();
+  sum4 = BlockReduce(reduceStore).Sum(sum4, blockDim.x);
+  __syncthreads();
+  sum6 = BlockReduce(reduceStore).Sum(sum6, blockDim.x);
+  __syncthreads();
+  sum_dx1 = BlockReduce(reduceStore).Sum(sum_dx1, blockDim.x);
+  __syncthreads();
+  sum_dx2 = BlockReduce(reduceStore).Sum(sum_dx2, blockDim.x);
+  __syncthreads();
+  sum_dx3 = BlockReduce(reduceStore).Sum(sum_dx3, blockDim.x);
   // Only thread 0's reduced values are used; it publishes the row statistics
   // through shared memory so every thread can read them after the barrier.
+  __shared__ acc_t s_mean2;
+  __shared__ acc_t s_mean4;
+  __shared__ acc_t s_mean6;
+  __shared__ acc_t s_sdx1;
+  __shared__ acc_t s_sdx2;
+  __shared__ acc_t s_sdx3;
+  const acc_t inv_d = acc_t(1) / d;
+  if (threadIdx.x == 0) {
     // Means of x^2, x^4, x^6 with eps added; means of dy*x^k without eps.
+    s_mean2 = sum2 * inv_d + eps;
+    s_mean4 = sum4 * inv_d + eps;
+    s_mean6 = sum6 * inv_d + eps;
+    s_sdx1 = sum_dx1 * inv_d;
+    s_sdx2 = sum_dx2 * inv_d;
+    s_sdx3 = sum_dx3 * inv_d;
+  }
+  __syncthreads();
+  acc_t w0 = weight[0];
+  acc_t w1 = weight[1];
+  acc_t w2 = weight[2];
+  acc_t mean2 = s_mean2;
+  acc_t mean4 = s_mean4;
+  acc_t mean6 = s_mean6;
+  acc_t sdx1 = s_sdx1;
+  acc_t sdx2 = s_sdx2;
+  acc_t sdx3 = s_sdx3;
+  acc_t inv_std1 = rsqrtf(mean2);
+  acc_t inv_std2 = rsqrtf(mean4);
+  acc_t inv_std3 = rsqrtf(mean6);
+  // inv_std / mean == powf(mean, -1.5)
   // weight[0] scales the cubic term, weight[1] the quadratic term and
   // weight[2] the linear term; the factors 2 and 3 multiply c2 and c3.
+  acc_t c1 = w2 * inv_std1 / mean2;
+  acc_t c2 = acc_t(2) * w1 * inv_std2 / mean4;
+  acc_t c3 = acc_t(3) * w0 * inv_std3 / mean6;
+  acc_t sum_dy = 0;
   acc_t sum_dw0 = 0;
   acc_t sum_dw1 = 0;
   acc_t sum_dw2 = 0;
   // Pass 2: re-read the row, emit input_grad and accumulate the partial sums
   // for the weight and bias gradients.
   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
     acc_t dy = output_grad[token_idx * d + idx];
+    acc_t x1 = input[token_idx * d + idx];
+    acc_t x2 = x1 * x1;
+    acc_t x3 = x2 * x1;
     // input_grad may be a null pointer, in which case nothing is written.
     if (input_grad) {
+      acc_t dx3 = c3 * x2 * (dy * mean6 - x3 * sdx3);
+      acc_t dx2 = c2 * x1 * (dy * mean4 - x2 * sdx2);
+      acc_t dx1 = c1 * (dy * mean2 - x1 * sdx1);
+      input_grad[token_idx * d + idx] = dx1 + dx2 + dx3;
     }
+    sum_dy += dy;
+    sum_dw0 += dy * (x3 * inv_std3);
+    sum_dw1 += dy * (x2 * inv_std2);
+    sum_dw2 += dy * (x1 * inv_std1);
   }
+  sum_dy = BlockReduce(reduceStore).Sum(sum_dy, blockDim.x);
+  __syncthreads();
+  sum_dw0 = BlockReduce(reduceStore).Sum(sum_dw0, blockDim.x);
+  __syncthreads();
+  sum_dw1 = BlockReduce(reduceStore).Sum(sum_dw1, blockDim.x);
+  __syncthreads();
+  sum_dw2 = BlockReduce(reduceStore).Sum(sum_dw2, blockDim.x);
   // Thread 0 stores this row's partial sums; the host wrapper sums the temp
   // buffers over dim 0 afterwards.
+  if (threadIdx.x == 0) {
+    temp_bias_grad[token_idx] = sum_dy;
+    temp_weight_grad[token_idx * 3 + 0] = sum_dw0;
+    temp_weight_grad[token_idx * 3 + 1] = sum_dw1;
+    temp_weight_grad[token_idx * 3 + 2] = sum_dw2;
   }
 }
+} // namespace motif
+#define LAUNCH_POLY_NORM(width)                                                \
+  MOTIF_DISPATCH_FLOATING_TYPES(input.scalar_type(), "poly_norm_kernel", [&] { \
+    motif::poly_norm_kernel<scalar_t, float, width>                            \
+        <<<grid, block, 0, stream>>>(                                          \
+            out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),              \
+            weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(), eps, d);   \
+  });
+// LAUNCH_POLY_NORM(width) expands inside poly_norm() and relies on its locals
+// (grid, block, stream, out, input, weight, bias, eps, d). It dispatches on
+// the input dtype and launches motif::poly_norm_kernel with float accumulation.
+//
+// poly_norm: host entry point for the forward pass.
+//   out    - destination tensor, same shape as input   [..., d]
+//   input  - source tensor                             [..., d]
+//   weight - polynomial coefficients                   [3]
+//   bias   - bias term                                 [1]
+//   eps    - forwarded unchanged to the kernel
+// One block is launched per row of length d. The width-8 kernel is chosen
+// when d is a multiple of 8, otherwise the width-0 (scalar) kernel.
+// NOTE(review): contiguity and alignment of the tensors are not checked
+// here (see the TODO below); the kernels index rows as token * d + idx.
+void poly_norm(torch::Tensor &out,          // [..., d]
+               const torch::Tensor &input,  // [..., d]
+               const torch::Tensor &weight, // [3]
+               const torch::Tensor &bias,   // [1]
+               double eps) {
   AssertTensorShapeEqual(input, out, "input", "out");
   AssertTensorNotNull(weight, "weight");
   AssertTensorNotNull(bias, "bias");
   // TODO shape check
+  // Empty input: there is nothing to write. Returning here also avoids the
+  // integer division by d == 0 below and a launch with a zero-sized grid or
+  // block, which CUDA rejects as an invalid configuration.
+  if (input.numel() == 0) {
+    return;
+  }
   int d = input.size(-1);
+  int64_t num_tokens = input.numel() / d;
   dim3 grid(num_tokens);
+  // Few rows: allow up to 1024 threads per row; many rows: cap at 256.
+  const int max_block_size = (num_tokens < 256) ? 1024 : 256;
+  dim3 block(std::min(d, max_block_size));
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  if (d % 8 == 0) {
+    LAUNCH_POLY_NORM(8);
+  } else {
+    LAUNCH_POLY_NORM(0);
+  }
 }
+#define LAUNCH_POLY_NORM_BACKWARD(width)                                       \
+  MOTIF_DISPATCH_FLOATING_TYPES(                                               \
+      input.scalar_type(), "poly_norm_backward_kernel", [&] {                  \
+        motif::poly_norm_backward_kernel<scalar_t, float, width>               \
+            <<<grid, block, 0, stream>>>(input_grad.data_ptr<scalar_t>(),      \
+                                         temp_weight_grad.data_ptr<float>(),   \
+                                         temp_bias_grad.data_ptr<float>(),     \
+                                         output_grad.data_ptr<scalar_t>(),     \
+                                         input.data_ptr<scalar_t>(),           \
+                                         weight.data_ptr<scalar_t>(), eps, d); \
+      });
+// LAUNCH_POLY_NORM_BACKWARD(width) expands inside poly_norm_backward() and
+// relies on its locals (grid, block, stream, the tensors, eps, d). It
+// dispatches on the input dtype and launches the backward kernel with float
+// accumulation buffers.
+//
+// poly_norm_backward: host entry point for the backward pass.
+//   input_grad  - receives the gradient w.r.t. input   [..., d]
+//   weight_grad - receives the gradient w.r.t. weight  [3]   (may be undefined)
+//   bias_grad   - receives the gradient w.r.t. bias    [1]   (may be undefined)
+//   output_grad - incoming gradient                    [..., d]
+//   input       - forward input                        [..., d]
+//   weight      - polynomial coefficients              [3]
+//   eps         - forwarded unchanged to the kernel
+// The kernel writes one float partial result per row into temp_weight_grad
+// ([num_tokens, 3]) and temp_bias_grad ([num_tokens, 1]); these are summed
+// over dim 0 in float and then copied into weight_grad / bias_grad.
+void poly_norm_backward(torch::Tensor &input_grad,        // [..., d]
+                        torch::Tensor &weight_grad,       // [3]
+                        torch::Tensor &bias_grad,         // [1]
+                        const torch::Tensor &output_grad, // [..., d]
+                        const torch::Tensor &input,       // [..., d]
+                        const torch::Tensor &weight,      // [3]
+                        double eps) {
   AssertTensorShapeEqual(input, input_grad, "input", "input_grad");
   AssertTensorShapeEqual(input, output_grad, "input", "output_grad");
   AssertTensorNotNull(weight, "weight");
   // TODO shape check
   // weight_grad, bias_grad and input_grad can be nullable
+  // Empty input: the per-row sums are empty, so the parameter gradients are
+  // zero and input_grad has no elements. Handling it here avoids the integer
+  // division by d == 0 below and a launch with a zero-sized grid or block,
+  // which CUDA rejects as an invalid configuration.
+  if (input.numel() == 0) {
+    if (bias_grad.defined()) {
+      bias_grad.zero_();
+    }
+    if (weight_grad.defined()) {
+      weight_grad.zero_();
+    }
+    return;
+  }
   int d = input.size(-1);
+  int64_t num_tokens = input.numel() / d;
   dim3 grid(num_tokens);
+  // Few rows: allow up to 1024 threads per row; many rows: cap at 256.
+  const int max_block_size = (num_tokens < 256) ? 1024 : 256;
+  dim3 block(std::min(d, max_block_size));
   torch::Tensor temp_weight_grad =
+      torch::empty({num_tokens, 3}, input.options().dtype(torch::kFloat));
+  torch::Tensor temp_bias_grad =
+      torch::empty({num_tokens, 1}, output_grad.options().dtype(torch::kFloat));
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  if (d % 8 == 0) {
+    LAUNCH_POLY_NORM_BACKWARD(8);
+  } else {
+    LAUNCH_POLY_NORM_BACKWARD(0);
+  }
+  // Reduce the per-row partials in float, then cast into the caller's tensors.
   if (bias_grad.defined()) {
+    torch::Tensor acc = torch::empty_like(bias_grad, temp_bias_grad.options());
+    at::sum_out(acc, temp_bias_grad, {0});
+    bias_grad.copy_(acc);
   }
   if (weight_grad.defined()) {
+    torch::Tensor acc =
+        torch::empty_like(weight_grad, temp_weight_grad.options());
+    at::sum_out(acc, temp_weight_grad, {0});
+    weight_grad.copy_(acc);
   }
 }

activation/rms_norm.cu CHANGED Viewed

@@ -1,26 +1,23 @@
-#include <ATen/cuda/CUDAContext.h>
 #include <ATen/Functions.h>
-#include <torch/all.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <cmath>
-#include "cuda_compat.h"
-#include "dispatch_utils.h"
 #include "assert_utils.h"
 #include "atomic_utils.h"
 #include "block_reduce.h"
 namespace motif {
 template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
-__global__ void rms_norm_kernel(
-    scalar_t* __restrict__ out,          // [..., d]
-    const scalar_t* __restrict__ input,  // [..., d]
-    const scalar_t* __restrict__ weight, // [d]
-    const float eps,
-    const int d
-    ) {
   const int64_t token_idx = blockIdx.x;
   const int64_t vec_idx = threadIdx.x;
@@ -44,15 +41,13 @@ __global__ void rms_norm_kernel(
 }
 template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
-__global__ void rms_norm_backward_kernel(
-    scalar_t* __restrict__ input_grad,         // [..., d]
-    acc_t* __restrict__ temp_weight_grad,      // [..., d]
-    const scalar_t* __restrict__ output_grad,  // [..., d]
-    const scalar_t* __restrict__ input,        // [..., d]
-    const scalar_t* __restrict__ weight,       // [d]
-    const float eps,
-    const int d
-    ) {
   const int64_t token_idx = blockIdx.x;
   const int64_t vec_idx = threadIdx.x;
   acc_t d_sum = 0.0f;
@@ -80,8 +75,7 @@ __global__ void rms_norm_backward_kernel(
     acc_t dy = output_grad[token_idx * d + idx];
     acc_t w = weight[idx];
-    input_grad[token_idx * d + idx] =
-      scale * dy * w - dxx * x;
     if (temp_weight_grad) {
       temp_weight_grad[token_idx * d + idx] = dy * x * scale;
@@ -89,14 +83,12 @@ __global__ void rms_norm_backward_kernel(
   }
 }
-}  // namespace motif
-void rms_norm(torch::Tensor& out,           // [..., d]
-               const torch::Tensor& input,  // [..., d]
-               const torch::Tensor& weight, // [d]
-               double eps)
-{
   AssertTensorShapeEqual(input, out, "input", "out");
   AssertTensorNotNull(weight, "weight");
   // TODO shape check
@@ -110,25 +102,20 @@ void rms_norm(torch::Tensor& out,           // [..., d]
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  MOTIF_DISPATCH_FLOATING_TYPES(
-    input.scalar_type(), "rms_norm_kernel", [&] {
-      motif::rms_norm_kernel<scalar_t, float, BLOCK_SIZE>
-        <<<grid, block, 0, stream>>>(
-          out.data_ptr<scalar_t>(),
-          input.data_ptr<scalar_t>(),
-          weight.data_ptr<scalar_t>(),
-          eps, d);
-    }
-  );
 }
-void rms_norm_backward(
-  torch::Tensor& input_grad,        // [..., d]
-  torch::Tensor& weight_grad,       // [..., d]
-  const torch::Tensor& output_grad, // [d]
-  const torch::Tensor& input,       // [d]
-  const torch::Tensor& weight,      // [d]
-  double eps) {
   AssertTensorShapeEqual(input, input_grad, "input", "input_grad");
   AssertTensorShapeEqual(input, output_grad, "input", "output_grad");
   AssertTensorNotNull(weight, "weight");
@@ -143,24 +130,20 @@ void rms_norm_backward(
   dim3 block(BLOCK_SIZE);
   torch::Tensor temp_weight_grad =
-    torch::empty({num_tokens, d},
-    input.options().dtype(torch::kFloat));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   MOTIF_DISPATCH_FLOATING_TYPES(
-    input.scalar_type(), "rms_norm_backward_kernel", [&] {
-      motif::rms_norm_backward_kernel<scalar_t, float, BLOCK_SIZE>
-        <<<grid, block, 0, stream>>>(
-          input_grad.data_ptr<scalar_t>(),
-          temp_weight_grad.data_ptr<float>(),
-          output_grad.data_ptr<scalar_t>(),
-          input.data_ptr<scalar_t>(),
-          weight.data_ptr<scalar_t>(),
-          eps, d);
-    }
-  );
   if (weight_grad.defined()) {
     at::sum_out(weight_grad, temp_weight_grad, {0});

 #include <ATen/Functions.h>
+#include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
 #include <cmath>
 #include "assert_utils.h"
 #include "atomic_utils.h"
 #include "block_reduce.h"
+#include "cuda_compat.h"
+#include "dispatch_utils.h"
 namespace motif {
 template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
+__global__ void rms_norm_kernel(scalar_t *__restrict__ out,          // [..., d]
+                                const scalar_t *__restrict__ input,  // [..., d]
+                                const scalar_t *__restrict__ weight, // [d]
+                                const float eps, const int d) {
   const int64_t token_idx = blockIdx.x;
   const int64_t vec_idx = threadIdx.x;
 }
 template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
+__global__ void
+rms_norm_backward_kernel(scalar_t *__restrict__ input_grad,        // [..., d]
+                         acc_t *__restrict__ temp_weight_grad,     // [..., d]
+                         const scalar_t *__restrict__ output_grad, // [..., d]
+                         const scalar_t *__restrict__ input,       // [..., d]
+                         const scalar_t *__restrict__ weight,      // [d]
+                         const float eps, const int d) {
   const int64_t token_idx = blockIdx.x;
   const int64_t vec_idx = threadIdx.x;
   acc_t d_sum = 0.0f;
     acc_t dy = output_grad[token_idx * d + idx];
     acc_t w = weight[idx];
+    input_grad[token_idx * d + idx] = scale * dy * w - dxx * x;
     if (temp_weight_grad) {
       temp_weight_grad[token_idx * d + idx] = dy * x * scale;
   }
 }
+} // namespace motif
+void rms_norm(torch::Tensor &out,          // [..., d]
+              const torch::Tensor &input,  // [..., d]
+              const torch::Tensor &weight, // [d]
+              double eps) {
   AssertTensorShapeEqual(input, out, "input", "out");
   AssertTensorNotNull(weight, "weight");
   // TODO shape check
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  MOTIF_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
+    motif::rms_norm_kernel<scalar_t, float, BLOCK_SIZE>
+        <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(),
+                                     input.data_ptr<scalar_t>(),
+                                     weight.data_ptr<scalar_t>(), eps, d);
+  });
 }
+void rms_norm_backward(torch::Tensor &input_grad,        // [..., d]
+                       torch::Tensor &weight_grad,       // [d]
+                       const torch::Tensor &output_grad, // [..., d]
+                       const torch::Tensor &input,       // [..., d]
+                       const torch::Tensor &weight,      // [d]
+                       double eps) {
   AssertTensorShapeEqual(input, input_grad, "input", "input_grad");
   AssertTensorShapeEqual(input, output_grad, "input", "output_grad");
   AssertTensorNotNull(weight, "weight");
   dim3 block(BLOCK_SIZE);
   torch::Tensor temp_weight_grad =
+      torch::empty({num_tokens, d}, input.options().dtype(torch::kFloat));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   MOTIF_DISPATCH_FLOATING_TYPES(
+      input.scalar_type(), "rms_norm_backward_kernel", [&] {
+        motif::rms_norm_backward_kernel<scalar_t, float, BLOCK_SIZE>
+            <<<grid, block, 0, stream>>>(input_grad.data_ptr<scalar_t>(),
+                                         temp_weight_grad.data_ptr<float>(),
+                                         output_grad.data_ptr<scalar_t>(),
+                                         input.data_ptr<scalar_t>(),
+                                         weight.data_ptr<scalar_t>(), eps, d);
+      });
   if (weight_grad.defined()) {
     at::sum_out(weight_grad, temp_weight_grad, {0});

build/torch27-cxx11-cu118-x86_64-linux/activation/{_activation_cf68df1_dirty.abi3.so → _activation_f517c97_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1655e52503ce7d0b7dabd55b97c1bd7d11071cbe0f80b9e810c443523638fd9b
-size 2994312

 version https://git-lfs.github.com/spec/v1
+oid sha256:bd84c828d4c15e96d65d6c8f0eb7a945ee8167d92e978b2ebce03eeaf41e7fce
+size 4405112

build/torch27-cxx11-cu118-x86_64-linux/activation/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _activation_cf68df1_dirty
-ops = torch.ops._activation_cf68df1_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_activation_cf68df1_dirty::{op_name}"

 import torch
+from . import _activation_f517c97_dirty
+ops = torch.ops._activation_f517c97_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_activation_f517c97_dirty::{op_name}"

build/torch27-cxx11-cu118-x86_64-linux/activation/layers.py CHANGED Viewed

@@ -7,6 +7,7 @@ from .rms_norm import RMSNormFunction
 class PolyNorm(nn.Module):
     def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
@@ -28,6 +29,7 @@ class PolyNorm(nn.Module):
 class RMSNorm(nn.Module):
     def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))

 class PolyNorm(nn.Module):
     def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
 class RMSNorm(nn.Module):
     def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))

build/torch27-cxx11-cu118-x86_64-linux/activation/poly_norm.py CHANGED Viewed

@@ -26,16 +26,14 @@ class PolyNormFunction(torch.autograd.Function):
         input, weight = ctx.saved_tensors
         eps = ctx.eps
-        input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
-        weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
-        bias_grad = (
-            torch.empty(1, dtype=weight.dtype, device=weight.device)
-            if ctx.needs_input_grad[2]
-            else None
-        )
-        ops.poly_norm_backward(
-            input_grad, weight_grad, bias_grad, output_grad, input, weight, eps
-        )
         return input_grad, weight_grad, bias_grad, None

         input, weight = ctx.saved_tensors
         eps = ctx.eps
+        input_grad = torch.empty_like(
+            input) if ctx.needs_input_grad[0] else None
+        weight_grad = torch.empty_like(
+            weight) if ctx.needs_input_grad[1] else None
+        bias_grad = (torch.empty(1, dtype=weight.dtype, device=weight.device)
+                     if ctx.needs_input_grad[2] else None)
+        ops.poly_norm_backward(input_grad, weight_grad, bias_grad, output_grad,
+                               input, weight, eps)
         return input_grad, weight_grad, bias_grad, None

build/torch27-cxx11-cu118-x86_64-linux/activation/rms_norm.py CHANGED Viewed

@@ -26,9 +26,12 @@ class RMSNormFunction(torch.autograd.Function):
         input, weight = ctx.saved_tensors
         eps = ctx.eps
-        input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
-        weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
-        ops.rms_norm_backward(input_grad, weight_grad, output_grad, input, weight, eps)
         return input_grad, weight_grad, None

         input, weight = ctx.saved_tensors
         eps = ctx.eps
+        input_grad = torch.empty_like(
+            input) if ctx.needs_input_grad[0] else None
+        weight_grad = torch.empty_like(
+            weight) if ctx.needs_input_grad[1] else None
+        ops.rms_norm_backward(input_grad, weight_grad, output_grad, input,
+                              weight, eps)
         return input_grad, weight_grad, None

build/torch27-cxx11-cu126-x86_64-linux/activation/{_activation_cf68df1_dirty.abi3.so → _activation_f517c97_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:174dbe4375aa22fb34d9d23630b3bec4eeb95635ef681b665db0985e78cf5af3
-size 3027504

 version https://git-lfs.github.com/spec/v1
+oid sha256:caffcadbb99fbaa27e8a81d5ef508f2e1a798e7626d618c3cf5b0d387d2c8686
+size 4618624

build/torch27-cxx11-cu126-x86_64-linux/activation/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _activation_cf68df1_dirty
-ops = torch.ops._activation_cf68df1_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_activation_cf68df1_dirty::{op_name}"

 import torch
+from . import _activation_f517c97_dirty
+ops = torch.ops._activation_f517c97_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_activation_f517c97_dirty::{op_name}"

build/torch27-cxx11-cu126-x86_64-linux/activation/layers.py CHANGED Viewed

@@ -7,6 +7,7 @@ from .rms_norm import RMSNormFunction
 class PolyNorm(nn.Module):
     def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
@@ -28,6 +29,7 @@ class PolyNorm(nn.Module):
 class RMSNorm(nn.Module):
     def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))

 class PolyNorm(nn.Module):
     def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
 class RMSNorm(nn.Module):
     def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))

build/torch27-cxx11-cu126-x86_64-linux/activation/poly_norm.py CHANGED Viewed

@@ -26,16 +26,14 @@ class PolyNormFunction(torch.autograd.Function):
         input, weight = ctx.saved_tensors
         eps = ctx.eps
-        input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
-        weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
-        bias_grad = (
-            torch.empty(1, dtype=weight.dtype, device=weight.device)
-            if ctx.needs_input_grad[2]
-            else None
-        )
-        ops.poly_norm_backward(
-            input_grad, weight_grad, bias_grad, output_grad, input, weight, eps
-        )
         return input_grad, weight_grad, bias_grad, None

         input, weight = ctx.saved_tensors
         eps = ctx.eps
+        input_grad = torch.empty_like(
+            input) if ctx.needs_input_grad[0] else None
+        weight_grad = torch.empty_like(
+            weight) if ctx.needs_input_grad[1] else None
+        bias_grad = (torch.empty(1, dtype=weight.dtype, device=weight.device)
+                     if ctx.needs_input_grad[2] else None)
+        ops.poly_norm_backward(input_grad, weight_grad, bias_grad, output_grad,
+                               input, weight, eps)
         return input_grad, weight_grad, bias_grad, None

build/torch27-cxx11-cu126-x86_64-linux/activation/rms_norm.py CHANGED Viewed

@@ -26,9 +26,12 @@ class RMSNormFunction(torch.autograd.Function):
         input, weight = ctx.saved_tensors
         eps = ctx.eps
-        input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
-        weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
-        ops.rms_norm_backward(input_grad, weight_grad, output_grad, input, weight, eps)
         return input_grad, weight_grad, None

         input, weight = ctx.saved_tensors
         eps = ctx.eps
+        input_grad = torch.empty_like(
+            input) if ctx.needs_input_grad[0] else None
+        weight_grad = torch.empty_like(
+            weight) if ctx.needs_input_grad[1] else None
+        ops.rms_norm_backward(input_grad, weight_grad, output_grad, input,
+                              weight, eps)
         return input_grad, weight_grad, None

build/torch27-cxx11-cu128-x86_64-linux/activation/{_activation_cf68df1_dirty.abi3.so → _activation_f517c97_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:91d71ca84a19b393c22b269226a7b4ddadbf1feec73a80bd45f655179c7a53f5
-size 3987512

 version https://git-lfs.github.com/spec/v1
+oid sha256:3b7c6ece8e8d316c4cc5fe46b1cec4422b2f61e9bb7240af71a2b4a35975d8e6
+size 6676528

build/torch27-cxx11-cu128-x86_64-linux/activation/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _activation_cf68df1_dirty
-ops = torch.ops._activation_cf68df1_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_activation_cf68df1_dirty::{op_name}"

 import torch
+from . import _activation_f517c97_dirty
+ops = torch.ops._activation_f517c97_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_activation_f517c97_dirty::{op_name}"

build/torch27-cxx11-cu128-x86_64-linux/activation/layers.py CHANGED Viewed

@@ -7,6 +7,7 @@ from .rms_norm import RMSNormFunction
 class PolyNorm(nn.Module):
     def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
@@ -28,6 +29,7 @@ class PolyNorm(nn.Module):
 class RMSNorm(nn.Module):
     def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))

 class PolyNorm(nn.Module):
     def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
 class RMSNorm(nn.Module):
     def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))

build/torch27-cxx11-cu128-x86_64-linux/activation/poly_norm.py CHANGED Viewed

@@ -26,16 +26,14 @@ class PolyNormFunction(torch.autograd.Function):
         input, weight = ctx.saved_tensors
         eps = ctx.eps
-        input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
-        weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
-        bias_grad = (
-            torch.empty(1, dtype=weight.dtype, device=weight.device)
-            if ctx.needs_input_grad[2]
-            else None
-        )
-        ops.poly_norm_backward(
-            input_grad, weight_grad, bias_grad, output_grad, input, weight, eps
-        )
         return input_grad, weight_grad, bias_grad, None

         input, weight = ctx.saved_tensors
         eps = ctx.eps
+        input_grad = torch.empty_like(
+            input) if ctx.needs_input_grad[0] else None
+        weight_grad = torch.empty_like(
+            weight) if ctx.needs_input_grad[1] else None
+        bias_grad = (torch.empty(1, dtype=weight.dtype, device=weight.device)
+                     if ctx.needs_input_grad[2] else None)
+        ops.poly_norm_backward(input_grad, weight_grad, bias_grad, output_grad,
+                               input, weight, eps)
         return input_grad, weight_grad, bias_grad, None

build/torch27-cxx11-cu128-x86_64-linux/activation/rms_norm.py CHANGED Viewed

@@ -26,9 +26,12 @@ class RMSNormFunction(torch.autograd.Function):
         input, weight = ctx.saved_tensors
         eps = ctx.eps
-        input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
-        weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
-        ops.rms_norm_backward(input_grad, weight_grad, output_grad, input, weight, eps)
         return input_grad, weight_grad, None

         input, weight = ctx.saved_tensors
         eps = ctx.eps
+        input_grad = torch.empty_like(
+            input) if ctx.needs_input_grad[0] else None
+        weight_grad = torch.empty_like(
+            weight) if ctx.needs_input_grad[1] else None
+        ops.rms_norm_backward(input_grad, weight_grad, output_grad, input,
+                              weight, eps)
         return input_grad, weight_grad, None

build/torch27-cxx11-rocm63-x86_64-linux/activation/{_activation_cf68df1_dirty.abi3.so → _activation_f517c97_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ab1037bf6b41bf2be1d00a6a0ed01a97a5e4d64dd0abaf509492ad31eea0a576
-size 2642976

 version https://git-lfs.github.com/spec/v1
+oid sha256:4be173820e2a4bf4b6b8de6b63faf6544b599d9b0583f650a940adaef4a048b3
+size 2899184

build/torch27-cxx11-rocm63-x86_64-linux/activation/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _activation_cf68df1_dirty
-ops = torch.ops._activation_cf68df1_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_activation_cf68df1_dirty::{op_name}"

 import torch
+from . import _activation_f517c97_dirty
+ops = torch.ops._activation_f517c97_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_activation_f517c97_dirty::{op_name}"

build/torch27-cxx11-rocm63-x86_64-linux/activation/layers.py CHANGED Viewed

@@ -7,6 +7,7 @@ from .rms_norm import RMSNormFunction
 class PolyNorm(nn.Module):
     def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
@@ -28,6 +29,7 @@ class PolyNorm(nn.Module):
 class RMSNorm(nn.Module):
     def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))

 class PolyNorm(nn.Module):
     def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
 class RMSNorm(nn.Module):
     def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))

build/torch27-cxx11-rocm63-x86_64-linux/activation/poly_norm.py CHANGED Viewed

@@ -26,16 +26,14 @@ class PolyNormFunction(torch.autograd.Function):
         input, weight = ctx.saved_tensors
         eps = ctx.eps
-        input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
-        weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
-        bias_grad = (
-            torch.empty(1, dtype=weight.dtype, device=weight.device)
-            if ctx.needs_input_grad[2]
-            else None
-        )
-        ops.poly_norm_backward(
-            input_grad, weight_grad, bias_grad, output_grad, input, weight, eps
-        )
         return input_grad, weight_grad, bias_grad, None

         input, weight = ctx.saved_tensors
         eps = ctx.eps
+        input_grad = torch.empty_like(
+            input) if ctx.needs_input_grad[0] else None
+        weight_grad = torch.empty_like(
+            weight) if ctx.needs_input_grad[1] else None
+        bias_grad = (torch.empty(1, dtype=weight.dtype, device=weight.device)
+                     if ctx.needs_input_grad[2] else None)
+        ops.poly_norm_backward(input_grad, weight_grad, bias_grad, output_grad,
+                               input, weight, eps)
         return input_grad, weight_grad, bias_grad, None

build/torch27-cxx11-rocm63-x86_64-linux/activation/rms_norm.py CHANGED Viewed

@@ -26,9 +26,12 @@ class RMSNormFunction(torch.autograd.Function):
         input, weight = ctx.saved_tensors
         eps = ctx.eps
-        input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
-        weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
-        ops.rms_norm_backward(input_grad, weight_grad, output_grad, input, weight, eps)
         return input_grad, weight_grad, None

         input, weight = ctx.saved_tensors
         eps = ctx.eps
+        input_grad = torch.empty_like(
+            input) if ctx.needs_input_grad[0] else None
+        weight_grad = torch.empty_like(
+            weight) if ctx.needs_input_grad[1] else None
+        ops.rms_norm_backward(input_grad, weight_grad, output_grad, input,
+                              weight, eps)
         return input_grad, weight_grad, None

build/torch28-cxx11-cu126-x86_64-linux/activation/_activation_cf68df1_dirty.abi3.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:012788f2064588edf60df24778dff33f8ca95e3b1aaf5243554735cd783dd7ed
-size 3032488

build/torch28-cxx11-cu126-x86_64-linux/activation/_activation_f517c97_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cb222449350310f90f7271f34fcf9052c9eec28021fee0348130a8f239a97bf4
+size 4571976

build/torch28-cxx11-cu126-x86_64-linux/activation/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _activation_cf68df1_dirty
-ops = torch.ops._activation_cf68df1_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_activation_cf68df1_dirty::{op_name}"

 import torch
+from . import _activation_f517c97_dirty
+ops = torch.ops._activation_f517c97_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_activation_f517c97_dirty::{op_name}"

build/torch28-cxx11-cu126-x86_64-linux/activation/layers.py CHANGED Viewed

@@ -7,6 +7,7 @@ from .rms_norm import RMSNormFunction
 class PolyNorm(nn.Module):
     def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
@@ -28,6 +29,7 @@ class PolyNorm(nn.Module):
 class RMSNorm(nn.Module):
     def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))

 class PolyNorm(nn.Module):
     def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
 class RMSNorm(nn.Module):
     def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))

build/torch28-cxx11-cu126-x86_64-linux/activation/poly_norm.py CHANGED Viewed

@@ -26,16 +26,14 @@ class PolyNormFunction(torch.autograd.Function):
         input, weight = ctx.saved_tensors
         eps = ctx.eps
-        input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
-        weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
-        bias_grad = (
-            torch.empty(1, dtype=weight.dtype, device=weight.device)
-            if ctx.needs_input_grad[2]
-            else None
-        )
-        ops.poly_norm_backward(
-            input_grad, weight_grad, bias_grad, output_grad, input, weight, eps
-        )
         return input_grad, weight_grad, bias_grad, None

         input, weight = ctx.saved_tensors
         eps = ctx.eps
+        input_grad = torch.empty_like(
+            input) if ctx.needs_input_grad[0] else None
+        weight_grad = torch.empty_like(
+            weight) if ctx.needs_input_grad[1] else None
+        bias_grad = (torch.empty(1, dtype=weight.dtype, device=weight.device)
+                     if ctx.needs_input_grad[2] else None)
+        ops.poly_norm_backward(input_grad, weight_grad, bias_grad, output_grad,
+                               input, weight, eps)
         return input_grad, weight_grad, bias_grad, None

build/torch28-cxx11-cu126-x86_64-linux/activation/rms_norm.py CHANGED Viewed

@@ -26,9 +26,12 @@ class RMSNormFunction(torch.autograd.Function):
         input, weight = ctx.saved_tensors
         eps = ctx.eps
-        input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
-        weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
-        ops.rms_norm_backward(input_grad, weight_grad, output_grad, input, weight, eps)
         return input_grad, weight_grad, None

         input, weight = ctx.saved_tensors
         eps = ctx.eps
+        input_grad = torch.empty_like(
+            input) if ctx.needs_input_grad[0] else None
+        weight_grad = torch.empty_like(
+            weight) if ctx.needs_input_grad[1] else None
+        ops.rms_norm_backward(input_grad, weight_grad, output_grad, input,
+                              weight, eps)
         return input_grad, weight_grad, None

build/torch28-cxx11-cu128-x86_64-linux/activation/_activation_cf68df1_dirty.abi3.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b1a65b79b750f550a09e6a1142b5151b03b2a60ec6115a264e6d8de3cac7ee5d
-size 4000920

build/torch28-cxx11-cu128-x86_64-linux/activation/_activation_f517c97_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:79be6527f579de1133e50a66310d7d0690649dcac63009a54b5e68809408f12a
+size 6634208

build/torch28-cxx11-cu128-x86_64-linux/activation/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _activation_cf68df1_dirty
-ops = torch.ops._activation_cf68df1_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_activation_cf68df1_dirty::{op_name}"

 import torch
+from . import _activation_f517c97_dirty
+ops = torch.ops._activation_f517c97_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_activation_f517c97_dirty::{op_name}"

build/torch28-cxx11-cu128-x86_64-linux/activation/layers.py CHANGED Viewed

@@ -7,6 +7,7 @@ from .rms_norm import RMSNormFunction
 class PolyNorm(nn.Module):
     def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
@@ -28,6 +29,7 @@ class PolyNorm(nn.Module):
 class RMSNorm(nn.Module):
     def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))

 class PolyNorm(nn.Module):
     def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
 class RMSNorm(nn.Module):
     def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))

build/torch28-cxx11-cu128-x86_64-linux/activation/poly_norm.py CHANGED Viewed

@@ -26,16 +26,14 @@ class PolyNormFunction(torch.autograd.Function):
         input, weight = ctx.saved_tensors
         eps = ctx.eps
-        input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
-        weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
-        bias_grad = (
-            torch.empty(1, dtype=weight.dtype, device=weight.device)
-            if ctx.needs_input_grad[2]
-            else None
-        )
-        ops.poly_norm_backward(
-            input_grad, weight_grad, bias_grad, output_grad, input, weight, eps
-        )
         return input_grad, weight_grad, bias_grad, None

         input, weight = ctx.saved_tensors
         eps = ctx.eps
+        input_grad = torch.empty_like(
+            input) if ctx.needs_input_grad[0] else None
+        weight_grad = torch.empty_like(
+            weight) if ctx.needs_input_grad[1] else None
+        bias_grad = (torch.empty(1, dtype=weight.dtype, device=weight.device)
+                     if ctx.needs_input_grad[2] else None)
+        ops.poly_norm_backward(input_grad, weight_grad, bias_grad, output_grad,
+                               input, weight, eps)
         return input_grad, weight_grad, bias_grad, None

build/torch28-cxx11-cu128-x86_64-linux/activation/rms_norm.py CHANGED Viewed

@@ -26,9 +26,12 @@ class RMSNormFunction(torch.autograd.Function):
         input, weight = ctx.saved_tensors
         eps = ctx.eps
-        input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
-        weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
-        ops.rms_norm_backward(input_grad, weight_grad, output_grad, input, weight, eps)
         return input_grad, weight_grad, None

         input, weight = ctx.saved_tensors
         eps = ctx.eps
+        input_grad = torch.empty_like(
+            input) if ctx.needs_input_grad[0] else None
+        weight_grad = torch.empty_like(
+            weight) if ctx.needs_input_grad[1] else None
+        ops.rms_norm_backward(input_grad, weight_grad, output_grad, input,
+                              weight, eps)
         return input_grad, weight_grad, None

build/torch28-cxx11-cu129-x86_64-linux/activation/_activation_cf68df1_dirty.abi3.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:fd38039c3401b0f6a136f1761c7f396f5954f05e16d78ed1600d8325c1221781
-size 4059256

build/torch28-cxx11-cu129-x86_64-linux/activation/_activation_f517c97_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8d95e4491d35cb022a6eaa2febbc555f203893f989a4fb1cc483b2632f141869
+size 6687456

build/torch28-cxx11-cu129-x86_64-linux/activation/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _activation_cf68df1_dirty
-ops = torch.ops._activation_cf68df1_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_activation_cf68df1_dirty::{op_name}"

 import torch
+from . import _activation_f517c97_dirty
+ops = torch.ops._activation_f517c97_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_activation_f517c97_dirty::{op_name}"

build/torch28-cxx11-cu129-x86_64-linux/activation/layers.py CHANGED Viewed

@@ -7,6 +7,7 @@ from .rms_norm import RMSNormFunction
 class PolyNorm(nn.Module):
     def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
@@ -28,6 +29,7 @@ class PolyNorm(nn.Module):
 class RMSNorm(nn.Module):
     def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))

 class PolyNorm(nn.Module):
     def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
 class RMSNorm(nn.Module):
     def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))

build/torch28-cxx11-cu129-x86_64-linux/activation/poly_norm.py CHANGED Viewed

@@ -26,16 +26,14 @@ class PolyNormFunction(torch.autograd.Function):
         input, weight = ctx.saved_tensors
         eps = ctx.eps
-        input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
-        weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
-        bias_grad = (
-            torch.empty(1, dtype=weight.dtype, device=weight.device)
-            if ctx.needs_input_grad[2]
-            else None
-        )
-        ops.poly_norm_backward(
-            input_grad, weight_grad, bias_grad, output_grad, input, weight, eps
-        )
         return input_grad, weight_grad, bias_grad, None

         input, weight = ctx.saved_tensors
         eps = ctx.eps
+        input_grad = torch.empty_like(
+            input) if ctx.needs_input_grad[0] else None
+        weight_grad = torch.empty_like(
+            weight) if ctx.needs_input_grad[1] else None
+        bias_grad = (torch.empty(1, dtype=weight.dtype, device=weight.device)
+                     if ctx.needs_input_grad[2] else None)
+        ops.poly_norm_backward(input_grad, weight_grad, bias_grad, output_grad,
+                               input, weight, eps)
         return input_grad, weight_grad, bias_grad, None

build/torch28-cxx11-cu129-x86_64-linux/activation/rms_norm.py CHANGED Viewed

@@ -26,9 +26,12 @@ class RMSNormFunction(torch.autograd.Function):
         input, weight = ctx.saved_tensors
         eps = ctx.eps
-        input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
-        weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
-        ops.rms_norm_backward(input_grad, weight_grad, output_grad, input, weight, eps)
         return input_grad, weight_grad, None

         input, weight = ctx.saved_tensors
         eps = ctx.eps
+        input_grad = torch.empty_like(
+            input) if ctx.needs_input_grad[0] else None
+        weight_grad = torch.empty_like(
+            weight) if ctx.needs_input_grad[1] else None
+        ops.rms_norm_backward(input_grad, weight_grad, output_grad, input,
+                              weight, eps)
         return input_grad, weight_grad, None

build/torch28-cxx11-rocm63-x86_64-linux/activation/_activation_cf68df1_dirty.abi3.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d8a75fc3e8648bbab973e3021720ed372ec8468f7a28b5b047640fd7198ab369
-size 2647872

build/torch28-cxx11-rocm63-x86_64-linux/activation/_activation_f517c97_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:58116124bb2b5d11de2753dd0c30a1e4c84759f18599da7016c791bad37528e9
+size 2899984

build/torch28-cxx11-rocm63-x86_64-linux/activation/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _activation_cf68df1_dirty
-ops = torch.ops._activation_cf68df1_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_activation_cf68df1_dirty::{op_name}"

 import torch
+from . import _activation_f517c97_dirty
+ops = torch.ops._activation_f517c97_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_activation_f517c97_dirty::{op_name}"