TaehyunKimMotif committed on Aug 22

Commit

f517c97

1 Parent(s): a73a0c0

add readme with precommit hooks and applied pre commit to all files

Browse files

Files changed (17) hide show

README.md +45 -0
activation/assert_utils.h +9 -5
activation/atomic_utils.h +38 -31
activation/block_reduce.h +3 -2
activation/dispatch_utils.h +6 -5
activation/rms_norm.cu +42 -59
tests/conftest.py +16 -16
tests/kernels/allclose_default.py +5 -1
tests/kernels/test_poly_norm.py +7 -10
tests/kernels/test_poly_norm_perf.py +8 -7
tests/kernels/test_rms_norm.py +3 -1
tests/kernels/utils.py +16 -13
torch-ext/activation/layers.py +2 -0
torch-ext/activation/poly_norm.py +9 -11
torch-ext/activation/rms_norm.py +6 -3
torch-ext/torch_binding.cpp +9 -6
torch-ext/torch_binding.h +14 -4

README.md CHANGED Viewed

@@ -32,6 +32,7 @@ print(poly_norm(x))
 - Test cases are from the Motif LLM
 - You can reproduce the results with:
 ```bash
 cd tests
 pytest --run-perf --do-plot
@@ -39,3 +40,47 @@ pytest --run-perf --do-plot
 ![PolyNorm Performance](./tests/perf.png)

 - Test cases are from the Motif LLM
 - You can reproduce the results with:
 ```bash
 cd tests
 pytest --run-perf --do-plot
 ![PolyNorm Performance](./tests/perf.png)
+## Pre-commit Hooks
+This project uses [pre-commit](https://pre-commit.com/) to automatically check and format code before commits.
+### Setup
+1. Install pre-commit:
+   ```bash
+   pip install pre-commit
+   ```
+2. Install the git hooks:
+   ```bash
+   pre-commit install
+   ```
+Once installed, the configured hooks will run automatically on each commit.
+### Included Hooks
+The following tools are run via pre-commit:
+- **[yapf](https://github.com/google/yapf)** – Python code formatter
+- **[typos](https://github.com/crate-ci/typos)** – Spell checker for common typos
+- **[isort](https://github.com/PyCQA/isort)** – Organizes and sorts Python imports
+- **[clang-format](https://clang.llvm.org/docs/ClangFormat.html)** – Formats C++/CUDA code (`--style=file`)
+- **[pymarkdown](https://github.com/jackdewinter/pymarkdown)** – Lints and auto-fixes Markdown files
+- **[actionlint](https://github.com/rhysd/actionlint)** – Validates GitHub Actions workflows
+### Usage
+- Run all checks on the entire codebase:
+   ```bash
+   pre-commit run --all-files
+   ```
+- Run a specific hook (example: isort):
+   ```bash
+   pre-commit run isort --all-files
+   ```

activation/assert_utils.h CHANGED Viewed

@@ -3,12 +3,15 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <torch/all.h>
-inline void AssertTensorNotNull(const torch::Tensor &tensor, const std::string &name) {
   TORCH_INTERNAL_ASSERT(tensor.defined(), name + " tensor should not be null.");
 }
-inline void AssertTensorShapeEqual(const torch::Tensor &tensor_a, const torch::Tensor &tensor_b,
-  const std::string &name_a, const std::string &name_b) {
   AssertTensorNotNull(tensor_a, name_a);
   AssertTensorNotNull(tensor_b, name_b);
@@ -17,6 +20,7 @@ inline void AssertTensorShapeEqual(const torch::Tensor &tensor_a, const torch::T
   auto tensor_shape_b = tensor_b.sizes();
   TORCH_INTERNAL_ASSERT(tensor_shape_a.equals(tensor_shape_b),
-    "{} tensor shape should be equal to {} tensor shape. (actual: {}, expected: {})",
-    name_a, name_b, tensor_shape_a, tensor_shape_b);
 }

 #include <ATen/cuda/CUDAContext.h>
 #include <torch/all.h>
+inline void AssertTensorNotNull(const torch::Tensor &tensor,
+                                const std::string &name) {
   TORCH_INTERNAL_ASSERT(tensor.defined(), name + " tensor should not be null.");
 }
+inline void AssertTensorShapeEqual(const torch::Tensor &tensor_a,
+                                   const torch::Tensor &tensor_b,
+                                   const std::string &name_a,
+                                   const std::string &name_b) {
   AssertTensorNotNull(tensor_a, name_a);
   AssertTensorNotNull(tensor_b, name_b);
   auto tensor_shape_b = tensor_b.sizes();
   TORCH_INTERNAL_ASSERT(tensor_shape_a.equals(tensor_shape_b),
+                        "{} tensor shape should be equal to {} tensor shape. "
+                        "(actual: {}, expected: {})",
+                        name_a, name_b, tensor_shape_a, tensor_shape_b);
 }

activation/atomic_utils.h CHANGED Viewed

@@ -1,35 +1,38 @@
 #pragma once
-#include <cuda.h>
 #include <c10/util/BFloat16.h>
 #include <c10/util/Half.h>
 namespace motif {
-template<typename scalar_t, typename acc_t>
-__device__ inline void atomic_add(scalar_t* address, acc_t value) {
   // TODO: change assert to a static_assert if possible
-	assert(false && "Unsupported type for atomic_add");
 }
-template<>
-__device__ inline void atomic_add<float, float>(float* address, float value) {
-	atomicAdd(address, value);
 }
-template<>
-__device__ inline void atomic_add<double, double>(double* address, double value) {
-	atomicAdd(address, value);
 }
-template<>
-__device__ inline void atomic_add<c10::BFloat16, float>(c10::BFloat16* _address, float value) {
-  volatile c10::BFloat16* address = const_cast<volatile c10::BFloat16*>(_address);
   size_t offset = (size_t)address & 0x2;
-  volatile uint16_t* address_as_short =
-      reinterpret_cast<volatile uint16_t*>(reinterpret_cast<volatile char*>(address));
-  volatile uint32_t* address_as_uint =
-      reinterpret_cast<volatile uint*>(reinterpret_cast<volatile char*>(address) - offset);
   bool is_32bit_aligned = offset == 0;
   uint32_t current = address_as_uint[0];
@@ -39,21 +42,24 @@ __device__ inline void atomic_add<c10::BFloat16, float>(c10::BFloat16* _address,
     expected = current;
     c10::BFloat16 current_bf16(address_as_short[0], c10::BFloat16::from_bits());
     c10::BFloat16 next_bf16 = current_bf16 + value;
-    uint32_t next = is_32bit_aligned ? (current & 0xffff0000) | next_bf16.x
-                                     : (current & 0x0000ffff) | (next_bf16.x << 16);
-    current = atomicCAS(const_cast<uint32_t*>(address_as_uint), expected, next);
   } while (current != expected);
 }
-template<>
-__device__ inline void atomic_add<c10::Half, float>(c10::Half* _address, float value) {
-  volatile c10::Half* address = const_cast<volatile c10::Half*>(_address);
   size_t offset = (size_t)address & 0x2;
-  volatile uint16_t* address_as_short =
-      reinterpret_cast<volatile uint16_t*>(reinterpret_cast<volatile char*>(address));
-  volatile uint32_t* address_as_uint =
-      reinterpret_cast<volatile uint*>(reinterpret_cast<volatile char*>(address) - offset);
   bool is_32bit_aligned = offset == 0;
   uint32_t current = address_as_uint[0];
@@ -63,11 +69,12 @@ __device__ inline void atomic_add<c10::Half, float>(c10::Half* _address, float v
     expected = current;
     c10::Half current_half(address_as_short[0], c10::Half::from_bits());
     c10::Half next_half = current_half + value;
-    uint32_t next = is_32bit_aligned ? (current & 0xffff0000) | next_half.x
-                                     : (current & 0x0000ffff) | (next_half.x << 16);
-    current = atomicCAS(const_cast<uint32_t*>(address_as_uint), expected, next);
   } while (current != expected);
 }
 } // namespace motif

 #pragma once
 #include <c10/util/BFloat16.h>
 #include <c10/util/Half.h>
+#include <cuda.h>
 namespace motif {
+template <typename scalar_t, typename acc_t>
+__device__ inline void atomic_add(scalar_t *address, acc_t value) {
   // TODO: change assert to a static_assert if possible
+  assert(false && "Unsupported type for atomic_add");
 }
+template <>
+__device__ inline void atomic_add<float, float>(float *address, float value) {
+  atomicAdd(address, value);
 }
+template <>
+__device__ inline void atomic_add<double, double>(double *address,
+                                                  double value) {
+  atomicAdd(address, value);
 }
+template <>
+__device__ inline void atomic_add<c10::BFloat16, float>(c10::BFloat16 *_address,
+                                                        float value) {
+  volatile c10::BFloat16 *address =
+      const_cast<volatile c10::BFloat16 *>(_address);
   size_t offset = (size_t)address & 0x2;
+  volatile uint16_t *address_as_short = reinterpret_cast<volatile uint16_t *>(
+      reinterpret_cast<volatile char *>(address));
+  volatile uint32_t *address_as_uint = reinterpret_cast<volatile uint *>(
+      reinterpret_cast<volatile char *>(address) - offset);
   bool is_32bit_aligned = offset == 0;
   uint32_t current = address_as_uint[0];
     expected = current;
     c10::BFloat16 current_bf16(address_as_short[0], c10::BFloat16::from_bits());
     c10::BFloat16 next_bf16 = current_bf16 + value;
+    uint32_t next = is_32bit_aligned
+                        ? (current & 0xffff0000) | next_bf16.x
+                        : (current & 0x0000ffff) | (next_bf16.x << 16);
+    current =
+        atomicCAS(const_cast<uint32_t *>(address_as_uint), expected, next);
   } while (current != expected);
 }
+template <>
+__device__ inline void atomic_add<c10::Half, float>(c10::Half *_address,
+                                                    float value) {
+  volatile c10::Half *address = const_cast<volatile c10::Half *>(_address);
   size_t offset = (size_t)address & 0x2;
+  volatile uint16_t *address_as_short = reinterpret_cast<volatile uint16_t *>(
+      reinterpret_cast<volatile char *>(address));
+  volatile uint32_t *address_as_uint = reinterpret_cast<volatile uint *>(
+      reinterpret_cast<volatile char *>(address) - offset);
   bool is_32bit_aligned = offset == 0;
   uint32_t current = address_as_uint[0];
     expected = current;
     c10::Half current_half(address_as_short[0], c10::Half::from_bits());
     c10::Half next_half = current_half + value;
+    uint32_t next = is_32bit_aligned
+                        ? (current & 0xffff0000) | next_half.x
+                        : (current & 0x0000ffff) | (next_half.x << 16);
+    current =
+        atomicCAS(const_cast<uint32_t *>(address_as_uint), expected, next);
   } while (current != expected);
 }
 } // namespace motif

activation/block_reduce.h CHANGED Viewed

@@ -1,7 +1,8 @@
 namespace motif {
 template <typename acc_t, int BLOCK_SIZE>
-__device__ acc_t _block_reduce_sum(acc_t* shared, const float val, const int d) {
   // TODO: Optimize with warp-level primitives
   __syncthreads();
@@ -17,4 +18,4 @@ __device__ acc_t _block_reduce_sum(acc_t* shared, const float val, const int d)
   return shared[0];
 }
-} // motif

 namespace motif {
 template <typename acc_t, int BLOCK_SIZE>
+__device__ acc_t _block_reduce_sum(acc_t *shared, const float val,
+                                   const int d) {
   // TODO: Optimize with warp-level primitives
   __syncthreads();
   return shared[0];
 }
+} // namespace motif

activation/dispatch_utils.h CHANGED Viewed

@@ -6,10 +6,11 @@
 #include <torch/all.h>
-#define MOTIF_DISPATCH_CASE_FLOATING_TYPES(...)         \
-  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
-  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)  \
   AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
-#define MOTIF_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
-  AT_DISPATCH_SWITCH(TYPE, NAME, MOTIF_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))

 #include <torch/all.h>
+#define MOTIF_DISPATCH_CASE_FLOATING_TYPES(...)                                \
+  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)                         \
+  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)                          \
   AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
+#define MOTIF_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)                         \
+  AT_DISPATCH_SWITCH(TYPE, NAME,                                               \
+                     MOTIF_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))

activation/rms_norm.cu CHANGED Viewed

@@ -1,26 +1,23 @@
-#include <ATen/cuda/CUDAContext.h>
 #include <ATen/Functions.h>
-#include <torch/all.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <cmath>
-#include "cuda_compat.h"
-#include "dispatch_utils.h"
 #include "assert_utils.h"
 #include "atomic_utils.h"
 #include "block_reduce.h"
 namespace motif {
 template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
-__global__ void rms_norm_kernel(
-    scalar_t* __restrict__ out,          // [..., d]
-    const scalar_t* __restrict__ input,  // [..., d]
-    const scalar_t* __restrict__ weight, // [d]
-    const float eps,
-    const int d
-    ) {
   const int64_t token_idx = blockIdx.x;
   const int64_t vec_idx = threadIdx.x;
@@ -44,15 +41,13 @@ __global__ void rms_norm_kernel(
 }
 template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
-__global__ void rms_norm_backward_kernel(
-    scalar_t* __restrict__ input_grad,         // [..., d]
-    acc_t* __restrict__ temp_weight_grad,      // [..., d]
-    const scalar_t* __restrict__ output_grad,  // [..., d]
-    const scalar_t* __restrict__ input,        // [..., d]
-    const scalar_t* __restrict__ weight,       // [d]
-    const float eps,
-    const int d
-    ) {
   const int64_t token_idx = blockIdx.x;
   const int64_t vec_idx = threadIdx.x;
   acc_t d_sum = 0.0f;
@@ -80,8 +75,7 @@ __global__ void rms_norm_backward_kernel(
     acc_t dy = output_grad[token_idx * d + idx];
     acc_t w = weight[idx];
-    input_grad[token_idx * d + idx] =
-      scale * dy * w - dxx * x;
     if (temp_weight_grad) {
       temp_weight_grad[token_idx * d + idx] = dy * x * scale;
@@ -89,14 +83,12 @@ __global__ void rms_norm_backward_kernel(
   }
 }
-}  // namespace motif
-void rms_norm(torch::Tensor& out,           // [..., d]
-               const torch::Tensor& input,  // [..., d]
-               const torch::Tensor& weight, // [d]
-               double eps)
-{
   AssertTensorShapeEqual(input, out, "input", "out");
   AssertTensorNotNull(weight, "weight");
   // TODO shape check
@@ -110,25 +102,20 @@ void rms_norm(torch::Tensor& out,           // [..., d]
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  MOTIF_DISPATCH_FLOATING_TYPES(
-    input.scalar_type(), "rms_norm_kernel", [&] {
-      motif::rms_norm_kernel<scalar_t, float, BLOCK_SIZE>
-        <<<grid, block, 0, stream>>>(
-          out.data_ptr<scalar_t>(),
-          input.data_ptr<scalar_t>(),
-          weight.data_ptr<scalar_t>(),
-          eps, d);
-    }
-  );
 }
-void rms_norm_backward(
-  torch::Tensor& input_grad,        // [..., d]
-  torch::Tensor& weight_grad,       // [..., d]
-  const torch::Tensor& output_grad, // [d]
-  const torch::Tensor& input,       // [d]
-  const torch::Tensor& weight,      // [d]
-  double eps) {
   AssertTensorShapeEqual(input, input_grad, "input", "input_grad");
   AssertTensorShapeEqual(input, output_grad, "input", "output_grad");
   AssertTensorNotNull(weight, "weight");
@@ -143,24 +130,20 @@ void rms_norm_backward(
   dim3 block(BLOCK_SIZE);
   torch::Tensor temp_weight_grad =
-    torch::empty({num_tokens, d},
-    input.options().dtype(torch::kFloat));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   MOTIF_DISPATCH_FLOATING_TYPES(
-    input.scalar_type(), "rms_norm_backward_kernel", [&] {
-      motif::rms_norm_backward_kernel<scalar_t, float, BLOCK_SIZE>
-        <<<grid, block, 0, stream>>>(
-          input_grad.data_ptr<scalar_t>(),
-          temp_weight_grad.data_ptr<float>(),
-          output_grad.data_ptr<scalar_t>(),
-          input.data_ptr<scalar_t>(),
-          weight.data_ptr<scalar_t>(),
-          eps, d);
-    }
-  );
   if (weight_grad.defined()) {
     at::sum_out(weight_grad, temp_weight_grad, {0});

 #include <ATen/Functions.h>
+#include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
 #include <cmath>
 #include "assert_utils.h"
 #include "atomic_utils.h"
 #include "block_reduce.h"
+#include "cuda_compat.h"
+#include "dispatch_utils.h"
 namespace motif {
 template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
+__global__ void rms_norm_kernel(scalar_t *__restrict__ out,          // [..., d]
+                                const scalar_t *__restrict__ input,  // [..., d]
+                                const scalar_t *__restrict__ weight, // [d]
+                                const float eps, const int d) {
   const int64_t token_idx = blockIdx.x;
   const int64_t vec_idx = threadIdx.x;
 }
 template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
+__global__ void
+rms_norm_backward_kernel(scalar_t *__restrict__ input_grad,        // [..., d]
+                         acc_t *__restrict__ temp_weight_grad,     // [..., d]
+                         const scalar_t *__restrict__ output_grad, // [..., d]
+                         const scalar_t *__restrict__ input,       // [..., d]
+                         const scalar_t *__restrict__ weight,      // [d]
+                         const float eps, const int d) {
   const int64_t token_idx = blockIdx.x;
   const int64_t vec_idx = threadIdx.x;
   acc_t d_sum = 0.0f;
     acc_t dy = output_grad[token_idx * d + idx];
     acc_t w = weight[idx];
+    input_grad[token_idx * d + idx] = scale * dy * w - dxx * x;
     if (temp_weight_grad) {
       temp_weight_grad[token_idx * d + idx] = dy * x * scale;
   }
 }
+} // namespace motif
+void rms_norm(torch::Tensor &out,          // [..., d]
+              const torch::Tensor &input,  // [..., d]
+              const torch::Tensor &weight, // [d]
+              double eps) {
   AssertTensorShapeEqual(input, out, "input", "out");
   AssertTensorNotNull(weight, "weight");
   // TODO shape check
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  MOTIF_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
+    motif::rms_norm_kernel<scalar_t, float, BLOCK_SIZE>
+        <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(),
+                                     input.data_ptr<scalar_t>(),
+                                     weight.data_ptr<scalar_t>(), eps, d);
+  });
 }
+void rms_norm_backward(torch::Tensor &input_grad,        // [..., d]
+                       torch::Tensor &weight_grad,       // [..., d]
+                       const torch::Tensor &output_grad, // [d]
+                       const torch::Tensor &input,       // [d]
+                       const torch::Tensor &weight,      // [d]
+                       double eps) {
   AssertTensorShapeEqual(input, input_grad, "input", "input_grad");
   AssertTensorShapeEqual(input, output_grad, "input", "output_grad");
   AssertTensorNotNull(weight, "weight");
   dim3 block(BLOCK_SIZE);
   torch::Tensor temp_weight_grad =
+      torch::empty({num_tokens, d}, input.options().dtype(torch::kFloat));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   MOTIF_DISPATCH_FLOATING_TYPES(
+      input.scalar_type(), "rms_norm_backward_kernel", [&] {
+        motif::rms_norm_backward_kernel<scalar_t, float, BLOCK_SIZE>
+            <<<grid, block, 0, stream>>>(input_grad.data_ptr<scalar_t>(),
+                                         temp_weight_grad.data_ptr<float>(),
+                                         output_grad.data_ptr<scalar_t>(),
+                                         input.data_ptr<scalar_t>(),
+                                         weight.data_ptr<scalar_t>(), eps, d);
+      });
   if (weight_grad.defined()) {
     at::sum_out(weight_grad, temp_weight_grad, {0});

tests/conftest.py CHANGED Viewed

@@ -33,8 +33,7 @@ def plot(perf_results: list[PerfResult]):
             textfont=dict(size=14),
             textposition="outside",
             # width=[bar_width] * len(x_labels),
-        )
-    )
     fig.add_trace(
         go.Bar(
@@ -46,12 +45,12 @@ def plot(perf_results: list[PerfResult]):
             textfont=dict(size=14),
             textposition="outside",
             # width=[bar_width] * len(x_labels),
-        )
-    )
     fig.update_layout(
         title=dict(
-            text="<b>Speedup over torch (higher is better) (MI250, torch 2.7, ROCm 6.3)</b>",
             font=dict(size=24),
         ),
         legend=dict(
@@ -96,12 +95,14 @@ def plot(perf_results: list[PerfResult]):
 def pytest_addoption(parser):
-    parser.addoption(
-        "--run-perf", action="store_true", default=False, help="Run perf tests"
-    )
-    parser.addoption(
-        "--do-plot", action="store_true", default=False, help="Plot performance results"
-    )
 @pytest.fixture
@@ -117,10 +118,10 @@ def pytest_configure(config):
     if DO_PLOT and not run_perf:
         raise ValueError(
             "Cannot plot performance results without running performance tests. "
-            "Please use --run-perf option."
-        )
-    config.addinivalue_line("markers", "perf: mark test as performance-related")
 def pytest_collection_modifyitems(config, items):
@@ -128,8 +129,7 @@ def pytest_collection_modifyitems(config, items):
     skip_perf = pytest.mark.skip(reason="need --run-perf option to run")
     skip_normal = pytest.mark.skip(
-        reason="normal tests skipped when --run-perf is used"
-    )
     for item in items:
         if "perf" in item.keywords and not run_perf:
             item.add_marker(skip_perf)

             textfont=dict(size=14),
             textposition="outside",
             # width=[bar_width] * len(x_labels),
+        ))
     fig.add_trace(
         go.Bar(
             textfont=dict(size=14),
             textposition="outside",
             # width=[bar_width] * len(x_labels),
+        ))
     fig.update_layout(
         title=dict(
+            text=
+            "<b>Speedup over torch (higher is better) (MI250, torch 2.7, ROCm 6.3)</b>",
             font=dict(size=24),
         ),
         legend=dict(
 def pytest_addoption(parser):
+    parser.addoption("--run-perf",
+                     action="store_true",
+                     default=False,
+                     help="Run perf tests")
+    parser.addoption("--do-plot",
+                     action="store_true",
+                     default=False,
+                     help="Plot performance results")
 @pytest.fixture
     if DO_PLOT and not run_perf:
         raise ValueError(
             "Cannot plot performance results without running performance tests. "
+            "Please use --run-perf option.")
+    config.addinivalue_line("markers",
+                            "perf: mark test as performance-related")
 def pytest_collection_modifyitems(config, items):
     skip_perf = pytest.mark.skip(reason="need --run-perf option to run")
     skip_normal = pytest.mark.skip(
+        reason="normal tests skipped when --run-perf is used")
     for item in items:
         if "perf" in item.keywords and not run_perf:
             item.add_marker(skip_perf)

tests/kernels/allclose_default.py CHANGED Viewed

@@ -3,7 +3,11 @@ import torch
 # Reference default values of atol and rtol are from
 # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67
 default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5}
-default_rtol = {torch.float16: 1e-3, torch.bfloat16: 1.6e-2, torch.float: 1.3e-6}
 def get_default_atol(output) -> float:

 # Reference default values of atol and rtol are from
 # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67
 default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5}
+default_rtol = {
+    torch.float16: 1e-3,
+    torch.bfloat16: 1.6e-2,
+    torch.float: 1.3e-6
+}
 def get_default_atol(output) -> float:

tests/kernels/test_poly_norm.py CHANGED Viewed

@@ -13,23 +13,20 @@ DTYPES = [torch.float, torch.bfloat16, torch.half]
 NUM_TOKENS = [7, 13]  # Arbitrary values for testing
 D = [513]  # Arbitrary values for testing
 SEEDS = [0]
-CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 def norm(x, eps: float) -> torch.Tensor:
     return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
-def poly_norm(
-    x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, eps: float
-) -> torch.Tensor:
     x = x.float()
-    return (
-        weight[0] * norm(x**3, eps)
-        + weight[1] * norm(x**2, eps)
-        + weight[2] * norm(x, eps)
-        + bias
-    ).to(weight.dtype)
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)

 NUM_TOKENS = [7, 13]  # Arbitrary values for testing
 D = [513]  # Arbitrary values for testing
 SEEDS = [0]
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
 def norm(x, eps: float) -> torch.Tensor:
     return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
+def poly_norm(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor,
+              eps: float) -> torch.Tensor:
     x = x.float()
+    return (weight[0] * norm(x**3, eps) + weight[1] * norm(x**2, eps) +
+            weight[2] * norm(x, eps) + bias).to(weight.dtype)
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)

tests/kernels/test_poly_norm_perf.py CHANGED Viewed

@@ -94,7 +94,8 @@ def test_poly_norm(
         return start.elapsed_time(end) / NUM_REP
     kernel_time_ms = time_cuda(lambda: layer(x))
-    torch_fn_time = time_cuda(lambda: torch_fn(x_ref, weight_ref, bias_ref, eps))
     PERF_RESULTS.append(
         PerfResult(
@@ -103,11 +104,12 @@ def test_poly_norm(
             dtype=dtype,
             kernel_time_ms=kernel_time_ms,
             torch_time_ms=torch_fn_time,
-        )
-    )
-    kernel_time_ms = time_cuda(lambda: mod_out.backward(out_grad, retain_graph=True))
-    torch_fn_time = time_cuda(lambda: ref_out.backward(out_grad, retain_graph=True))
     PERF_RESULTS.append(
         PerfResult(
@@ -116,5 +118,4 @@ def test_poly_norm(
             dtype=dtype,
             kernel_time_ms=kernel_time_ms,
             torch_time_ms=torch_fn_time,
-        )
-    )

         return start.elapsed_time(end) / NUM_REP
     kernel_time_ms = time_cuda(lambda: layer(x))
+    torch_fn_time = time_cuda(
+        lambda: torch_fn(x_ref, weight_ref, bias_ref, eps))
     PERF_RESULTS.append(
         PerfResult(
             dtype=dtype,
             kernel_time_ms=kernel_time_ms,
             torch_time_ms=torch_fn_time,
+        ))
+    kernel_time_ms = time_cuda(
+        lambda: mod_out.backward(out_grad, retain_graph=True))
+    torch_fn_time = time_cuda(
+        lambda: ref_out.backward(out_grad, retain_graph=True))
     PERF_RESULTS.append(
         PerfResult(
             dtype=dtype,
             kernel_time_ms=kernel_time_ms,
             torch_time_ms=torch_fn_time,
+        ))

tests/kernels/test_rms_norm.py CHANGED Viewed

@@ -13,7 +13,9 @@ DTYPES = [torch.float, torch.bfloat16, torch.half]
 NUM_TOKENS = [7, 13]  # Arbitrary values for testing
 D = [513]  # Arbitrary values for testing
 SEEDS = [0]
-CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)

 NUM_TOKENS = [7, 13]  # Arbitrary values for testing
 D = [513]  # Arbitrary values for testing
 SEEDS = [0]
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)

tests/kernels/utils.py CHANGED Viewed

@@ -46,15 +46,19 @@ def fp8_allclose(
     """
     Reference implementation of torch.allclose
     """
-    torch._refs._check_close_args(name="torch.allclose", a=a, b=b, rtol=rtol, atol=atol)
     return bool(
         torch.all(
-            torch.isclose(
-                a.double(), b.double(), rtol=rtol, atol=atol, equal_nan=equal_nan
-            )
-        ).item()
-    )
 # A special version of op check that has a restricted default set of test_utils
@@ -73,10 +77,9 @@ def opcheck(
     cond: bool = True,
 ) -> Dict[str, str]:
     with unittest.mock.patch("torch.allclose", new=fp8_allclose):
-        return (
-            torch.library.opcheck(
-                op, args, kwargs, test_utils=test_utils, raise_exception=raise_exception
-            )
-            if cond
-            else {}
-        )

     """
     Reference implementation of torch.allclose
     """
+    torch._refs._check_close_args(name="torch.allclose",
+                                  a=a,
+                                  b=b,
+                                  rtol=rtol,
+                                  atol=atol)
     return bool(
         torch.all(
+            torch.isclose(a.double(),
+                          b.double(),
+                          rtol=rtol,
+                          atol=atol,
+                          equal_nan=equal_nan)).item())
 # A special version of op check that has a restricted default set of test_utils
     cond: bool = True,
 ) -> Dict[str, str]:
     with unittest.mock.patch("torch.allclose", new=fp8_allclose):
+        return (torch.library.opcheck(op,
+                                      args,
+                                      kwargs,
+                                      test_utils=test_utils,
+                                      raise_exception=raise_exception)
+                if cond else {})

torch-ext/activation/layers.py CHANGED Viewed

@@ -7,6 +7,7 @@ from .rms_norm import RMSNormFunction
 class PolyNorm(nn.Module):
     def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
@@ -28,6 +29,7 @@ class PolyNorm(nn.Module):
 class RMSNorm(nn.Module):
     def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))

 class PolyNorm(nn.Module):
     def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
 class RMSNorm(nn.Module):
     def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))

torch-ext/activation/poly_norm.py CHANGED Viewed

@@ -26,16 +26,14 @@ class PolyNormFunction(torch.autograd.Function):
         input, weight = ctx.saved_tensors
         eps = ctx.eps
-        input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
-        weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
-        bias_grad = (
-            torch.empty(1, dtype=weight.dtype, device=weight.device)
-            if ctx.needs_input_grad[2]
-            else None
-        )
-        ops.poly_norm_backward(
-            input_grad, weight_grad, bias_grad, output_grad, input, weight, eps
-        )
         return input_grad, weight_grad, bias_grad, None

         input, weight = ctx.saved_tensors
         eps = ctx.eps
+        input_grad = torch.empty_like(
+            input) if ctx.needs_input_grad[0] else None
+        weight_grad = torch.empty_like(
+            weight) if ctx.needs_input_grad[1] else None
+        bias_grad = (torch.empty(1, dtype=weight.dtype, device=weight.device)
+                     if ctx.needs_input_grad[2] else None)
+        ops.poly_norm_backward(input_grad, weight_grad, bias_grad, output_grad,
+                               input, weight, eps)
         return input_grad, weight_grad, bias_grad, None

torch-ext/activation/rms_norm.py CHANGED Viewed

@@ -26,9 +26,12 @@ class RMSNormFunction(torch.autograd.Function):
         input, weight = ctx.saved_tensors
         eps = ctx.eps
-        input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
-        weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
-        ops.rms_norm_backward(input_grad, weight_grad, output_grad, input, weight, eps)
         return input_grad, weight_grad, None

         input, weight = ctx.saved_tensors
         eps = ctx.eps
+        input_grad = torch.empty_like(
+            input) if ctx.needs_input_grad[0] else None
+        weight_grad = torch.empty_like(
+            weight) if ctx.needs_input_grad[1] else None
+        ops.rms_norm_backward(input_grad, weight_grad, output_grad, input,
+                              weight, eps)
         return input_grad, weight_grad, None

torch-ext/torch_binding.cpp CHANGED Viewed

@@ -5,18 +5,21 @@
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // Activation ops
-  ops.def("poly_norm(Tensor! out, Tensor input, Tensor weight, Tensor bias, float eps) -> ()");
-  ops.def("poly_norm_backward(Tensor! input_grad, Tensor! weight_grad, Tensor! bias_grad, Tensor output_grad, Tensor input, Tensor weight, float eps) -> ()");
   ops.impl("poly_norm", torch::kCUDA, &poly_norm);
   ops.impl("poly_norm_backward", torch::kCUDA, &poly_norm_backward);
   // Activation ops
-  ops.def("rms_norm(Tensor! out, Tensor input, Tensor weight, float eps) -> ()");
-  ops.def("rms_norm_backward(Tensor! input_grad, Tensor! weight_grad, Tensor output_grad, Tensor input, Tensor weight, float eps) -> ()");
   ops.impl("rms_norm", torch::kCUDA, &rms_norm);
   ops.impl("rms_norm_backward", torch::kCUDA, &rms_norm_backward);
 }
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)

 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // Activation ops
+  ops.def("poly_norm(Tensor! out, Tensor input, Tensor weight, Tensor bias, "
+          "float eps) -> ()");
+  ops.def("poly_norm_backward(Tensor! input_grad, Tensor! weight_grad, Tensor! "
+          "bias_grad, Tensor output_grad, Tensor input, Tensor weight, float "
+          "eps) -> ()");
   ops.impl("poly_norm", torch::kCUDA, &poly_norm);
   ops.impl("poly_norm_backward", torch::kCUDA, &poly_norm_backward);
   // Activation ops
+  ops.def(
+      "rms_norm(Tensor! out, Tensor input, Tensor weight, float eps) -> ()");
+  ops.def("rms_norm_backward(Tensor! input_grad, Tensor! weight_grad, Tensor "
+          "output_grad, Tensor input, Tensor weight, float eps) -> ()");
   ops.impl("rms_norm", torch::kCUDA, &rms_norm);
   ops.impl("rms_norm_backward", torch::kCUDA, &rms_norm_backward);
 }
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)

torch-ext/torch_binding.h CHANGED Viewed

@@ -2,8 +2,18 @@
 #include <torch/torch.h>
-void poly_norm(torch::Tensor &out, const torch::Tensor &input, const torch::Tensor &weights, const torch::Tensor &bias, double eps);
-void poly_norm_backward(torch::Tensor& input_grad, torch::Tensor& weight_grad, torch::Tensor& bias_grad, const torch::Tensor& output_grad, const torch::Tensor& input, const torch::Tensor& weight, double eps);
-void rms_norm(torch::Tensor &out, const torch::Tensor &input, const torch::Tensor &weights, double eps);
-void rms_norm_backward(torch::Tensor& input_grad, torch::Tensor& weight_grad, const torch::Tensor& output_grad, const torch::Tensor& input, const torch::Tensor& weight, double eps);

 #include <torch/torch.h>
+void poly_norm(torch::Tensor &out, const torch::Tensor &input,
+               const torch::Tensor &weights, const torch::Tensor &bias,
+               double eps);
+void poly_norm_backward(torch::Tensor &input_grad, torch::Tensor &weight_grad,
+                        torch::Tensor &bias_grad,
+                        const torch::Tensor &output_grad,
+                        const torch::Tensor &input, const torch::Tensor &weight,
+                        double eps);
+void rms_norm(torch::Tensor &out, const torch::Tensor &input,
+              const torch::Tensor &weights, double eps);
+void rms_norm_backward(torch::Tensor &input_grad, torch::Tensor &weight_grad,
+                       const torch::Tensor &output_grad,
+                       const torch::Tensor &input, const torch::Tensor &weight,
+                       double eps);