ca1207 committed on
Commit f8a7e6f · 1 Parent(s): 18ec195

optimize poly norm kernel

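For reference, both the old and the optimized kernels evaluate the same per-token PolyNorm expression; reconstructed here from the kernel source (the notation is ours, not part of the commit):

\[
\mathrm{out}_i = w_2\,\frac{x_i}{\sqrt{\overline{x^2}+\varepsilon}}
              + w_1\,\frac{x_i^2}{\sqrt{\overline{x^4}+\varepsilon}}
              + w_0\,\frac{x_i^3}{\sqrt{\overline{x^6}+\varepsilon}} + b,
\qquad
\overline{x^k} = \frac{1}{d}\sum_{j=1}^{d} x_j^{\,k}
\]

The optimization keeps this math but replaces the pow()-based loops and the hand-rolled _block_reduce_sum with vectorized loads, cub/hipcub block reductions, and precomputed w * rsqrt(mean + eps) factors.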
activation/cuda_compat.h CHANGED
@@ -1,18 +1,20 @@
  #pragma once

- #ifdef USE_ROCM
- #include <hip/hip_runtime.h>
  #endif

  #ifndef USE_ROCM
- #define WARP_SIZE 32
  #else
- #define WARP_SIZE warpSize
  #endif

  #ifndef USE_ROCM
- #define VLLM_LDG(arg) __ldg(arg)
  #else
- #define VLLM_LDG(arg) *(arg)
  #endif
-

  #pragma once

+ #ifndef USE_ROCM
+ #include <cub/cub.cuh>
+ #else
+ #include <hip/hip_runtime.h>
+ #include <hipcub/hipcub.hpp>
  #endif

  #ifndef USE_ROCM
+ #define WARP_SIZE 32
  #else
+ #define WARP_SIZE warpSize
  #endif

  #ifndef USE_ROCM
+ #define VLLM_LDG(arg) __ldg(arg)
  #else
+ #define VLLM_LDG(arg) *(arg)
  #endif
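The header now pulls in cub on CUDA builds and hipcub on ROCm builds because the rewritten poly_norm kernels below use cub-style block reductions instead of the previous _block_reduce_sum helper. A minimal sketch of that reduction pattern, with an illustrative kernel name and block size that are not part of the commit:

// Illustrative only: block-wide sum in the style used by the new
// poly_norm kernels (cub::BlockReduce + shared TempStorage).
template <int BLOCK_THREADS>
__global__ void block_sum_example(const float *in, float *out) {
  using BlockReduce = cub::BlockReduce<float, BLOCK_THREADS>;
  __shared__ typename BlockReduce::TempStorage tmp;

  float v = in[blockIdx.x * BLOCK_THREADS + threadIdx.x];
  float total = BlockReduce(tmp).Sum(v);  // result is valid in thread 0 only

  if (threadIdx.x == 0) {
    out[blockIdx.x] = total;
  }
}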
activation/poly_norm.cu CHANGED
@@ -1,246 +1,555 @@
- #include <ATen/cuda/CUDAContext.h>
  #include <ATen/Functions.h>
- #include <torch/all.h>
  #include <c10/cuda/CUDAGuard.h>

  #include <cmath>

- #include "cuda_compat.h"
- #include "dispatch_utils.h"
  #include "assert_utils.h"
  #include "atomic_utils.h"
  #include "block_reduce.h"

  namespace motif {

- template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
- __global__ void poly_norm_kernel(
-     scalar_t* __restrict__ out,          // [..., d]
-     const scalar_t* __restrict__ input,  // [..., d]
-     const scalar_t* __restrict__ weight, // [3]
-     const scalar_t* __restrict__ bias,   // [1]
-     const float eps,
-     const int d
- ) {
    const int64_t token_idx = blockIdx.x;

-   acc_t sum = 0.0f;
-   acc_t sum_square = 0.0f;
-   acc_t sum_cube = 0.0f;

    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-     acc_t x = input[token_idx * d + idx];
-     sum += pow(x, 2.0f);
-     sum_square += pow(x, 4.0f);
-     sum_cube += pow(x, 6.0f);
    }

-   __shared__ acc_t shared[BLOCK_SIZE];

-   acc_t mean = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum, d) / d;
-   acc_t mean_square = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_square, d) / d;
-   acc_t mean_cube = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_cube, d) / d;

-   acc_t w0 = weight[0];
-   acc_t w1 = weight[1];
-   acc_t w2 = weight[2];
-   acc_t b = bias[0];

-   acc_t divisor = sqrt(mean + eps);
-   acc_t divisor_square = sqrt(mean_square + eps);
-   acc_t divisor_cube = sqrt(mean_cube + eps);

    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-     acc_t x = input[token_idx * d + idx];
-     acc_t x_square = pow(x, 2.0f);
-     acc_t x_cube = pow(x, 3.0f);
-     out[token_idx * d + idx] = w2 * x / divisor +
-         w1 * x_square / divisor_square +
-         w0 * x_cube / divisor_cube + b;
    }
  }

- template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
- __global__ void poly_norm_backward_kernel(
-     scalar_t* __restrict__ input_grad,        // [..., d]
-     acc_t* __restrict__ temp_weight_grad,     // [..., 3]
-     const scalar_t* __restrict__ output_grad, // [..., d]
-     const scalar_t* __restrict__ input,       // [..., d]
-     const scalar_t* __restrict__ weight,      // [3]
-     const float eps,
-     const int d
- ) {
-   const int64_t token_idx = blockIdx.x;

    acc_t w0 = weight[0];
    acc_t w1 = weight[1];
    acc_t w2 = weight[2];

-   acc_t sum_2 = 0.0f;
-   acc_t sum_4 = 0.0f;
-   acc_t sum_6 = 0.0f;

-   acc_t sum_dx_1 = 0.0f;
-   acc_t sum_dx_2 = 0.0f;
-   acc_t sum_dx_3 = 0.0f;

    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
      acc_t dy = output_grad[token_idx * d + idx];

-     acc_t x_1 = input[token_idx * d + idx];
-     acc_t x_2 = x_1 * x_1;
-     acc_t x_3 = x_2 * x_1;
-     acc_t x_4 = x_2 * x_2;
-     acc_t x_6 = x_3 * x_3;

-     sum_2 += x_2;
-     sum_4 += x_4;
-     sum_6 += x_6;

-     sum_dx_1 += dy * x_1;
-     sum_dx_2 += dy * x_2;
-     sum_dx_3 += dy * x_3;
    }

-   __shared__ acc_t shared[BLOCK_SIZE];

-   acc_t mean_2 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_2, d) / d + eps;
-   acc_t mean_4 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_4, d) / d + eps;
-   acc_t mean_6 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_6, d) / d + eps;

-   sum_dx_1 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dx_1, d);
-   sum_dx_2 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dx_2, d);
-   sum_dx_3 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dx_3, d);

-   acc_t _mean_2 = powf(mean_2, -1.5);
-   acc_t _mean_4 = powf(mean_4, -1.5);
-   acc_t _mean_6 = powf(mean_6, -1.5);

-   acc_t sq_mean_2 = sqrtf(mean_2);
-   acc_t sq_mean_4 = sqrtf(mean_4);
-   acc_t sq_mean_6 = sqrtf(mean_6);

    acc_t sum_dw0 = 0;
    acc_t sum_dw1 = 0;
    acc_t sum_dw2 = 0;

    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
      acc_t dy = output_grad[token_idx * d + idx];
-     acc_t x_1 = input[token_idx * d + idx];
-     acc_t x_2 = x_1 * x_1;
-     acc_t x_3 = x_2 * x_1;
-
-     acc_t dx_3 =
-         _mean_6 * 3 * x_2 * (dy * mean_6 - x_3 * sum_dx_3 / d) * w0;
-     acc_t dx_2 =
-         _mean_4 * 2 * x_1 * (dy * mean_4 - x_2 * sum_dx_2 / d) * w1;
-     acc_t dx_1 =
-         _mean_2 * (dy * mean_2 - x_1 * sum_dx_1 / d) * w2;

      if (input_grad) {
-       input_grad[token_idx * d + idx] = dx_1 + dx_2 + dx_3;
      }

-     sum_dw0 += dy * (x_3 / sq_mean_6);
-     sum_dw1 += dy * (x_2 / sq_mean_4);
-     sum_dw2 += dy * (x_1 / sq_mean_2);
    }

-   if (temp_weight_grad) {
-     sum_dw0 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dw0, d);
-     sum_dw1 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dw1, d);
-     sum_dw2 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dw2, d);
-
-     if (threadIdx.x == 0) {
-       temp_weight_grad[token_idx * 3 + 0] = sum_dw0;
-       temp_weight_grad[token_idx * 3 + 1] = sum_dw1;
-       temp_weight_grad[token_idx * 3 + 2] = sum_dw2;
-     }
    }
  }

- } // namespace motif
-
-
- void poly_norm(torch::Tensor& out,          // [..., d]
-     const torch::Tensor& input,             // [..., d]
-     const torch::Tensor& weight,            // [3]
-     const torch::Tensor& bias,              // [1]
-     double eps)
- {
    AssertTensorShapeEqual(input, out, "input", "out");
    AssertTensorNotNull(weight, "weight");
    AssertTensorNotNull(bias, "bias");
    // TODO shape check

-   constexpr int BLOCK_SIZE = 256;
-
    int d = input.size(-1);
-   int64_t num_tokens = input.numel() / input.size(-1);
    dim3 grid(num_tokens);
-   dim3 block(BLOCK_SIZE);

    const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-   MOTIF_DISPATCH_FLOATING_TYPES(
-       input.scalar_type(), "poly_norm_kernel", [&] {
-         motif::poly_norm_kernel<scalar_t, float, BLOCK_SIZE>
-             <<<grid, block, 0, stream>>>(
-                 out.data_ptr<scalar_t>(),
-                 input.data_ptr<scalar_t>(),
-                 weight.data_ptr<scalar_t>(),
-                 bias.data_ptr<scalar_t>(), eps, d);
-       }
-   );
  }

- void poly_norm_backward(
-     torch::Tensor& input_grad,        // [..., d]
-     torch::Tensor& weight_grad,       // [..., d]
-     torch::Tensor& bias_grad,         // [..., d]
-     const torch::Tensor& output_grad, // [3]
-     const torch::Tensor& input,       // [3]
-     const torch::Tensor& weight,      // [3]
-     double eps) {
    AssertTensorShapeEqual(input, input_grad, "input", "input_grad");
    AssertTensorShapeEqual(input, output_grad, "input", "output_grad");
    AssertTensorNotNull(weight, "weight");
    // TODO shape check
    // weight_grad, bias_grad and input_grad can be nullable

-   constexpr int BLOCK_SIZE = 256;
-
    int d = input.size(-1);
-   int64_t num_tokens = input.numel() / input.size(-1);
    dim3 grid(num_tokens);
-   dim3 block(BLOCK_SIZE);

    torch::Tensor temp_weight_grad =
-       torch::empty({num_tokens, 3},
-           input.options().dtype(torch::kFloat));
-
-   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
-   MOTIF_DISPATCH_FLOATING_TYPES(
-       input.scalar_type(), "poly_norm_backward_kernel", [&] {
-         motif::poly_norm_backward_kernel<scalar_t, float, BLOCK_SIZE>
-             <<<grid, block, 0, stream>>>(
-                 input_grad.data_ptr<scalar_t>(),
-                 temp_weight_grad.data_ptr<float>(),
-                 output_grad.data_ptr<scalar_t>(),
-                 input.data_ptr<scalar_t>(),
-                 weight.data_ptr<scalar_t>(),
-                 eps, d);
-       }
-   );

    if (bias_grad.defined()) {
-     at::sum_out(bias_grad, output_grad);
-     bias_grad.resize_({1});
    }

    if (weight_grad.defined()) {
-     at::sum_out(weight_grad, temp_weight_grad, {0});
    }
  }
 
  #include <ATen/Functions.h>
+ #include <ATen/cuda/CUDAContext.h>
  #include <c10/cuda/CUDAGuard.h>
+ #include <torch/all.h>

  #include <cmath>

  #include "assert_utils.h"
  #include "atomic_utils.h"
  #include "block_reduce.h"
+ #include "cuda_compat.h"
+ #include "dispatch_utils.h"

  namespace motif {

+ template <typename type, int N> struct alignas(sizeof(type) * N) type_vec_t {
+   type data[N];
+ };
+
+ template <typename scalar_t, typename acc_t, int width>
+ __global__ std::enable_if_t<(width > 0)>
+ poly_norm_kernel(scalar_t *__restrict__ out,          // [..., d]
+                  const scalar_t *__restrict__ input,  // [..., d]
+                  const scalar_t *__restrict__ weight, // [3]
+                  const scalar_t *__restrict__ bias,   // [1]
+                  const float eps, const int d) {
+   using vec_t = type_vec_t<scalar_t, width>;
+
+   const int vec_d = d / width;
+   const int64_t vec_offset = blockIdx.x * vec_d;
+   const vec_t *__restrict__ input_vec = reinterpret_cast<const vec_t *>(input);
+
+   acc_t sum2 = 0.0f;
+   acc_t sum4 = 0.0f;
+   acc_t sum6 = 0.0f;
+
+   for (int64_t idx = threadIdx.x; idx < vec_d; idx += blockDim.x) {
+     vec_t x_vec = input_vec[vec_offset + idx];
+
+ #pragma unroll
+     for (int i = 0; i < width; ++i) {
+       acc_t x1 = static_cast<acc_t>(x_vec.data[i]);
+       acc_t x2 = x1 * x1;
+       acc_t x4 = x2 * x2;
+       acc_t x6 = x4 * x2;
+
+       sum2 += x2;
+       sum4 += x4;
+       sum6 += x6;
+     }
+   }
+
+   using BlockReduce = cub::BlockReduce<float, 1024>;
+   __shared__ typename BlockReduce::TempStorage reduceStore;
+
+   sum2 = BlockReduce(reduceStore).Sum(sum2, blockDim.x);
+   __syncthreads();
+   sum4 = BlockReduce(reduceStore).Sum(sum4, blockDim.x);
+   __syncthreads();
+   sum6 = BlockReduce(reduceStore).Sum(sum6, blockDim.x);
+
+   __shared__ acc_t s_bias;
+
+   __shared__ acc_t s_w2_inv_std1;
+   __shared__ acc_t s_w1_inv_std2;
+   __shared__ acc_t s_w0_inv_std3;
+
+   if (threadIdx.x == 0) {
+     acc_t w0 = weight[0];
+     acc_t w1 = weight[1];
+     acc_t w2 = weight[2];
+     s_bias = bias[0];
+
+     s_w2_inv_std1 = rsqrtf(sum2 / d + eps) * w2;
+     s_w1_inv_std2 = rsqrtf(sum4 / d + eps) * w1;
+     s_w0_inv_std3 = rsqrtf(sum6 / d + eps) * w0;
+   }
+   __syncthreads();
+
+   acc_t w2_inv_std1 = s_w2_inv_std1;
+   acc_t w1_inv_std2 = s_w1_inv_std2;
+   acc_t w0_inv_std3 = s_w0_inv_std3;
+   acc_t bias_reg = s_bias;
+
+   vec_t *__restrict__ output_vec = reinterpret_cast<vec_t *>(out);
+
+   for (int64_t idx = threadIdx.x; idx < vec_d; idx += blockDim.x) {
+     vec_t x_vec = input_vec[vec_offset + idx];
+     vec_t y_vec;
+
+ #pragma unroll
+     for (int i = 0; i < width; ++i) {
+       acc_t x1 = static_cast<acc_t>(x_vec.data[i]);
+       acc_t x2 = x1 * x1;
+       acc_t x3 = x2 * x1;
+
+       acc_t y =
+           x1 * w2_inv_std1 + x2 * w1_inv_std2 + x3 * w0_inv_std3 + bias_reg;
+
+       y_vec.data[i] = static_cast<scalar_t>(y);
+     }
+     output_vec[vec_offset + idx] = y_vec;
+   }
+ }
+
+ template <typename scalar_t, typename acc_t, int width>
+ __global__ std::enable_if_t<(width == 0)>
+ poly_norm_kernel(scalar_t *__restrict__ out,          // [..., d]
+                  const scalar_t *__restrict__ input,  // [..., d]
+                  const scalar_t *__restrict__ weight, // [3]
+                  const scalar_t *__restrict__ bias,   // [1]
+                  const float eps, const int d) {
    const int64_t token_idx = blockIdx.x;

+   acc_t sum2 = 0.0f;
+   acc_t sum4 = 0.0f;
+   acc_t sum6 = 0.0f;

    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+     acc_t x1 = input[token_idx * d + idx];
+     acc_t x2 = x1 * x1;
+     acc_t x4 = x2 * x2;
+     acc_t x6 = x4 * x2;
+
+     sum2 += x2;
+     sum4 += x4;
+     sum6 += x6;
    }

+   using BlockReduce = cub::BlockReduce<float, 1024>;
+   __shared__ typename BlockReduce::TempStorage reduceStore;

+   sum2 = BlockReduce(reduceStore).Sum(sum2, blockDim.x);
+   __syncthreads();
+   sum4 = BlockReduce(reduceStore).Sum(sum4, blockDim.x);
+   __syncthreads();
+   sum6 = BlockReduce(reduceStore).Sum(sum6, blockDim.x);

+   __shared__ acc_t s_bias;
+
+   __shared__ acc_t s_w2_inv_std1;
+   __shared__ acc_t s_w1_inv_std2;
+   __shared__ acc_t s_w0_inv_std3;
+
+   if (threadIdx.x == 0) {
+     acc_t w0 = weight[0];
+     acc_t w1 = weight[1];
+     acc_t w2 = weight[2];
+     s_bias = bias[0];
+
+     s_w2_inv_std1 = rsqrtf(sum2 / d + eps) * w2;
+     s_w1_inv_std2 = rsqrtf(sum4 / d + eps) * w1;
+     s_w0_inv_std3 = rsqrtf(sum6 / d + eps) * w0;
+   }
+   __syncthreads();

+   acc_t w2_inv_std1 = s_w2_inv_std1;
+   acc_t w1_inv_std2 = s_w1_inv_std2;
+   acc_t w0_inv_std3 = s_w0_inv_std3;
+   acc_t bias_reg = s_bias;

    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+     acc_t x1 = input[token_idx * d + idx];
+     acc_t x2 = x1 * x1;
+     acc_t x3 = x2 * x1;
+     out[token_idx * d + idx] =
+         x1 * w2_inv_std1 + x2 * w1_inv_std2 + x3 * w0_inv_std3 + bias_reg;
    }
  }

+ template <typename scalar_t, typename acc_t, int width>
+ __global__ std::enable_if_t<(width > 0)>
+ poly_norm_backward_kernel(scalar_t *__restrict__ input_grad,        // [..., d]
+                           acc_t *__restrict__ temp_weight_grad,     // [..., 3]
+                           acc_t *__restrict__ temp_bias_grad,       // [..., 1]
+                           const scalar_t *__restrict__ output_grad, // [..., d]
+                           const scalar_t *__restrict__ input,       // [..., d]
+                           const scalar_t *__restrict__ weight,      // [3]
+                           const float eps, const int d) {
+   using vec_t = type_vec_t<scalar_t, width>;
+
+   const int vec_d = d / width;
+   const int64_t vec_offset = blockIdx.x * vec_d;
+   const vec_t *__restrict__ input_vec = reinterpret_cast<const vec_t *>(input);
+   const vec_t *__restrict__ output_grad_vec =
+       reinterpret_cast<const vec_t *>(output_grad);
+
+   acc_t sum2 = 0.0f;
+   acc_t sum4 = 0.0f;
+   acc_t sum6 = 0.0f;
+
+   acc_t sum_dx1 = 0.0f;
+   acc_t sum_dx2 = 0.0f;
+   acc_t sum_dx3 = 0.0f;
+
+   for (int64_t idx = threadIdx.x; idx < vec_d; idx += blockDim.x) {
+     vec_t x_vec = input_vec[vec_offset + idx];
+     vec_t dy_vec = output_grad_vec[vec_offset + idx];
+
+ #pragma unroll
+     for (int i = 0; i < width; ++i) {
+       acc_t x1 = static_cast<acc_t>(x_vec.data[i]);
+       acc_t x2 = x1 * x1;
+       acc_t x3 = x2 * x1;
+       acc_t x4 = x2 * x2;
+       acc_t x6 = x3 * x3;
+
+       sum2 += x2;
+       sum4 += x4;
+       sum6 += x6;
+
+       acc_t dy = static_cast<acc_t>(dy_vec.data[i]);
+
+       sum_dx1 += dy * x1;
+       sum_dx2 += dy * x2;
+       sum_dx3 += dy * x3;
+     }
+   }
+
+   using BlockReduce = cub::BlockReduce<float, 1024>;
+   __shared__ typename BlockReduce::TempStorage reduceStore;
+
+   __syncthreads();
+   sum2 = BlockReduce(reduceStore).Sum(sum2, blockDim.x);
+   __syncthreads();
+   sum4 = BlockReduce(reduceStore).Sum(sum4, blockDim.x);
+   __syncthreads();
+   sum6 = BlockReduce(reduceStore).Sum(sum6, blockDim.x);
+
+   __syncthreads();
+   sum_dx1 = BlockReduce(reduceStore).Sum(sum_dx1, blockDim.x);
+   __syncthreads();
+   sum_dx2 = BlockReduce(reduceStore).Sum(sum_dx2, blockDim.x);
+   __syncthreads();
+   sum_dx3 = BlockReduce(reduceStore).Sum(sum_dx3, blockDim.x);
+
+   __shared__ acc_t s_mean2;
+   __shared__ acc_t s_mean4;
+   __shared__ acc_t s_mean6;
+   __shared__ acc_t s_sdx1;
+   __shared__ acc_t s_sdx2;
+   __shared__ acc_t s_sdx3;
+
+   const acc_t inv_d = acc_t(1) / d;
+
+   if (threadIdx.x == 0) {
+     s_mean2 = sum2 * inv_d + eps;
+     s_mean4 = sum4 * inv_d + eps;
+     s_mean6 = sum6 * inv_d + eps;
+
+     s_sdx1 = sum_dx1 * inv_d;
+     s_sdx2 = sum_dx2 * inv_d;
+     s_sdx3 = sum_dx3 * inv_d;
+   }
+   __syncthreads();

    acc_t w0 = weight[0];
    acc_t w1 = weight[1];
    acc_t w2 = weight[2];

+   acc_t mean2 = s_mean2;
+   acc_t mean4 = s_mean4;
+   acc_t mean6 = s_mean6;
+   acc_t sdx1 = s_sdx1;
+   acc_t sdx2 = s_sdx2;
+   acc_t sdx3 = s_sdx3;
+
+   acc_t inv_std1 = rsqrtf(mean2);
+   acc_t inv_std2 = rsqrtf(mean4);
+   acc_t inv_std3 = rsqrtf(mean6);
+
+   // inv_std / mean == powf(mean, -1.5)
+   acc_t c1 = w2 * inv_std1 / mean2;
+   acc_t c2 = acc_t(2) * w1 * inv_std2 / mean4;
+   acc_t c3 = acc_t(3) * w0 * inv_std3 / mean6;
+
+   acc_t sum_dy = 0;
+   acc_t sum_dw0 = 0;
+   acc_t sum_dw1 = 0;
+   acc_t sum_dw2 = 0;
+
+   vec_t *__restrict__ input_grad_vec = reinterpret_cast<vec_t *>(input_grad);
+
+   for (int64_t idx = threadIdx.x; idx < vec_d; idx += blockDim.x) {
+     vec_t x_vec = input_vec[vec_offset + idx];
+     vec_t dy_vec = output_grad_vec[vec_offset + idx];
+     vec_t dx_vec;
+
+ #pragma unroll
+     for (int i = 0; i < width; ++i) {
+       acc_t x1 = static_cast<acc_t>(x_vec.data[i]);
+       acc_t x2 = x1 * x1;
+       acc_t x3 = x2 * x1;
+       acc_t dy = static_cast<acc_t>(dy_vec.data[i]);
+
+       if (input_grad) {
+         acc_t dx3 = c3 * x2 * (dy * mean6 - x3 * sdx3);
+         acc_t dx2 = c2 * x1 * (dy * mean4 - x2 * sdx2);
+         acc_t dx1 = c1 * (dy * mean2 - x1 * sdx1);
+         dx_vec.data[i] = static_cast<scalar_t>(dx1 + dx2 + dx3);
+       }
+
+       sum_dy += dy;
+       sum_dw0 += dy * (x3 * inv_std3);
+       sum_dw1 += dy * (x2 * inv_std2);
+       sum_dw2 += dy * (x1 * inv_std1);
+     }
+
+     if (input_grad) {
+       input_grad_vec[vec_offset + idx] = dx_vec;
+     }
+   }
+
+   sum_dy = BlockReduce(reduceStore).Sum(sum_dy, blockDim.x);
+   __syncthreads();
+   sum_dw0 = BlockReduce(reduceStore).Sum(sum_dw0, blockDim.x);
+   __syncthreads();
+   sum_dw1 = BlockReduce(reduceStore).Sum(sum_dw1, blockDim.x);
+   __syncthreads();
+   sum_dw2 = BlockReduce(reduceStore).Sum(sum_dw2, blockDim.x);
+
+   if (threadIdx.x == 0) {
+     temp_bias_grad[blockIdx.x] = sum_dy;
+     temp_weight_grad[blockIdx.x * 3 + 0] = sum_dw0;
+     temp_weight_grad[blockIdx.x * 3 + 1] = sum_dw1;
+     temp_weight_grad[blockIdx.x * 3 + 2] = sum_dw2;
+   }
+ }
+
+ template <typename scalar_t, typename acc_t, int width>
+ __global__ std::enable_if_t<(width == 0)>
+ poly_norm_backward_kernel(scalar_t *__restrict__ input_grad,        // [..., d]
+                           acc_t *__restrict__ temp_weight_grad,     // [..., 3]
+                           acc_t *__restrict__ temp_bias_grad,       // [..., 1]
+                           const scalar_t *__restrict__ output_grad, // [..., d]
+                           const scalar_t *__restrict__ input,       // [..., d]
+                           const scalar_t *__restrict__ weight,      // [3]
+                           const float eps, const int d) {
+   const int64_t token_idx = blockIdx.x;
+
+   acc_t sum2 = 0.0f;
+   acc_t sum4 = 0.0f;
+   acc_t sum6 = 0.0f;

+   acc_t sum_dx1 = 0.0f;
+   acc_t sum_dx2 = 0.0f;
+   acc_t sum_dx3 = 0.0f;

    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
      acc_t dy = output_grad[token_idx * d + idx];

+     acc_t x1 = input[token_idx * d + idx];
+     acc_t x2 = x1 * x1;
+     acc_t x3 = x2 * x1;
+     acc_t x4 = x2 * x2;
+     acc_t x6 = x3 * x3;

+     sum2 += x2;
+     sum4 += x4;
+     sum6 += x6;

+     sum_dx1 += dy * x1;
+     sum_dx2 += dy * x2;
+     sum_dx3 += dy * x3;
    }

+   using BlockReduce = cub::BlockReduce<float, 1024>;
+   __shared__ typename BlockReduce::TempStorage reduceStore;
+
+   __syncthreads();
+   sum2 = BlockReduce(reduceStore).Sum(sum2, blockDim.x);
+   __syncthreads();
+   sum4 = BlockReduce(reduceStore).Sum(sum4, blockDim.x);
+   __syncthreads();
+   sum6 = BlockReduce(reduceStore).Sum(sum6, blockDim.x);
+
+   __syncthreads();
+   sum_dx1 = BlockReduce(reduceStore).Sum(sum_dx1, blockDim.x);
+   __syncthreads();
+   sum_dx2 = BlockReduce(reduceStore).Sum(sum_dx2, blockDim.x);
+   __syncthreads();
+   sum_dx3 = BlockReduce(reduceStore).Sum(sum_dx3, blockDim.x);
+
+   __shared__ acc_t s_mean2;
+   __shared__ acc_t s_mean4;
+   __shared__ acc_t s_mean6;
+   __shared__ acc_t s_sdx1;
+   __shared__ acc_t s_sdx2;
+   __shared__ acc_t s_sdx3;
+
+   const acc_t inv_d = acc_t(1) / d;
+
+   if (threadIdx.x == 0) {
+     s_mean2 = sum2 * inv_d + eps;
+     s_mean4 = sum4 * inv_d + eps;
+     s_mean6 = sum6 * inv_d + eps;
+
+     s_sdx1 = sum_dx1 * inv_d;
+     s_sdx2 = sum_dx2 * inv_d;
+     s_sdx3 = sum_dx3 * inv_d;
+   }
+   __syncthreads();

+   acc_t w0 = weight[0];
+   acc_t w1 = weight[1];
+   acc_t w2 = weight[2];

+   acc_t mean2 = s_mean2;
+   acc_t mean4 = s_mean4;
+   acc_t mean6 = s_mean6;
+   acc_t sdx1 = s_sdx1;
+   acc_t sdx2 = s_sdx2;
+   acc_t sdx3 = s_sdx3;

+   acc_t inv_std1 = rsqrtf(mean2);
+   acc_t inv_std2 = rsqrtf(mean4);
+   acc_t inv_std3 = rsqrtf(mean6);

+   // inv_std / mean == powf(mean, -1.5)
+   acc_t c1 = w2 * inv_std1 / mean2;
+   acc_t c2 = acc_t(2) * w1 * inv_std2 / mean4;
+   acc_t c3 = acc_t(3) * w0 * inv_std3 / mean6;

+   acc_t sum_dy = 0;
    acc_t sum_dw0 = 0;
    acc_t sum_dw1 = 0;
    acc_t sum_dw2 = 0;

    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
      acc_t dy = output_grad[token_idx * d + idx];
+     acc_t x1 = input[token_idx * d + idx];
+     acc_t x2 = x1 * x1;
+     acc_t x3 = x2 * x1;

      if (input_grad) {
+       acc_t dx3 = c3 * x2 * (dy * mean6 - x3 * sdx3);
+       acc_t dx2 = c2 * x1 * (dy * mean4 - x2 * sdx2);
+       acc_t dx1 = c1 * (dy * mean2 - x1 * sdx1);
+       input_grad[token_idx * d + idx] = dx1 + dx2 + dx3;
      }

+     sum_dy += dy;
+     sum_dw0 += dy * (x3 * inv_std3);
+     sum_dw1 += dy * (x2 * inv_std2);
+     sum_dw2 += dy * (x1 * inv_std1);
    }

+   sum_dy = BlockReduce(reduceStore).Sum(sum_dy, blockDim.x);
+   __syncthreads();
+   sum_dw0 = BlockReduce(reduceStore).Sum(sum_dw0, blockDim.x);
+   __syncthreads();
+   sum_dw1 = BlockReduce(reduceStore).Sum(sum_dw1, blockDim.x);
+   __syncthreads();
+   sum_dw2 = BlockReduce(reduceStore).Sum(sum_dw2, blockDim.x);
+
+   if (threadIdx.x == 0) {
+     temp_bias_grad[token_idx] = sum_dy;
+     temp_weight_grad[token_idx * 3 + 0] = sum_dw0;
+     temp_weight_grad[token_idx * 3 + 1] = sum_dw1;
+     temp_weight_grad[token_idx * 3 + 2] = sum_dw2;
    }
  }

+ } // namespace motif
+
+ #define LAUNCH_POLY_NORM(width)                                                \
+   MOTIF_DISPATCH_FLOATING_TYPES(input.scalar_type(), "poly_norm_kernel", [&] { \
+     motif::poly_norm_kernel<scalar_t, float, width>                            \
+         <<<grid, block, 0, stream>>>(                                          \
+             out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),              \
+             weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(), eps, d);   \
+   });
+
+ void poly_norm(torch::Tensor &out,          // [..., d]
+                const torch::Tensor &input,  // [..., d]
+                const torch::Tensor &weight, // [3]
+                const torch::Tensor &bias,   // [1]
+                double eps) {
    AssertTensorShapeEqual(input, out, "input", "out");
    AssertTensorNotNull(weight, "weight");
    AssertTensorNotNull(bias, "bias");
    // TODO shape check

    int d = input.size(-1);
+   int64_t num_tokens = input.numel() / d;
    dim3 grid(num_tokens);
+   const int max_block_size = (num_tokens < 256) ? 1024 : 256;
+   dim3 block(std::min(d, max_block_size));

    const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+   if (d % 8 == 0) {
+     LAUNCH_POLY_NORM(8);
+   } else {
+     LAUNCH_POLY_NORM(0);
+   }
  }

+ #define LAUNCH_POLY_NORM_BACKWARD(width)                                       \
+   MOTIF_DISPATCH_FLOATING_TYPES(                                               \
+       input.scalar_type(), "poly_norm_backward_kernel", [&] {                  \
+         motif::poly_norm_backward_kernel<scalar_t, float, width>               \
+             <<<grid, block, 0, stream>>>(input_grad.data_ptr<scalar_t>(),      \
+                                          temp_weight_grad.data_ptr<float>(),   \
+                                          temp_bias_grad.data_ptr<float>(),     \
+                                          output_grad.data_ptr<scalar_t>(),     \
+                                          input.data_ptr<scalar_t>(),           \
+                                          weight.data_ptr<scalar_t>(), eps, d); \
+       });
+
+ void poly_norm_backward(torch::Tensor &input_grad,        // [..., d]
+                         torch::Tensor &weight_grad,       // [3]
+                         torch::Tensor &bias_grad,         // [1]
+                         const torch::Tensor &output_grad, // [..., d]
+                         const torch::Tensor &input,       // [..., d]
+                         const torch::Tensor &weight,      // [3]
+                         double eps) {
    AssertTensorShapeEqual(input, input_grad, "input", "input_grad");
    AssertTensorShapeEqual(input, output_grad, "input", "output_grad");
    AssertTensorNotNull(weight, "weight");
    // TODO shape check
    // weight_grad, bias_grad and input_grad can be nullable

    int d = input.size(-1);
+   int64_t num_tokens = input.numel() / d;
    dim3 grid(num_tokens);
+   const int max_block_size = (num_tokens < 256) ? 1024 : 256;
+   dim3 block(std::min(d, max_block_size));

    torch::Tensor temp_weight_grad =
+       torch::empty({num_tokens, 3}, input.options().dtype(torch::kFloat));
+   torch::Tensor temp_bias_grad =
+       torch::empty({num_tokens, 1}, output_grad.options().dtype(torch::kFloat));

    const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+   if (d % 8 == 0) {
+     LAUNCH_POLY_NORM_BACKWARD(8);
+   } else {
+     LAUNCH_POLY_NORM_BACKWARD(0);
+   }

    if (bias_grad.defined()) {
+     torch::Tensor acc = torch::empty_like(bias_grad, temp_bias_grad.options());
+     at::sum_out(acc, temp_bias_grad, {0});
+     bias_grad.copy_(acc);
    }

    if (weight_grad.defined()) {
+     torch::Tensor acc =
+         torch::empty_like(weight_grad, temp_weight_grad.options());
+     at::sum_out(acc, temp_weight_grad, {0});
+     weight_grad.copy_(acc);
    }
  }
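For readers checking the rewritten backward kernel against the one it replaces, the per-element input gradient both versions compute can be written as follows (reconstructed from the kernel source; the notation is ours):

\[
\frac{\partial L}{\partial x_i}
  = \sum_{k=1}^{3} k\, w_{3-k}\,
    \frac{x_i^{\,k-1}\bigl(\mathrm{d}y_i\, m_k - x_i^{\,k}\, s_k\bigr)}{m_k^{3/2}},
\qquad
m_k = \overline{x^{2k}} + \varepsilon,
\quad
s_k = \frac{1}{d}\sum_{j=1}^{d} \mathrm{d}y_j\, x_j^{\,k}
\]

The new kernel additionally accumulates per-token weight gradients of the form sum(dy * x^k / sqrt(m_k)) and the per-token bias gradient sum(dy) into temp_weight_grad and temp_bias_grad, which the host code then reduces over tokens with at::sum_out.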
activation/poly_norm_naive.cu DELETED
@@ -1,246 +0,0 @@
- #include <ATen/cuda/CUDAContext.h>
- #include <ATen/Functions.h>
- #include <torch/all.h>
- #include <c10/cuda/CUDAGuard.h>
-
- #include <cmath>
-
- #include "cuda_compat.h"
- #include "dispatch_utils.h"
- #include "assert_utils.h"
- #include "atomic_utils.h"
- #include "block_reduce.h"
-
- namespace motif {
-
- template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
- __global__ void poly_norm_naive_kernel(
-     scalar_t* __restrict__ out,          // [..., d]
-     const scalar_t* __restrict__ input,  // [..., d]
-     const scalar_t* __restrict__ weight, // [3]
-     const scalar_t* __restrict__ bias,   // [1]
-     const float eps,
-     const int d
- ) {
-   const int64_t token_idx = blockIdx.x;
-
-   acc_t sum = 0.0f;
-   acc_t sum_square = 0.0f;
-   acc_t sum_cube = 0.0f;
-
-   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-     acc_t x = input[token_idx * d + idx];
-     sum += pow(x, 2.0f);
-     sum_square += pow(x, 4.0f);
-     sum_cube += pow(x, 6.0f);
-   }
-
-   __shared__ acc_t shared[BLOCK_SIZE];
-
-   acc_t mean = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum, d) / d;
-   acc_t mean_square = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_square, d) / d;
-   acc_t mean_cube = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_cube, d) / d;
-
-   acc_t w0 = weight[0];
-   acc_t w1 = weight[1];
-   acc_t w2 = weight[2];
-   acc_t b = bias[0];
-
-   acc_t divisor = sqrt(mean + eps);
-   acc_t divisor_square = sqrt(mean_square + eps);
-   acc_t divisor_cube = sqrt(mean_cube + eps);
-
-   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-     acc_t x = input[token_idx * d + idx];
-     acc_t x_square = pow(x, 2.0f);
-     acc_t x_cube = pow(x, 3.0f);
-     out[token_idx * d + idx] = w2 * x / divisor +
-         w1 * x_square / divisor_square +
-         w0 * x_cube / divisor_cube + b;
-   }
- }
-
- template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
- __global__ void poly_norm_naive_backward_kernel(
-     scalar_t* __restrict__ input_grad,        // [..., d]
-     acc_t* __restrict__ temp_weight_grad,     // [..., 3]
-     const scalar_t* __restrict__ output_grad, // [..., d]
-     const scalar_t* __restrict__ input,       // [..., d]
-     const scalar_t* __restrict__ weight,      // [3]
-     const float eps,
-     const int d
- ) {
-   const int64_t token_idx = blockIdx.x;
-
-   acc_t w0 = weight[0];
-   acc_t w1 = weight[1];
-   acc_t w2 = weight[2];
-
-   acc_t sum_2 = 0.0f;
-   acc_t sum_4 = 0.0f;
-   acc_t sum_6 = 0.0f;
-
-   acc_t sum_dx_1 = 0.0f;
-   acc_t sum_dx_2 = 0.0f;
-   acc_t sum_dx_3 = 0.0f;
-
-   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-     acc_t dy = output_grad[token_idx * d + idx];
-
-     acc_t x_1 = input[token_idx * d + idx];
-     acc_t x_2 = x_1 * x_1;
-     acc_t x_3 = x_2 * x_1;
-     acc_t x_4 = x_2 * x_2;
-     acc_t x_6 = x_3 * x_3;
-
-     sum_2 += x_2;
-     sum_4 += x_4;
-     sum_6 += x_6;
-
-     sum_dx_1 += dy * x_1;
-     sum_dx_2 += dy * x_2;
-     sum_dx_3 += dy * x_3;
-   }
-
-   __shared__ acc_t shared[BLOCK_SIZE];
-
-   acc_t mean_2 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_2, d) / d + eps;
-   acc_t mean_4 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_4, d) / d + eps;
-   acc_t mean_6 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_6, d) / d + eps;
-
-   sum_dx_1 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dx_1, d);
-   sum_dx_2 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dx_2, d);
-   sum_dx_3 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dx_3, d);
-
-   acc_t _mean_2 = powf(mean_2, -1.5);
-   acc_t _mean_4 = powf(mean_4, -1.5);
-   acc_t _mean_6 = powf(mean_6, -1.5);
-
-   acc_t sq_mean_2 = sqrtf(mean_2);
-   acc_t sq_mean_4 = sqrtf(mean_4);
-   acc_t sq_mean_6 = sqrtf(mean_6);
-
-   acc_t sum_dw0 = 0;
-   acc_t sum_dw1 = 0;
-   acc_t sum_dw2 = 0;
-
-   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-     acc_t dy = output_grad[token_idx * d + idx];
-     acc_t x_1 = input[token_idx * d + idx];
-     acc_t x_2 = x_1 * x_1;
-     acc_t x_3 = x_2 * x_1;
-
-     acc_t dx_3 =
-         _mean_6 * 3 * x_2 * (dy * mean_6 - x_3 * sum_dx_3 / d) * w0;
-     acc_t dx_2 =
-         _mean_4 * 2 * x_1 * (dy * mean_4 - x_2 * sum_dx_2 / d) * w1;
-     acc_t dx_1 =
-         _mean_2 * (dy * mean_2 - x_1 * sum_dx_1 / d) * w2;
-
-     if (input_grad) {
-       input_grad[token_idx * d + idx] = dx_1 + dx_2 + dx_3;
-     }
-
-     sum_dw0 += dy * (x_3 / sq_mean_6);
-     sum_dw1 += dy * (x_2 / sq_mean_4);
-     sum_dw2 += dy * (x_1 / sq_mean_2);
-   }
-
-   if (temp_weight_grad) {
-     sum_dw0 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dw0, d);
-     sum_dw1 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dw1, d);
-     sum_dw2 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dw2, d);
-
-     if (threadIdx.x == 0) {
-       temp_weight_grad[token_idx * 3 + 0] = sum_dw0;
-       temp_weight_grad[token_idx * 3 + 1] = sum_dw1;
-       temp_weight_grad[token_idx * 3 + 2] = sum_dw2;
-     }
-   }
- }
-
- } // namespace motif
-
-
- void poly_norm_naive(torch::Tensor& out,    // [..., d]
-     const torch::Tensor& input,             // [..., d]
-     const torch::Tensor& weight,            // [3]
-     const torch::Tensor& bias,              // [1]
-     double eps)
- {
-   AssertTensorShapeEqual(input, out, "input", "out");
-   AssertTensorNotNull(weight, "weight");
-   AssertTensorNotNull(bias, "bias");
-   // TODO shape check
-
-   constexpr int BLOCK_SIZE = 256;
-
-   int d = input.size(-1);
-   int64_t num_tokens = input.numel() / input.size(-1);
-   dim3 grid(num_tokens);
-   dim3 block(BLOCK_SIZE);
-
-   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
-   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-   MOTIF_DISPATCH_FLOATING_TYPES(
-       input.scalar_type(), "poly_norm_naive_kernel", [&] {
-         motif::poly_norm_naive_kernel<scalar_t, float, BLOCK_SIZE>
-             <<<grid, block, 0, stream>>>(
-                 out.data_ptr<scalar_t>(),
-                 input.data_ptr<scalar_t>(),
-                 weight.data_ptr<scalar_t>(),
-                 bias.data_ptr<scalar_t>(), eps, d);
-       }
-   );
- }
-
- void poly_norm_naive_backward(
-     torch::Tensor& input_grad,        // [..., d]
-     torch::Tensor& weight_grad,       // [..., d]
-     torch::Tensor& bias_grad,         // [..., d]
-     const torch::Tensor& output_grad, // [3]
-     const torch::Tensor& input,       // [3]
-     const torch::Tensor& weight,      // [3]
-     double eps) {
-   AssertTensorShapeEqual(input, input_grad, "input", "input_grad");
-   AssertTensorShapeEqual(input, output_grad, "input", "output_grad");
-   AssertTensorNotNull(weight, "weight");
-   // TODO shape check
-   // weight_grad, bias_grad and input_grad can be nullable
-
-   constexpr int BLOCK_SIZE = 256;
-
-   int d = input.size(-1);
-   int64_t num_tokens = input.numel() / input.size(-1);
-   dim3 grid(num_tokens);
-   dim3 block(BLOCK_SIZE);
-
-   torch::Tensor temp_weight_grad =
-       torch::empty({num_tokens, 3},
-           input.options().dtype(torch::kFloat));
-
-   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
-   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
-   MOTIF_DISPATCH_FLOATING_TYPES(
-       input.scalar_type(), "poly_norm_naive_backward_kernel", [&] {
-         motif::poly_norm_naive_backward_kernel<scalar_t, float, BLOCK_SIZE>
-             <<<grid, block, 0, stream>>>(
-                 input_grad.data_ptr<scalar_t>(),
-                 temp_weight_grad.data_ptr<float>(),
-                 output_grad.data_ptr<scalar_t>(),
-                 input.data_ptr<scalar_t>(),
-                 weight.data_ptr<scalar_t>(),
-                 eps, d);
-       }
-   );
-
-   if (bias_grad.defined()) {
-     at::sum_out(bias_grad, output_grad);
-     bias_grad.resize_({1});
-   }
-
-   if (weight_grad.defined()) {
-     at::sum_out(weight_grad, temp_weight_grad, {0});
-   }
- }
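The deleted naive kernels above read and process one scalar per loop iteration. The replacement in poly_norm.cu instead loads width elements per iteration through an aligned vector type (type_vec_t) whenever d is divisible by 8. A minimal sketch of that load pattern, with illustrative names that are not part of the commit:

// Illustrative only: aligned vector type for wide loads, mirroring
// type_vec_t in the new poly_norm.cu. With a 2-byte scalar_t and N = 8,
// each element access becomes a single 16-byte transaction.
template <typename T, int N>
struct alignas(sizeof(T) * N) vec_example_t {
  T data[N];
};

template <typename scalar_t, int N>
__global__ void copy_vectorized(scalar_t *out, const scalar_t *in, int n_vec) {
  using V = vec_example_t<scalar_t, N>;
  const V *in_v = reinterpret_cast<const V *>(in);
  V *out_v = reinterpret_cast<V *>(out);
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n_vec;
       i += gridDim.x * blockDim.x) {
    out_v[i] = in_v[i];  // one wide load and one wide store per vector
  }
}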