Motif-Technologies
/

activation

kernel

Model card Files Files and versions

xet

Community

ca1207 committed 25 days ago

Commit

18ec195

1 Parent(s): 43629b7

add poly_norm_naive.cu for temp test

Browse files

Files changed (1) hide show

activation/poly_norm_naive.cu +246 -0

activation/poly_norm_naive.cu ADDED Viewed

	@@ -0,0 +1,246 @@

+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/Functions.h>
+#include <torch/all.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cmath>
+#include "cuda_compat.h"
+#include "dispatch_utils.h"
+#include "assert_utils.h"
+#include "atomic_utils.h"
+#include "block_reduce.h"
+namespace motif {
+// Naive PolyNorm forward pass. The block with index blockIdx.x processes the
+// row of `input` that starts at element blockIdx.x * d and writes the matching
+// row of `out`.
+//
+// For each element x of the row the kernel stores
+//   w2 * x   / sqrt(S2 / d + eps)
+// + w1 * x^2 / sqrt(S4 / d + eps)
+// + w0 * x^3 / sqrt(S6 / d + eps) + b
+// where (w0, w1, w2) = weight[0..2], b = bias[0], and S2 / S4 / S6 are the
+// values _block_reduce_sum returns for the per-thread partial sums of x^2,
+// x^4 and x^6 (helper from block_reduce.h, not shown here; presumably the
+// block-wide total -- confirm). All arithmetic is done in acc_t.
+//
+// The three reductions reuse one BLOCK_SIZE-element shared scratch array, so
+// they must stay sequential and be reached by every thread of the block.
+template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
+__global__ void poly_norm_naive_kernel(
+    scalar_t* __restrict__ out,          // [..., d]
+    const scalar_t* __restrict__ input,  // [..., d]
+    const scalar_t* __restrict__ weight, // [3]
+    const scalar_t* __restrict__ bias,   // [1]
+    const float eps,
+    const int d
+    ) {
+  // 64-bit offset of this block's row inside the flattened tensors.
+  const int64_t row_offset = static_cast<int64_t>(blockIdx.x) * d;
+  const scalar_t* row_in = input + row_offset;
+  scalar_t* row_out = out + row_offset;
+  // Pass 1: per-thread partial sums of the even powers, striding by blockDim.x.
+  acc_t partial_pow2 = 0.0f;
+  acc_t partial_pow4 = 0.0f;
+  acc_t partial_pow6 = 0.0f;
+  for (int64_t i = threadIdx.x; i < d; i += blockDim.x) {
+    const acc_t v = row_in[i];
+    partial_pow2 += pow(v, 2.0f);
+    partial_pow4 += pow(v, 4.0f);
+    partial_pow6 += pow(v, 6.0f);
+  }
+  // Scratch buffer handed to every reduction below.
+  __shared__ acc_t shared[BLOCK_SIZE];
+  const acc_t mean_pow2 =
+      _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, partial_pow2, d) / d;
+  const acc_t mean_pow4 =
+      _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, partial_pow4, d) / d;
+  const acc_t mean_pow6 =
+      _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, partial_pow6, d) / d;
+  // Coefficients: weight[0] scales the cubic term, weight[2] the linear one.
+  const acc_t coef_cubic = weight[0];
+  const acc_t coef_quad = weight[1];
+  const acc_t coef_linear = weight[2];
+  const acc_t offset = bias[0];
+  // One normalising denominator per polynomial term.
+  const acc_t norm_linear = sqrt(mean_pow2 + eps);
+  const acc_t norm_quad = sqrt(mean_pow4 + eps);
+  const acc_t norm_cubic = sqrt(mean_pow6 + eps);
+  // Pass 2: re-read the row and emit the weighted, normalised polynomial.
+  for (int64_t i = threadIdx.x; i < d; i += blockDim.x) {
+    const acc_t v = row_in[i];
+    const acc_t v_pow2 = pow(v, 2.0f);
+    const acc_t v_pow3 = pow(v, 3.0f);
+    row_out[i] = coef_linear * v / norm_linear +
+                 coef_quad * v_pow2 / norm_quad +
+                 coef_cubic * v_pow3 / norm_cubic + offset;
+  }
+}
+// Naive PolyNorm backward pass. The block with index blockIdx.x processes the
+// row that starts at element blockIdx.x * d of `input` / `output_grad`.
+//
+// With m_k = R(x^k) / d + eps for k in {2, 4, 6}, where R(.) is whatever
+// _block_reduce_sum returns for the per-thread partial sums (helper from
+// block_reduce.h, not shown here; presumably the block-wide total -- confirm),
+// and G_p = R(dy * x^p) for p in {1, 2, 3}, the kernel
+//   * writes to input_grad (when non-null) the sum of
+//       m_2^-1.5 *           (dy * m_2 - x   * G_1 / d) * w2
+//       m_4^-1.5 * 2 * x   * (dy * m_4 - x^2 * G_2 / d) * w1
+//       m_6^-1.5 * 3 * x^2 * (dy * m_6 - x^3 * G_3 / d) * w0
+//   * writes to temp_weight_grad[row * 3 + {0, 1, 2}] (when non-null) the
+//     reduced values of dy * x^3 / sqrt(m_6), dy * x^2 / sqrt(m_4) and
+//     dy * x / sqrt(m_2); only thread 0 performs the store.
+//
+// Every reduction reuses the same BLOCK_SIZE-element shared scratch array, so
+// the calls must stay sequential and be reached by all threads of the block.
+template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
+__global__ void poly_norm_naive_backward_kernel(
+    scalar_t* __restrict__ input_grad,         // [..., d]
+    acc_t* __restrict__ temp_weight_grad,      // [..., 3]
+    const scalar_t* __restrict__ output_grad,  // [..., d]
+    const scalar_t* __restrict__ input,        // [..., d]
+    const scalar_t* __restrict__ weight,       // [3]
+    const float eps,
+    const int d
+    ) {
+  const int64_t row = blockIdx.x;
+  // 64-bit offset of this block's row inside the flattened tensors.
+  const int64_t row_offset = row * d;
+  // weight[0] belongs to the cubic term, weight[2] to the linear one.
+  const acc_t coef_cubic = weight[0];
+  const acc_t coef_quad = weight[1];
+  const acc_t coef_linear = weight[2];
+  // Pass 1: per-thread partial sums of x^2, x^4, x^6 and of dy * x^p.
+  acc_t partial_pow2 = 0.0f;
+  acc_t partial_pow4 = 0.0f;
+  acc_t partial_pow6 = 0.0f;
+  acc_t partial_gx1 = 0.0f;
+  acc_t partial_gx2 = 0.0f;
+  acc_t partial_gx3 = 0.0f;
+  for (int64_t i = threadIdx.x; i < d; i += blockDim.x) {
+    const acc_t g = output_grad[row_offset + i];
+    const acc_t p1 = input[row_offset + i];
+    const acc_t p2 = p1 * p1;
+    const acc_t p3 = p2 * p1;
+    const acc_t p4 = p2 * p2;
+    const acc_t p6 = p3 * p3;
+    partial_pow2 += p2;
+    partial_pow4 += p4;
+    partial_pow6 += p6;
+    partial_gx1 += g * p1;
+    partial_gx2 += g * p2;
+    partial_gx3 += g * p3;
+  }
+  // Scratch buffer handed to every reduction below.
+  __shared__ acc_t shared[BLOCK_SIZE];
+  // eps is folded into the means right away (m_k in the header comment).
+  const acc_t m2 =
+      _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, partial_pow2, d) / d + eps;
+  const acc_t m4 =
+      _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, partial_pow4, d) / d + eps;
+  const acc_t m6 =
+      _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, partial_pow6, d) / d + eps;
+  const acc_t gx1 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, partial_gx1, d);
+  const acc_t gx2 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, partial_gx2, d);
+  const acc_t gx3 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, partial_gx3, d);
+  // m_k^-1.5 scales the input gradient; sqrt(m_k) scales the weight gradient.
+  const acc_t m2_inv15 = powf(m2, -1.5);
+  const acc_t m4_inv15 = powf(m4, -1.5);
+  const acc_t m6_inv15 = powf(m6, -1.5);
+  const acc_t m2_root = sqrtf(m2);
+  const acc_t m4_root = sqrtf(m4);
+  const acc_t m6_root = sqrtf(m6);
+  // Pass 2: per-element input gradient plus per-thread weight-gradient sums.
+  acc_t partial_dw0 = 0;
+  acc_t partial_dw1 = 0;
+  acc_t partial_dw2 = 0;
+  for (int64_t i = threadIdx.x; i < d; i += blockDim.x) {
+    const acc_t g = output_grad[row_offset + i];
+    const acc_t p1 = input[row_offset + i];
+    const acc_t p2 = p1 * p1;
+    const acc_t p3 = p2 * p1;
+    const acc_t grad_cubic =
+      m6_inv15 * 3 * p2 * (g * m6 - p3 * gx3 / d) * coef_cubic;
+    const acc_t grad_quad =
+      m4_inv15 * 2 * p1 * (g * m4 - p2 * gx2 / d) * coef_quad;
+    const acc_t grad_linear =
+      m2_inv15 * (g * m2 - p1 * gx1 / d) * coef_linear;
+    // A null input_grad means the caller did not ask for this gradient.
+    if (input_grad) {
+      input_grad[row_offset + i] = grad_linear + grad_quad + grad_cubic;
+    }
+    partial_dw0 += g * (p3 / m6_root);
+    partial_dw1 += g * (p2 / m4_root);
+    partial_dw2 += g * (p1 / m2_root);
+  }
+  // The pointer is a kernel argument, so every thread of the block takes the
+  // same branch and either all or none of them reach the reductions below.
+  if (!temp_weight_grad) {
+    return;
+  }
+  const acc_t dw0 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, partial_dw0, d);
+  const acc_t dw1 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, partial_dw1, d);
+  const acc_t dw2 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, partial_dw2, d);
+  // One thread publishes this row's three partial weight gradients.
+  if (threadIdx.x == 0) {
+    acc_t* row_dw = temp_weight_grad + row * 3;
+    row_dw[0] = dw0;
+    row_dw[1] = dw1;
+    row_dw[2] = dw2;
+  }
+}
+}  // namespace motif
+// Host entry point for the naive PolyNorm forward pass.
+//
+// Launches motif::poly_norm_naive_kernel with one 256-thread block per row of
+// `input` (a row is one slice along the last dimension) on the current CUDA
+// stream of `input`'s device. Accumulation inside the kernel uses float.
+//
+// out:    [..., d]  written by the kernel; shape-checked against `input`.
+// input:  [..., d]  read through its raw data pointer with a row stride of d
+//                   (assumes a contiguous layout -- TODO confirm with callers).
+// weight: [3]       weight[0] scales the cubic term, weight[2] the linear one.
+// bias:   [1]
+// eps:    added to each mean inside the kernel before the square root.
+//
+// An empty `input` is a no-op.
+void poly_norm_naive(torch::Tensor& out,          // [..., d]
+               const torch::Tensor& input,  // [..., d]
+               const torch::Tensor& weight, // [3]
+               const torch::Tensor& bias,   // [1]
+               double eps)
+{
+  AssertTensorShapeEqual(input, out, "input", "out");
+  AssertTensorNotNull(weight, "weight");
+  AssertTensorNotNull(bias, "bias");
+  // The kernel reads weight[0..2] and bias[0] unconditionally, so make sure
+  // those elements exist instead of reading out of bounds on the device.
+  TORCH_CHECK(weight.numel() >= 3,
+              "weight must have at least 3 elements, got ", weight.numel());
+  TORCH_CHECK(bias.numel() >= 1,
+              "bias must have at least 1 element, got ", bias.numel());
+  // Nothing to compute for an empty tensor. Returning here also avoids an
+  // integer division by zero when the last dimension is 0 and an invalid
+  // launch configuration when the grid would be empty.
+  if (input.numel() == 0) {
+    return;
+  }
+  constexpr int BLOCK_SIZE = 256;
+  const int d = static_cast<int>(input.size(-1));
+  const int64_t num_tokens = input.numel() / d;
+  dim3 grid(num_tokens);
+  dim3 block(BLOCK_SIZE);
+  // Select the input's device first so the stream fetched next belongs to it.
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  MOTIF_DISPATCH_FLOATING_TYPES(
+    input.scalar_type(), "poly_norm_naive_kernel", [&] {
+      motif::poly_norm_naive_kernel<scalar_t, float, BLOCK_SIZE>
+        <<<grid, block, 0, stream>>>(
+          out.data_ptr<scalar_t>(),
+          input.data_ptr<scalar_t>(),
+          weight.data_ptr<scalar_t>(),
+          bias.data_ptr<scalar_t>(), eps, d);
+      // Surface launch-configuration errors here instead of at a later,
+      // unrelated CUDA call.
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+    }
+  );
+}
+// Host entry point for the naive PolyNorm backward pass.
+//
+// Launches motif::poly_norm_naive_backward_kernel with one 256-thread block
+// per row of `input` on the current CUDA stream of `input`'s device. The
+// kernel produces the input gradient and one float triple of partial weight
+// gradients per row; the triples are then summed over rows into `weight_grad`.
+// `bias_grad` is the sum of all elements of `output_grad`.
+//
+// input_grad:  [..., d]  optional (may be an undefined tensor).
+// weight_grad: [3]       optional.
+// bias_grad:   [1]       optional; resized to {1} after the reduction.
+// output_grad: [..., d]  shape-checked against `input`.
+// input:       [..., d]  read through its raw data pointer with a row stride
+//                        of d (assumes a contiguous layout -- TODO confirm).
+// weight:      [3]
+// eps:         same epsilon that was used in the forward pass.
+void poly_norm_naive_backward(
+  torch::Tensor& input_grad,        // [..., d]
+  torch::Tensor& weight_grad,       // [3]
+  torch::Tensor& bias_grad,         // [1]
+  const torch::Tensor& output_grad, // [..., d]
+  const torch::Tensor& input,       // [..., d]
+  const torch::Tensor& weight,      // [3]
+  double eps) {
+  // weight_grad, bias_grad and input_grad can be nullable, so only validate
+  // input_grad when the caller actually supplied it.
+  if (input_grad.defined()) {
+    AssertTensorShapeEqual(input, input_grad, "input", "input_grad");
+  }
+  AssertTensorShapeEqual(input, output_grad, "input", "output_grad");
+  AssertTensorNotNull(weight, "weight");
+  // The kernel reads weight[0..2] unconditionally.
+  TORCH_CHECK(weight.numel() >= 3,
+              "weight must have at least 3 elements, got ", weight.numel());
+  constexpr int BLOCK_SIZE = 256;
+  const int d = static_cast<int>(input.size(-1));
+  // Guard the division so a zero-sized last dimension yields zero rows
+  // instead of an integer division by zero.
+  const int64_t num_tokens = d > 0 ? input.numel() / d : 0;
+  dim3 grid(num_tokens);
+  dim3 block(BLOCK_SIZE);
+  // Select the input's device BEFORE querying the current stream; otherwise
+  // the stream may belong to a different device on multi-GPU hosts.
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  // Per-row partial weight gradients, kept in float regardless of the input
+  // dtype. Only allocated when the weight gradient is requested; the kernel
+  // skips that work when it receives a null pointer.
+  torch::Tensor temp_weight_grad;
+  if (weight_grad.defined()) {
+    temp_weight_grad =
+      torch::empty({num_tokens, 3},
+      input.options().dtype(torch::kFloat));
+  }
+  // A zero-sized grid is an invalid launch configuration, and a launch that
+  // has no output to write is pointless, so skip the kernel in both cases.
+  if (num_tokens > 0 && (input_grad.defined() || temp_weight_grad.defined())) {
+    MOTIF_DISPATCH_FLOATING_TYPES(
+      input.scalar_type(), "poly_norm_naive_backward_kernel", [&] {
+        scalar_t* input_grad_ptr =
+          input_grad.defined() ? input_grad.data_ptr<scalar_t>() : nullptr;
+        float* temp_weight_grad_ptr =
+          temp_weight_grad.defined() ? temp_weight_grad.data_ptr<float>() : nullptr;
+        motif::poly_norm_naive_backward_kernel<scalar_t, float, BLOCK_SIZE>
+          <<<grid, block, 0, stream>>>(
+            input_grad_ptr,
+            temp_weight_grad_ptr,
+            output_grad.data_ptr<scalar_t>(),
+            input.data_ptr<scalar_t>(),
+            weight.data_ptr<scalar_t>(),
+            eps, d);
+        // Surface launch-configuration errors here instead of at a later,
+        // unrelated CUDA call.
+        C10_CUDA_KERNEL_LAUNCH_CHECK();
+      }
+    );
+  }
+  if (bias_grad.defined()) {
+    // Reducing over every element gives a 0-dim result; reshape it to [1].
+    at::sum_out(bias_grad, output_grad);
+    bias_grad.resize_({1});
+  }
+  if (weight_grad.defined()) {
+    // Collapse the per-row partials over the row dimension.
+    at::sum_out(weight_grad, temp_weight_grad, {0});
+  }
+}