Commit 704692b
Parent(s): 883cc1c
fix(poly-norm): calc param grad explicitly

Browse files
- activation/activation_kernels.cu +28 -33
- activation/atomic_utils.h +16 -16
- build/flake.lock +168 -0
- build/flake.nix +11 -0
- build/torch26-cxx11-rocm62-x86_64-linux/activation/{_activation_32c2bde_dirty.abi3.so → _activation_883cc1c_dirty.abi3.so} +2 -2
- build/torch26-cxx11-rocm62-x86_64-linux/activation/_activation_f72121c_dirty.abi3.so +0 -3
- build/torch26-cxx11-rocm62-x86_64-linux/activation/_ops.py +3 -3
- build/torch27-cxx11-rocm63-x86_64-linux/activation/_activation_32c2bde_dirty.abi3.so +0 -3
- build/torch27-cxx11-rocm63-x86_64-linux/activation/_activation_552d415_dirty.abi3.so +0 -3
- build/{torch26-cxx11-rocm62-x86_64-linux/activation/_activation_552d415_dirty.abi3.so → torch27-cxx11-rocm63-x86_64-linux/activation/_activation_883cc1c_dirty.abi3.so} +2 -2
- build/torch27-cxx11-rocm63-x86_64-linux/activation/_activation_f72121c_dirty.abi3.so +0 -3
- build/torch27-cxx11-rocm63-x86_64-linux/activation/_ops.py +3 -3
- tests/pytest.ini +3 -0
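
For context on "calc param grad explicitly": assuming the usual PolyNorm forward (my reading of the kernel below; the exact placement of eps is an assumption),

\[
\mathrm{PolyNorm}(x)_i = w_0\,\frac{x_i^3}{\sqrt{\overline{x^6}+\varepsilon}}
                       + w_1\,\frac{x_i^2}{\sqrt{\overline{x^4}+\varepsilon}}
                       + w_2\,\frac{x_i}{\sqrt{\overline{x^2}+\varepsilon}} + b,
\qquad
\overline{x^k} = \tfrac{1}{d}\sum_{j=1}^{d} x_j^k,
\]

the parameter gradients are plain sums over tokens and hidden positions:

\[
\frac{\partial L}{\partial w_0} = \sum_{\mathrm{tokens}}\ \sum_{i=1}^{d}
  \frac{\partial L}{\partial y_i}\,\frac{x_i^3}{\sqrt{\overline{x^6}+\varepsilon}},
\qquad
\frac{\partial L}{\partial b} = \sum_{\mathrm{tokens}}\ \sum_{i=1}^{d}\frac{\partial L}{\partial y_i},
\]

with the analogous expressions for w_1 (using x^2) and w_2 (using x). The kernel now writes the inner per-token sums into a float temp_weight_grad buffer of shape [num_tokens, 3] and reduces over tokens with at::sum_out on the host side, instead of atomically accumulating partial sums into the parameter gradients.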
activation/activation_kernels.cu
CHANGED
@@ -1,4 +1,5 @@
 #include <ATen/cuda/CUDAContext.h>
+#include <ATen/Functions.h>
 #include <torch/all.h>
 #include <c10/cuda/CUDAGuard.h>
 
@@ -78,8 +79,7 @@ __global__ void poly_norm_kernel(
 template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
 __global__ void poly_norm_backward_kernel(
     scalar_t* __restrict__ input_grad,         // [..., d]
-    scalar_t* __restrict__ weight_grad,        // [3]
-    scalar_t* __restrict__ bias_grad,          // [1]
+    acc_t* __restrict__ temp_weight_grad,      // [..., 3]
     const scalar_t* __restrict__ output_grad,  // [..., d]
     const scalar_t* __restrict__ input,        // [..., d]
     const scalar_t* __restrict__ weight,       // [3]
@@ -128,14 +128,17 @@ __global__ void poly_norm_backward_kernel(
   sum_dx_2 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dx_2, d);
   sum_dx_3 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dx_3, d);
 
-  acc_t
-  acc_t
-  acc_t
+  acc_t _mean_2 = powf(mean_2, -1.5);
+  acc_t _mean_4 = powf(mean_4, -1.5);
+  acc_t _mean_6 = powf(mean_6, -1.5);
+
+  acc_t sq_mean_2 = sqrtf(mean_2);
+  acc_t sq_mean_4 = sqrtf(mean_4);
+  acc_t sq_mean_6 = sqrtf(mean_6);
 
   acc_t sum_dw0 = 0;
   acc_t sum_dw1 = 0;
   acc_t sum_dw2 = 0;
-  acc_t sum_db = 0;
 
   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
     acc_t dy = output_grad[token_idx * d + idx];
@@ -144,38 +147,30 @@
     acc_t x_3 = x_2 * x_1;
 
     acc_t dx_3 =
-
+        _mean_6 * 3 * x_2 * (dy * mean_6 - x_3 * sum_dx_3 / d) * w0;
     acc_t dx_2 =
-
+        _mean_4 * 2 * x_1 * (dy * mean_4 - x_2 * sum_dx_2 / d) * w1;
     acc_t dx_1 =
-
+        _mean_2 * (dy * mean_2 - x_1 * sum_dx_1 / d) * w2;
 
     if (input_grad) {
       input_grad[token_idx * d + idx] = dx_1 + dx_2 + dx_3;
     }
 
-    sum_dw0 += dy * (x_3 /
-    sum_dw1 += dy * (x_2 /
-    sum_dw2 += dy * (x_1 /
-    sum_db += dy;
+    sum_dw0 += dy * (x_3 / sq_mean_6);
+    sum_dw1 += dy * (x_2 / sq_mean_4);
+    sum_dw2 += dy * (x_1 / sq_mean_2);
   }
 
-  if (weight_grad) {
+  if (temp_weight_grad) {
     sum_dw0 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dw0, d);
     sum_dw1 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dw1, d);
     sum_dw2 = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_dw2, d);
 
     if (threadIdx.x == 0) {
-      atomic_add(&weight_grad[0], sum_dw0);
-      atomic_add(&weight_grad[1], sum_dw1);
-      atomic_add(&weight_grad[2], sum_dw2);
-    }
-  }
-
-  if (bias_grad) {
-    sum_db = _block_reduce_sum<acc_t, BLOCK_SIZE>(shared, sum_db, d);
-    if (threadIdx.x == 0) {
-      atomic_add(&bias_grad[0], sum_db);
+      temp_weight_grad[token_idx * 3 + 0] = sum_dw0;
+      temp_weight_grad[token_idx * 3 + 1] = sum_dw1;
+      temp_weight_grad[token_idx * 3 + 2] = sum_dw2;
     }
   }
 }
@@ -236,14 +231,11 @@ void poly_norm_backward(
   dim3 grid(num_tokens);
   dim3 block(BLOCK_SIZE);
 
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  torch::Tensor temp_weight_grad =
+      torch::empty({num_tokens, 3},
+                   input.options().dtype(torch::kFloat));
 
-  if (weight_grad.defined()) {
-    cudaMemsetAsync(weight_grad.data_ptr(), 0, weight_grad.numel() * weight_grad.element_size(), stream);
-  }
-  if (bias_grad.defined()) {
-    cudaMemsetAsync(bias_grad.data_ptr(), 0, bias_grad.numel() * bias_grad.element_size(), stream);
-  }
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   MOTIF_DISPATCH_FLOATING_TYPES(
@@ -251,12 +243,15 @@
       motif::poly_norm_backward_kernel<scalar_t, float, BLOCK_SIZE>
          <<<grid, block, 0, stream>>>(
              input_grad.data_ptr<scalar_t>(),
-             weight_grad.data_ptr<scalar_t>(),
-             bias_grad.data_ptr<scalar_t>(),
+             temp_weight_grad.data_ptr<float>(),
             output_grad.data_ptr<scalar_t>(),
             input.data_ptr<scalar_t>(),
             weight.data_ptr<scalar_t>(),
            eps, d);
    }
  );
+
+  at::sum_out(bias_grad, output_grad);
+  at::sum_out(weight_grad, temp_weight_grad, {0});
+  bias_grad.resize_({1});
 }
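A minimal reference for the same computation in Python, useful for checking the kernel (a sketch under the assumptions above; poly_norm_param_grads is a hypothetical helper, not part of this repo):

import torch

def poly_norm_param_grads(output_grad, x, eps=1e-6):
    # output_grad, x: [num_tokens, d]; returns (weight_grad [3], bias_grad [1]).
    def norm(u):
        # Per-token RMS normalization; the eps placement is assumed.
        return u / torch.sqrt(u.pow(2).mean(dim=-1, keepdim=True) + eps)

    # Per-token partial sums: the analogue of the kernel's float
    # temp_weight_grad buffer of shape [num_tokens, 3].
    temp = torch.stack([(output_grad * norm(x**3)).sum(-1),
                        (output_grad * norm(x**2)).sum(-1),
                        (output_grad * norm(x)).sum(-1)], dim=-1).float()

    weight_grad = temp.sum(dim=0)             # mirrors at::sum_out(weight_grad, temp_weight_grad, {0})
    bias_grad = output_grad.sum().reshape(1)  # mirrors at::sum_out(bias_grad, output_grad) + resize_({1})
    return weight_grad, bias_grad

Up to accumulation precision, the kernel's weight_grad and bias_grad outputs should agree with this if the assumed forward matches.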
activation/atomic_utils.h
CHANGED
@@ -27,19 +27,19 @@ __device__ inline void atomic_add<c10::BFloat16, float>(c10::BFloat16* _address,
 
   size_t offset = (size_t)address & 0x2;
   volatile uint16_t* address_as_short =
-
+      reinterpret_cast<volatile uint16_t*>(reinterpret_cast<volatile char*>(address));
   volatile uint32_t* address_as_uint =
-
-
+      reinterpret_cast<volatile uint*>(reinterpret_cast<volatile char*>(address) - offset);
+  bool is_32bit_aligned = offset == 0;
 
-
-
+  uint32_t current = address_as_uint[0];
+  uint32_t expected;
 
   do {
     expected = current;
-
-
-
+    c10::BFloat16 current_bf16(address_as_short[0], c10::BFloat16::from_bits());
+    c10::BFloat16 next_bf16 = current_bf16 + value;
+    uint32_t next = is_32bit_aligned ? (current & 0xffff0000) | next_bf16.x
                                      : (current & 0x0000ffff) | (next_bf16.x << 16);
     current = atomicCAS(const_cast<uint32_t*>(address_as_uint), expected, next);
   } while (current != expected);
@@ -51,19 +51,19 @@ __device__ inline void atomic_add<c10::Half, float>(c10::Half* _address, float v
 
   size_t offset = (size_t)address & 0x2;
   volatile uint16_t* address_as_short =
-
+      reinterpret_cast<volatile uint16_t*>(reinterpret_cast<volatile char*>(address));
   volatile uint32_t* address_as_uint =
-
-
+      reinterpret_cast<volatile uint*>(reinterpret_cast<volatile char*>(address) - offset);
+  bool is_32bit_aligned = offset == 0;
 
-
-
+  uint32_t current = address_as_uint[0];
+  uint32_t expected;
 
   do {
     expected = current;
-
-
-
+    c10::Half current_half(address_as_short[0], c10::Half::from_bits());
+    c10::Half next_half = current_half + value;
+    uint32_t next = is_32bit_aligned ? (current & 0xffff0000) | next_half.x
                                      : (current & 0x0000ffff) | (next_half.x << 16);
     current = atomicCAS(const_cast<uint32_t*>(address_as_uint), expected, next);
   } while (current != expected);
build/flake.lock
ADDED
@@ -0,0 +1,168 @@
+{
+  "nodes": {
+    "flake-compat": {
+      "locked": {
+        "lastModified": 1747046372,
+        "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
+        "type": "github"
+      },
+      "original": {
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "type": "github"
+      }
+    },
+    "flake-compat_2": {
+      "locked": {
+        "lastModified": 1733328505,
+        "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=",
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec",
+        "type": "github"
+      },
+      "original": {
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "type": "github"
+      }
+    },
+    "flake-utils": {
+      "inputs": {
+        "systems": "systems"
+      },
+      "locked": {
+        "lastModified": 1731533236,
+        "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "flake-utils_2": {
+      "inputs": {
+        "systems": "systems_2"
+      },
+      "locked": {
+        "lastModified": 1731533236,
+        "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "hf-nix": {
+      "inputs": {
+        "flake-compat": "flake-compat_2",
+        "flake-utils": "flake-utils_2",
+        "nixpkgs": "nixpkgs"
+      },
+      "locked": {
+        "lastModified": 1747919133,
+        "narHash": "sha256-VvF1naQOvv7yulQ5/cDiaxkNxlh1Y84QMZnderv1szk=",
+        "owner": "huggingface",
+        "repo": "hf-nix",
+        "rev": "9c71e026d6c7c8588ef85a5f7c77f57d598e038c",
+        "type": "github"
+      },
+      "original": {
+        "owner": "huggingface",
+        "repo": "hf-nix",
+        "type": "github"
+      }
+    },
+    "kernel-builder": {
+      "inputs": {
+        "flake-compat": "flake-compat",
+        "flake-utils": "flake-utils",
+        "hf-nix": "hf-nix",
+        "nixpkgs": [
+          "kernel-builder",
+          "hf-nix",
+          "nixpkgs"
+        ]
+      },
+      "locked": {
+        "lastModified": 1748620233,
+        "narHash": "sha256-VULm9HgGXvo3pyfsPy3SOhoqgkuqbGSaSemvzNUbdIU=",
+        "owner": "huggingface",
+        "repo": "kernel-builder",
+        "rev": "da3340e5b3cbb6086600420f4814b033395788d1",
+        "type": "github"
+      },
+      "original": {
+        "owner": "huggingface",
+        "repo": "kernel-builder",
+        "type": "github"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1747820358,
+        "narHash": "sha256-fTqsZsUX6M3yeEvgyQvXcbGmT2CaRVyVwsi8eK29Oj4=",
+        "owner": "danieldk",
+        "repo": "nixpkgs",
+        "rev": "d3c1681180717528068082103bf323147de6ab0b",
+        "type": "github"
+      },
+      "original": {
+        "owner": "danieldk",
+        "ref": "cudatoolkit-12.9-kernel-builder",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "kernel-builder": "kernel-builder"
+      }
+    },
+    "systems": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
+    },
+    "systems_2": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
build/flake.nix
ADDED
@@ -0,0 +1,11 @@
+{
+  description = "Flake for Torch kernel extension";
+  inputs = {
+    kernel-builder.url = "github:huggingface/kernel-builder";
+  };
+  outputs = { self, kernel-builder, }:
+    kernel-builder.lib.genFlakeOutputs {
+      path = ./.;
+      rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
+    };
+}
build/torch26-cxx11-rocm62-x86_64-linux/activation/{_activation_32c2bde_dirty.abi3.so → _activation_883cc1c_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:a9d74188efdcb10158b338cf363749494f86e9712797722310f0a6ac5310efdd
+size 2401160
build/torch26-cxx11-rocm62-x86_64-linux/activation/_activation_f72121c_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b358f5be0dc4f1c1d7198ca4417c74cf9626f678b89772e178154acbaee1476a
-size 2460736
build/torch26-cxx11-rocm62-x86_64-linux/activation/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _activation_883cc1c_dirty
+ops = torch.ops._activation_883cc1c_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_activation_883cc1c_dirty::{op_name}"
build/torch27-cxx11-rocm63-x86_64-linux/activation/_activation_32c2bde_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c1f586de8406ba777d2d80bbcad8cc711032ef3971c1e963c7d31845c25b28c8
-size 2404376
build/torch27-cxx11-rocm63-x86_64-linux/activation/_activation_552d415_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:eecd2db703418250f22fbccdda97d8f45a15d3a8d34d1c6be1f0a1e3a7076990
-size 2447480
build/{torch26-cxx11-rocm62-x86_64-linux/activation/_activation_552d415_dirty.abi3.so → torch27-cxx11-rocm63-x86_64-linux/activation/_activation_883cc1c_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:719fc6521c0824b253cb11ea9e564ef7835e2102e5bc6399cfdb69203d6d5c26
+size 2395176
build/torch27-cxx11-rocm63-x86_64-linux/activation/_activation_f72121c_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d322fb12e4bd5eab4700783a6cfac4a8a9f9f21c7c61fd2ddb47253da8e182f1
-size 2447176
build/torch27-cxx11-rocm63-x86_64-linux/activation/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _activation_883cc1c_dirty
+ops = torch.ops._activation_883cc1c_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_activation_883cc1c_dirty::{op_name}"
tests/pytest.ini
ADDED
@@ -0,0 +1,3 @@
+[pytest]
+log_cli = true
+log_cli_level = INFO