Upload custom kernels
Browse files

- build.toml +20 -0
- rmsnorm_kernel/rmsnorm.cpp +47 -0
- torch-ext/rmsnorm_kernel/__init__.py +0 -0
- torch-ext/torch_bindings.cpp +11 -0
- torch-ext/torch_bindings.h +5 -0
build.toml
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[general]
name = "rmsnorm_kernel"

[torch]
# FIX: this commit adds torch-ext/torch_bindings.cpp and
# torch-ext/torch_bindings.h (plural "bindings"); the original entries
# pointed at "torch_binding.*", which do not exist, so the build could
# never find the binding sources.
src = [
    "torch-ext/torch_bindings.cpp",
    "torch-ext/torch_bindings.h"
]

# Renamed from [kernel.activation] to match the kernel this repo actually
# ships; nothing else references the old section name (the Python package
# __init__.py is empty).
[kernel.rmsnorm]
src = [
    "rmsnorm_kernel/rmsnorm.cpp",
]

depends = [ "torch" ]

# NOTE(review): rmsnorm.cpp contains __device__ / thrust code; confirm the
# builder treats it as CUDA (e.g. a `backend = "cuda"` key or a `.cu`
# extension may be required) — a plain .cpp compiled by a host compiler
# will not build.

# If the kernel is only supported on specific capabilities, set the
# cuda-capabilities option:
#
# cuda-capabilities = [ "9.0", "10.0", "12.0" ]
|
rmsnorm_kernel/rmsnorm.cpp
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include <cmath>

#include <thrust/execution_policy.h>  // thrust::device (was missing; used in rmsnorm_forward)
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>

#include <torch/extension.h>
|
5 |
+
|
6 |
+
const float EPS = 1e-5f;
|
7 |
+
|
8 |
+
struct RmsnormFunctor {
|
9 |
+
const float* x;
|
10 |
+
const float* gamma;
|
11 |
+
float* out;
|
12 |
+
int hidden_dim;
|
13 |
+
|
14 |
+
RmsnormFunctor(const float* x_, const float* gamma_, float* out_, int h_)
|
15 |
+
: x(x_), gamma(gamma_), out(out_), hidden_dim(h_) {}
|
16 |
+
|
17 |
+
__device__
|
18 |
+
void operator()(int row_idx) {
|
19 |
+
const float* row_x = x + row_idx * hidden_dim;
|
20 |
+
float* row_out = out + row_idx * hidden_dim;
|
21 |
+
|
22 |
+
float sum_sq = 0.0f;
|
23 |
+
for (int i = 0; i < hidden_dim; ++i)
|
24 |
+
sum_sq += row_x[i] * row_x[i];
|
25 |
+
|
26 |
+
float rms = sqrtf(sum_sq / hidden_dim + EPS);
|
27 |
+
|
28 |
+
for (int i = 0; i < hidden_dim; ++i)
|
29 |
+
row_out[i] = (row_x[i] / rms) * gamma[i];
|
30 |
+
}
|
31 |
+
};
|
32 |
+
|
33 |
+
void rmsnorm_forward(torch::Tensor x, torch::Tensor gamma, torch::Tensor out) {
|
34 |
+
int B = x.size(0), S = x.size(1), H = x.size(2);
|
35 |
+
int rows = B * S;
|
36 |
+
|
37 |
+
const float* x_ptr = x.data_ptr<float>();
|
38 |
+
const float* gamma_ptr = gamma.data_ptr<float>();
|
39 |
+
float* out_ptr = out.data_ptr<float>();
|
40 |
+
|
41 |
+
thrust::counting_iterator<int> iter(0);
|
42 |
+
thrust::for_each(
|
43 |
+
thrust::device,
|
44 |
+
iter, iter + rows,
|
45 |
+
RmsnormFunctor(x_ptr, gamma_ptr, out_ptr, H)
|
46 |
+
);
|
47 |
+
}
|
torch-ext/rmsnorm_kernel/__init__.py
ADDED
File without changes
|
torch-ext/torch_bindings.cpp
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include <torch/library.h>

#include "registration.h"
#include "torch_bindings.h"

// Registers the custom op with the PyTorch dispatcher under this
// extension's namespace (TORCH_EXTENSION_NAME is supplied by the build).
// Schema semantics: `out` is mutated in place (Tensor!), `input` and
// `gamma` are read-only, and the op returns nothing. The implementation
// is registered for the CUDA dispatch key only.
// NOTE(review): the source paths listed under [torch] in build.toml must
// name this file exactly, or the binding is never compiled — verify.
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def("rmsnorm_forward(Tensor! out, Tensor input, Tensor gamma) -> ()");
  ops.impl("rmsnorm_forward", torch::kCUDA, &rmsnorm_forward);
}

REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
|
torch-ext/torch_bindings.h
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#pragma once

#include <torch/torch.h>

// In-place RMSNorm forward: normalizes each row of `input` over its last
// dimension (RMS with an epsilon inside the sqrt) and scales elementwise
// by `gamma`, writing the result into the caller-allocated `out`.
// CUDA implementation lives in rmsnorm_kernel/rmsnorm.cpp; registered
// with the dispatcher in torch_bindings.cpp.
void rmsnorm_forward(torch::Tensor &out, torch::Tensor const &input, torch::Tensor const &gamma);
|