danieldk (HF Staff) committed
Commit 5a84343 · 0 Parent(s)

Add Punica sgmv kernels
README.md ADDED
@@ -0,0 +1,11 @@
+ ---
+ license: apache-2.0
+ tags:
+ - kernel
+ ---
+
+ ## Punica sgmv
+
+ [Punica](https://github.com/punica-ai/punica) sgmv kernels with modifications
+ from [Lorax](https://github.com/predibase/lorax).
+
bgmv/bgmv_all.cu ADDED
@@ -0,0 +1,5 @@
+ #include "bgmv_config.h"
+ #include "bgmv_impl.cuh"
+
+ FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half)
+ FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16)
bgmv/bgmv_config.h ADDED
@@ -0,0 +1,88 @@
+ #pragma once
+
+ template <int feat_in, int feat_out, typename T>
+ void bgmv_kernel(T *__restrict__ Y, const T *__restrict__ X,
+ T **__restrict__ W,
+ const int64_t *__restrict__ indicies, int64_t y_offset,
+ int64_t full_y_size, int64_t batch_size,
+ int64_t layer_idx, float scale);
+
+ // clang-format off
+
+ #define FOR_BGMV_WIDE(f, T, narrow) \
+ f(T, narrow, 256) \
+ f(T, narrow, 512) \
+ f(T, narrow, 640) \
+ f(T, narrow, 768) \
+ f(T, narrow, 1024) \
+ f(T, narrow, 1152) \
+ f(T, narrow, 1280) \
+ f(T, narrow, 1536) \
+ f(T, narrow, 1728) \
+ f(T, narrow, 1792) \
+ f(T, narrow, 2048) \
+ f(T, narrow, 2304) \
+ f(T, narrow, 2560) \
+ f(T, narrow, 2752) \
+ f(T, narrow, 2816) \
+ f(T, narrow, 3072) \
+ f(T, narrow, 3456) \
+ f(T, narrow, 3584) \
+ f(T, narrow, 4096) \
+ f(T, narrow, 4480) \
+ f(T, narrow, 4608) \
+ f(T, narrow, 5120) \
+ f(T, narrow, 5504) \
+ f(T, narrow, 5632) \
+ f(T, narrow, 6144) \
+ f(T, narrow, 6848) \
+ f(T, narrow, 6912) \
+ f(T, narrow, 7168) \
+ f(T, narrow, 7680) \
+ f(T, narrow, 8192) \
+ f(T, narrow, 8960) \
+ f(T, narrow, 9216) \
+ f(T, narrow, 9472) \
+ f(T, narrow, 10240) \
+ f(T, narrow, 11008) \
+ f(T, narrow, 12288) \
+ f(T, narrow, 13696) \
+ f(T, narrow, 13824) \
+ f(T, narrow, 14336) \
+ f(T, narrow, 15360) \
+ f(T, narrow, 16384) \
+ f(T, narrow, 17920) \
+ f(T, narrow, 18944) \
+ f(T, narrow, 20480) \
+ f(T, narrow, 22016) \
+ f(T, narrow, 24576) \
+ f(T, narrow, 27392) \
+ f(T, narrow, 27648) \
+ f(T, narrow, 28672) \
+ f(T, narrow, 32000) \
+ f(T, narrow, 32256) \
+ f(T, narrow, 32512) \
+ f(T, narrow, 32768) \
+ f(T, narrow, 33024) \
+ f(T, narrow, 35840) \
+ f(T, narrow, 36864) \
+ f(T, narrow, 43264) \
+ f(T, narrow, 49152) \
+ f(T, narrow, 64000) \
+ f(T, narrow, 64256) \
+ f(T, narrow, 64512) \
+ f(T, narrow, 102400) \
+ f(T, narrow, 102656) \
+ f(T, narrow, 102912) \
+ f(T, narrow, 128000) \
+ f(T, narrow, 128256) \
+ f(T, narrow, 128512) \
+
+ #define FOR_BGMV_WIDE_NARROW(f, T) \
+ FOR_BGMV_WIDE(f, T, 8) \
+ FOR_BGMV_WIDE(f, T, 16) \
+ FOR_BGMV_WIDE(f, T, 32) \
+ FOR_BGMV_WIDE(f, T, 64) \
+ FOR_BGMV_WIDE(f, T, 128)
+
+ // clang-format on
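Note: `FOR_BGMV_WIDE_NARROW` is applied to `INST_BGMV_TWOSIDE` (defined in `bgmv/bgmv_impl.cuh` below) from `bgmv/bgmv_all.cu`, emitting explicit template instantiations of `bgmv_kernel` for every (narrow, wide) pair listed above. A hand-expanded sketch of a single pair, narrow = 16 and wide = 4096, with `T = nv_half`:

```cuda
// Expansion of INST_BGMV_TWOSIDE(nv_half, 16, 4096): both the "expand"
// direction (16 -> 4096) and the "shrink" direction (4096 -> 16) are
// instantiated for the same rank/hidden-size pair.
template void bgmv_kernel<16, 4096>(
    nv_half *__restrict__ Y, const nv_half *__restrict__ X,
    nv_half **__restrict__ W, const int64_t *__restrict__ indicies,
    int64_t y_offset, int64_t full_y_size, int64_t batch_size,
    int64_t layer_idx, float scale);

template void bgmv_kernel<4096, 16>(
    nv_half *__restrict__ Y, const nv_half *__restrict__ X,
    nv_half **__restrict__ W, const int64_t *__restrict__ indicies,
    int64_t y_offset, int64_t full_y_size, int64_t batch_size,
    int64_t layer_idx, float scale);
```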
bgmv/bgmv_impl.cuh ADDED
@@ -0,0 +1,296 @@
1
+ #pragma once
2
+
3
+ #include <ATen/cuda/CUDAContext.h>
4
+ #include <cooperative_groups.h>
5
+ #include <cuda/pipeline>
6
+ #include <cuda_runtime.h>
7
+ #include <iostream>
8
+ #include <stdio.h>
9
+
10
+ #include "flashinfer/vec_dtypes.cuh"
11
+
12
+ namespace cg = cooperative_groups;
13
+
14
+ // nthrs = (32, 4)
15
+ template <int feat_in, int feat_out, size_t vec_size, size_t X_copy_size,
16
+ size_t W_copy_size, int tx, int ty, int tz, typename T>
17
+ __global__ void
18
+ bgmv_shrink_kernel(T* __restrict__ Y, const T* __restrict__ X,
19
+ T** __restrict__ W,
20
+ const int64_t* __restrict__ indicies, int64_t y_offset,
21
+ int64_t full_y_size, int64_t layer_idx,
22
+ float scale) {
23
+ size_t batch_idx = blockIdx.y;
24
+ int64_t idx = indicies[batch_idx];
25
+ if (idx < 0) {
26
+ return;
27
+ }
28
+
29
+ auto block = cg::this_thread_block();
30
+ size_t j = blockIdx.x;
31
+ constexpr size_t num_pipeline_stages = 2;
32
+ constexpr size_t tile_size = tx * ty * vec_size;
33
+ __shared__ T W_shared[num_pipeline_stages * tile_size];
34
+ __shared__ T X_shared[num_pipeline_stages * tile_size];
35
+ __shared__ float y_warpwise[ty];
36
+
37
+ size_t W_shared_offset[num_pipeline_stages] = {0U, 1U * tile_size};
38
+ size_t X_shared_offset[num_pipeline_stages] = {0U, 1U * tile_size};
39
+ auto pipe = cuda::make_pipeline();
40
+
41
+ const T* W_ptr = W[idx];
42
+
43
+ // pipeline load W/X and compute WX;
44
+ pipe.producer_acquire();
45
+ cuda::memcpy_async(W_shared + (threadIdx.y * tx + threadIdx.x) * vec_size,
46
+ W_ptr + (layer_idx * feat_out + j) * feat_in +
47
+ (threadIdx.y * tx + threadIdx.x) * vec_size,
48
+ cuda::aligned_size_t<W_copy_size>(W_copy_size), pipe);
49
+ cuda::memcpy_async(X_shared + (threadIdx.y * tx + threadIdx.x) * vec_size,
50
+ X + (batch_idx * feat_in) +
51
+ (threadIdx.y * tx + threadIdx.x) * vec_size,
52
+ cuda::aligned_size_t<X_copy_size>(X_copy_size), pipe);
53
+ pipe.producer_commit();
54
+ size_t copy_idx, compute_idx;
55
+ float y = 0.f;
56
+ flashinfer::vec_t<T, vec_size> x_vec;
57
+ flashinfer::vec_t<T, vec_size> w_vec;
58
+ size_t tile_idx;
59
+
60
+ #pragma unroll
61
+ for (tile_idx = 1; tile_idx < (feat_in + tile_size - 1) / tile_size;
62
+ ++tile_idx) {
63
+ copy_idx = tile_idx % num_pipeline_stages;
64
+ // pipeline stage: async copy W fragment
65
+ pipe.producer_acquire();
66
+ if (tile_idx * tile_size + threadIdx.y * tx * vec_size < feat_in) {
67
+ cuda::memcpy_async(W_shared + W_shared_offset[copy_idx] +
68
+ (threadIdx.y * tx + threadIdx.x) * vec_size,
69
+ W_ptr + (layer_idx * feat_out + j) * feat_in +
70
+ tile_idx * tile_size +
71
+ (threadIdx.y * tx + threadIdx.x) * vec_size,
72
+ cuda::aligned_size_t<W_copy_size>(W_copy_size), pipe);
73
+ cuda::memcpy_async(X_shared + X_shared_offset[copy_idx] +
74
+ (threadIdx.y * tx + threadIdx.x) * vec_size,
75
+ X + (batch_idx * feat_in) + tile_idx * tile_size +
76
+ (threadIdx.y * tx + threadIdx.x) * vec_size,
77
+ cuda::aligned_size_t<X_copy_size>(X_copy_size), pipe);
78
+ }
79
+ pipe.producer_commit();
80
+
81
+ compute_idx = (tile_idx - 1) % num_pipeline_stages;
82
+ // pipeline stage: compute WX
83
+ pipe.consumer_wait();
84
+ block.sync();
85
+ x_vec.load(X_shared + X_shared_offset[compute_idx] +
86
+ (threadIdx.y * tx + threadIdx.x) * vec_size);
87
+ w_vec.load(W_shared + W_shared_offset[compute_idx] +
88
+ (threadIdx.y * tx + threadIdx.x) * vec_size);
89
+ float sum = 0.f;
90
+ #pragma unroll
91
+ for (size_t i = 0; i < vec_size; ++i) {
92
+ sum += float(w_vec[i]) * float(x_vec[i]) * scale;
93
+ }
94
+ #pragma unroll
95
+ for (size_t offset = tx / 2; offset > 0; offset /= 2) {
96
+ sum += __shfl_down_sync(0xffffffff, sum, offset);
97
+ }
98
+ y_warpwise[threadIdx.y] = sum;
99
+ block.sync();
100
+ #pragma unroll
101
+ for (size_t i = 0; i < ty; ++i) {
102
+ y += y_warpwise[i];
103
+ }
104
+
105
+ block.sync();
106
+ pipe.consumer_release();
107
+ }
108
+
109
+ compute_idx = (tile_idx - 1) % num_pipeline_stages;
110
+ // final pipeline stage
111
+ pipe.consumer_wait();
112
+ block.sync();
113
+ x_vec.load(X_shared + X_shared_offset[compute_idx] +
114
+ (threadIdx.y * tx + threadIdx.x) * vec_size);
115
+ w_vec.load(W_shared + W_shared_offset[compute_idx] +
116
+ (threadIdx.y * tx + threadIdx.x) * vec_size);
117
+ float sum = 0.f;
118
+ #pragma unroll
119
+ for (size_t i = 0; i < vec_size; ++i) {
120
+ sum += float(w_vec[i]) * float(x_vec[i]) * scale;
121
+ }
122
+ #pragma unroll
123
+ for (size_t offset = tx / 2; offset > 0; offset /= 2) {
124
+ sum += __shfl_down_sync(0xffffffff, sum, offset);
125
+ }
126
+ y_warpwise[threadIdx.y] =
127
+ ((tile_idx - 1) * tile_size + threadIdx.y * tx * vec_size < feat_in)
128
+ ? sum
129
+ : 0.f;
130
+ block.sync();
131
+ #pragma unroll
132
+ for (size_t i = 0; i < ty; ++i) {
133
+ y += y_warpwise[i];
134
+ }
135
+
136
+ block.sync();
137
+ pipe.consumer_release();
138
+
139
+ // write Y;
140
+ if (block.thread_rank() == 0) {
141
+ Y[batch_idx * full_y_size + y_offset + j] += static_cast<T>(y);
142
+ }
143
+ }
144
+
145
+ // nthrs = (2, 16, 4)
146
+ template <int feat_in, int feat_out, size_t vec_size, int tx, int ty, int tz,
147
+ typename T>
148
+ __global__ void
149
+ bgmv_expand_kernel(T* __restrict__ Y, const T* __restrict__ X,
150
+ T** __restrict__ W,
151
+ const int64_t* __restrict__ indicies, int64_t y_offset,
152
+ int64_t full_y_size, int64_t layer_idx,
153
+ float scale) {
154
+ size_t batch_idx = blockIdx.y;
155
+ int64_t idx = indicies[batch_idx];
156
+
157
+ if (idx < 0) {
158
+ return;
159
+ }
160
+
161
+ auto block = cg::this_thread_block();
162
+ size_t tile_idx = blockIdx.x;
163
+
164
+ const T* W_ptr = W[idx];
165
+
166
+ // load X;
167
+ flashinfer::vec_t<T, vec_size> x_vec;
168
+ x_vec.load(X + batch_idx * feat_in + threadIdx.x * vec_size);
169
+
170
+ // load W;
171
+ flashinfer::vec_t<T, vec_size> w_vec;
172
+ w_vec.load(W_ptr + (layer_idx * feat_out + tile_idx * tz * ty) * feat_in +
173
+ block.thread_rank() * vec_size);
174
+
175
+ float sum = 0.f;
176
+ #pragma unroll
177
+ for (size_t i = 0; i < vec_size; ++i) {
178
+ sum += float(w_vec[i]) * float(x_vec[i]) * scale;
179
+ }
180
+
181
+ cg::thread_block_tile g = cg::tiled_partition<tx>(block);
182
+ #pragma unroll
183
+ for (size_t offset = tx / 2; offset > 0; offset /= 2) {
184
+ sum += g.shfl_down(sum, offset);
185
+ }
186
+ sum = g.shfl(sum, 0);
187
+
188
+ if (threadIdx.x == 0) {
189
+ Y[batch_idx * full_y_size + y_offset + tile_idx * (tz * ty) +
190
+ threadIdx.z * ty + threadIdx.y] += static_cast<T>(sum);
191
+ }
192
+ }
193
+
194
+ template <int feat_in, int feat_out, typename T>
195
+ void bgmv_kernel(T* __restrict__ Y, const T* __restrict__ X,
196
+ T** __restrict__ W,
197
+ const int64_t* __restrict__ indicies, int64_t y_offset,
198
+ int64_t full_y_size, int64_t batch_size,
199
+ int64_t layer_idx, float scale) {
200
+ constexpr size_t vec_size = 8;
201
+ constexpr int tz = 4;
202
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
203
+
204
+ if constexpr (feat_in < feat_out) {
205
+ static_assert(feat_in % vec_size == 0);
206
+ constexpr int tx = feat_in / vec_size;
207
+
208
+ static_assert((32 % tx == 0 && feat_out % (32 / tx * tz) == 0) ||
209
+ (16 % tx == 0 && feat_out % (16 / tx * tz) == 0) ||
210
+ (8 % tx == 0 && feat_out % (8 / tx * tz) == 0));
211
+
212
+ if constexpr (32 % tx == 0 && feat_out % (32 / tx * tz) == 0) {
213
+ constexpr int ty = 32 / tx;
214
+ dim3 nblks(feat_out / (ty * tz), batch_size);
215
+ dim3 nthrs(tx, ty, tz);
216
+
217
+ bgmv_expand_kernel<feat_in, feat_out, vec_size, tx, ty, tz>
218
+ <<<nblks, nthrs, 0, stream>>>(Y, X, W, indicies, y_offset,
219
+ full_y_size, layer_idx,
220
+ scale);
221
+ } else if (16 % tx == 0 && feat_out % (16 / tx * tz) == 0) {
222
+ constexpr int ty = 16 / tx;
223
+ dim3 nblks(feat_out / (ty * tz), batch_size);
224
+ dim3 nthrs(tx, ty, tz);
225
+
226
+ bgmv_expand_kernel<feat_in, feat_out, vec_size, tx, ty, tz>
227
+ <<<nblks, nthrs, 0, stream>>>(Y, X, W, indicies, y_offset,
228
+ full_y_size, layer_idx,
229
+ scale);
230
+ } else {
231
+ constexpr int ty = 8 / tx;
232
+ dim3 nblks(feat_out / (ty * tz), batch_size);
233
+ dim3 nthrs(tx, ty, tz);
234
+
235
+ bgmv_expand_kernel<feat_in, feat_out, vec_size, tx, ty, tz>
236
+ <<<nblks, nthrs, 0, stream>>>(Y, X, W, indicies, y_offset,
237
+ full_y_size, layer_idx,
238
+ scale);
239
+ }
240
+ } else {
241
+ static_assert(feat_in % (vec_size * 32) == 0 ||
242
+ feat_in % (vec_size * 16) == 0 ||
243
+ feat_in % (vec_size * 8) == 0);
244
+
245
+ if constexpr (feat_in % (vec_size * 32) == 0) {
246
+ constexpr int tx = 32;
247
+ constexpr int ty = 4;
248
+
249
+ dim3 nblks(feat_out, batch_size);
250
+ dim3 nthrs(tx, ty);
251
+
252
+ bgmv_shrink_kernel<feat_in, feat_out, vec_size, vec_size * sizeof(T),
253
+ vec_size * sizeof(T), tx, ty, tz>
254
+ <<<nblks, nthrs, 0, stream>>>(Y, X, W, indicies, y_offset,
255
+ full_y_size, layer_idx,
256
+ scale);
257
+ } else if constexpr (feat_in % (vec_size / 2 * 32) == 0) {
258
+ constexpr int tx = 32;
259
+ constexpr int ty = 4;
260
+
261
+ dim3 nblks(feat_out, batch_size);
262
+ dim3 nthrs(tx, ty);
263
+
264
+ bgmv_shrink_kernel<feat_in, feat_out, vec_size / 2,
265
+ vec_size * sizeof(T) / 2,
266
+ vec_size * sizeof(T) / 2, tx, ty, tz>
267
+ <<<nblks, nthrs, 0, stream>>>(Y, X, W, indicies, y_offset,
268
+ full_y_size, layer_idx,
269
+ scale);
270
+ } else if constexpr (feat_in % (vec_size / 2 * 16) == 0) {
271
+ constexpr int tx = 16;
272
+ constexpr int ty = 4;
273
+
274
+ dim3 nblks(feat_out, batch_size);
275
+ dim3 nthrs(tx, ty);
276
+
277
+ bgmv_shrink_kernel<feat_in, feat_out, vec_size / 2,
278
+ vec_size * sizeof(T) / 2,
279
+ vec_size * sizeof(T) / 2, tx, ty, tz>
280
+ <<<nblks, nthrs, 0, stream>>>(Y, X, W, indicies, y_offset,
281
+ full_y_size, layer_idx,
282
+ scale);
283
+ }
284
+ }
285
+ }
286
+
287
+ #define INST_BGMV(feat_in, feat_out, T) \
288
+ template void bgmv_kernel<feat_in, feat_out>( \
289
+ T* __restrict__ Y, const T* __restrict__ X, \
290
+ T** __restrict__ W, const int64_t* __restrict__ indicies, \
291
+ int64_t y_offset, int64_t full_y_size, int64_t batch_size, \
292
+ int64_t layer_idx, float scale);
293
+
294
+ #define INST_BGMV_TWOSIDE(T, narrow, wide) \
295
+ INST_BGMV(narrow, wide, T) \
296
+ INST_BGMV(wide, narrow, T)
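The host-side `bgmv_kernel` wrapper above selects `bgmv_expand_kernel` when `feat_in < feat_out` and `bgmv_shrink_kernel` otherwise, launching on the current PyTorch CUDA stream. A hedged usage sketch (not part of the repository), assuming `d_y`, `d_x`, `d_w_ptrs`, and `d_indices` are device buffers prepared by the caller and a PyTorch CUDA context is active:

```cuda
// Hypothetical call site: apply the LoRA "shrink" projection
// (hidden size 4096 -> rank 16) for a batch of 8 tokens at layer 0.
// d_w_ptrs is a device array of per-adapter weight pointers; d_indices
// maps each batch element to an adapter index (negative values skip
// the batch element, matching the `idx < 0` check in the kernels).
bgmv_kernel<4096, 16, nv_half>(
    d_y, d_x, d_w_ptrs, d_indices,
    /*y_offset=*/0, /*full_y_size=*/16, /*batch_size=*/8,
    /*layer_idx=*/0, /*scale=*/1.0f);
```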
build.toml ADDED
@@ -0,0 +1,53 @@
+ [general]
+ name = "punica_sgmv"
+
+ [torch]
+ src = [
+ "torch-ext/torch_binding.cpp",
+ "torch-ext/torch_binding.h"
+ ]
+
+ [kernel.sgmv]
+ language = "cuda"
+ src = [
+ "sgmv/sgmv_cutlass.cu",
+ "sgmv/sgmv_cutlass.cuh",
+ ]
+ depends = [ "cutlass_3_8", "torch" ]
+
+ [kernel.sgmv_flashinfer]
+ language = "cuda"
+ cuda-capabilities = [ "8.0", "8.6", "8.7", "8.9", "9.0", "10.0", "10.1", "12.0" ]
+ src = [
+ "flashinfer/cp_async.cuh",
+ "flashinfer/mma.cuh",
+ "flashinfer/permuted_smem.cuh",
+ "flashinfer/vec_dtypes.cuh",
+ "sgmv_flashinfer/sgmv_all.cu",
+ "sgmv_flashinfer/sgmv_config.h",
+ "sgmv_flashinfer/sgmv_flashinfer.cuh"
+ ]
+ include = [ "." ]
+ depends = [ "torch" ]
+
+ [kernel.bgmv]
+ language = "cuda"
+ src = [
+ "bgmv/bgmv_all.cu",
+ "bgmv/bgmv_impl.cuh",
+ "bgmv/bgmv_config.h",
+ "flashinfer/vec_dtypes.cuh"
+ ]
+ include = [ "." ]
+ depends = [ "torch" ]
+
+ [kernel.punica_kernels]
+ language = "cuda"
+ src = [
+ "bgmv/bgmv_config.h",
+ "punica_kernels/punica_ops.cc",
+ "sgmv/sgmv.h",
+ "sgmv_flashinfer/sgmv_config.h"
+ ]
+ include = [ "." ]
+ depends = [ "torch" ]
flake.lock ADDED
@@ -0,0 +1,117 @@
1
+ {
2
+ "nodes": {
3
+ "flake-compat": {
4
+ "locked": {
5
+ "lastModified": 1733328505,
6
+ "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=",
7
+ "owner": "edolstra",
8
+ "repo": "flake-compat",
9
+ "rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec",
10
+ "type": "github"
11
+ },
12
+ "original": {
13
+ "owner": "edolstra",
14
+ "repo": "flake-compat",
15
+ "type": "github"
16
+ }
17
+ },
18
+ "flake-utils": {
19
+ "inputs": {
20
+ "systems": "systems"
21
+ },
22
+ "locked": {
23
+ "lastModified": 1731533236,
24
+ "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
25
+ "owner": "numtide",
26
+ "repo": "flake-utils",
27
+ "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
28
+ "type": "github"
29
+ },
30
+ "original": {
31
+ "owner": "numtide",
32
+ "repo": "flake-utils",
33
+ "type": "github"
34
+ }
35
+ },
36
+ "kernel-builder": {
37
+ "inputs": {
38
+ "flake-compat": "flake-compat",
39
+ "flake-utils": "flake-utils",
40
+ "nixpkgs": "nixpkgs",
41
+ "rocm-nix": "rocm-nix"
42
+ },
43
+ "locked": {
44
+ "lastModified": 1747143871,
45
+ "narHash": "sha256-gXYPmA7wBqcTy1+39Z/UAIZ5mCSl9W5IoAvDQhIezec=",
46
+ "owner": "huggingface",
47
+ "repo": "kernel-builder",
48
+ "rev": "a78a83cfb31373e0782921999e1917b7f91af7d3",
49
+ "type": "github"
50
+ },
51
+ "original": {
52
+ "owner": "huggingface",
53
+ "repo": "kernel-builder",
54
+ "type": "github"
55
+ }
56
+ },
57
+ "nixpkgs": {
58
+ "locked": {
59
+ "lastModified": 1746711195,
60
+ "narHash": "sha256-bSpM2ySq12PBOVN7jZdzXsc99iRoYOyolh5wz43+CjQ=",
61
+ "owner": "danieldk",
62
+ "repo": "nixpkgs",
63
+ "rev": "6b7a66b06ccb09ac95872ac6ddf952e0660672ab",
64
+ "type": "github"
65
+ },
66
+ "original": {
67
+ "owner": "danieldk",
68
+ "ref": "kernel-builder-cuda-12.9.0",
69
+ "repo": "nixpkgs",
70
+ "type": "github"
71
+ }
72
+ },
73
+ "rocm-nix": {
74
+ "inputs": {
75
+ "nixpkgs": [
76
+ "kernel-builder",
77
+ "nixpkgs"
78
+ ]
79
+ },
80
+ "locked": {
81
+ "lastModified": 1745310663,
82
+ "narHash": "sha256-1U3PzCO/jt7HUlEgLOY3RpxadKwTo6GSvb2j4m0UFw0=",
83
+ "owner": "huggingface",
84
+ "repo": "rocm-nix",
85
+ "rev": "e08373a0efa1c297b0c57af070e0a311df47481f",
86
+ "type": "github"
87
+ },
88
+ "original": {
89
+ "owner": "huggingface",
90
+ "repo": "rocm-nix",
91
+ "type": "github"
92
+ }
93
+ },
94
+ "root": {
95
+ "inputs": {
96
+ "kernel-builder": "kernel-builder"
97
+ }
98
+ },
99
+ "systems": {
100
+ "locked": {
101
+ "lastModified": 1681028828,
102
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
103
+ "owner": "nix-systems",
104
+ "repo": "default",
105
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
106
+ "type": "github"
107
+ },
108
+ "original": {
109
+ "owner": "nix-systems",
110
+ "repo": "default",
111
+ "type": "github"
112
+ }
113
+ }
114
+ },
115
+ "root": "root",
116
+ "version": 7
117
+ }
flake.nix ADDED
@@ -0,0 +1,17 @@
+ {
+ description = "Flake for Punica SGMV kernel";
+
+ inputs = {
+ kernel-builder.url = "github:huggingface/kernel-builder";
+ };
+
+ outputs =
+ {
+ self,
+ kernel-builder,
+ }:
+ kernel-builder.lib.genFlakeOutputs {
+ path = ./.;
+ rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
+ };
+ }
flashinfer/cp_async.cuh ADDED
@@ -0,0 +1,187 @@
1
+ /*
2
+ * Copyright (c) 2023 by FlashInfer team.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+ #ifndef FLASHINFER_CP_ASYNC_CUH_
17
+ #define FLASHINFER_CP_ASYNC_CUH_
18
+
19
+ #include <cuda_runtime.h>
20
+
21
+ namespace flashinfer {
22
+
23
+ namespace cp_async {
24
+
25
+ enum class SharedMemFillMode {
26
+ kFillZero, // Fill zero to shared memory when predicate is false
27
+ kNoFill // Do not fill zero to shared memory when predicate is false
28
+ };
29
+
30
+ enum class PrefetchMode {
31
+ kNoPrefetch, // Do not fetch additional data from global memory to L2
32
+ kPrefetch // Fetch additional data from global memory to L2
33
+ };
34
+
35
+ #if (__CUDACC_VER_MAJOR__ >= 11)
36
+ #if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))
37
+ #define FLASHINFER_CP_ASYNC_ENABLED
38
+ #endif
39
+ #endif
40
+
41
+ /*!
42
+ * \brief Wrapper of PTX cp.async.commit_group instruction, commit all prior uncommitted
43
+ * cp.async instructions to a group
44
+ */
45
+ __device__ __forceinline__ void commit_group() {
46
+ #ifdef FLASHINFER_CP_ASYNC_ENABLED
47
+ asm volatile("cp.async.commit_group;\n" ::);
48
+ #endif
49
+ }
50
+
51
+ /*!
52
+ * \brief Wrapper of PTX cp.async.wait_group instruction
53
+ * \tparam n Wait till most recent n groups are committed
54
+ */
55
+ template <size_t n>
56
+ __device__ __forceinline__ void wait_group() {
57
+ #ifdef FLASHINFER_CP_ASYNC_ENABLED
58
+ asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
59
+ #endif
60
+ }
61
+
62
+ /*!
63
+ * \brief Wrapper of PTX cp.async.cg.shared.global instruction, asynchronously copy data from
64
+ * global memory to shared memory
65
+ * \tparam prefetch_mode Whether to fetch additional data from global memory to L2
66
+ * \tparam T Data type
67
+ * \param smem_ptr Pointer to shared memory
68
+ * \param gmem_ptr Pointer to global memory
69
+ */
70
+ template <PrefetchMode prefetch_mode, typename T>
71
+ __device__ __forceinline__ void load_128b(T* smem_ptr, const T* gmem_ptr) {
72
+ #ifdef FLASHINFER_CP_ASYNC_ENABLED
73
+ uint32_t smem_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
74
+ if constexpr (prefetch_mode == PrefetchMode::kPrefetch) {
75
+ asm volatile("cp.async.cg.shared.global.L2::128B [%0], [%1], %2, %3;\n" ::"r"(smem_int_ptr),
76
+ "l"(gmem_ptr), "n"(16), "r"(16));
77
+ } else {
78
+ asm volatile("cp.async.cg.shared.global [%0], [%1], %2, %3;\n" ::"r"(smem_int_ptr),
79
+ "l"(gmem_ptr), "n"(16), "r"(16));
80
+ }
81
+ #else
82
+ *((uint4*)smem_ptr) = *((uint4*)gmem_ptr);
83
+ #endif
84
+ }
85
+
86
+ /*!
87
+ * \brief Wrapper of PTX cp.async.cg.shared.global instruction, asynchronously copy data from
88
+ * global memory to shared memory with predicate.
89
+ * \tparam prefetch_mode Whether to fetch additional data from global memory to L2
90
+ * \tparam fill_mode Whether to fill zero to shared memory when predicate is false
91
+ * \tparam T Data type
92
+ * \param smem_ptr Pointer to shared memory
93
+ * \param gmem_ptr Pointer to global memory
94
+ * \param predicate Predicate value
95
+ * \note fill zero is slower than not fill zero
96
+ */
97
+ template <PrefetchMode prefetch_mode, SharedMemFillMode fill_mode, typename T>
98
+ __device__ __forceinline__ void pred_load_128b(T* smem_ptr, const T* gmem_ptr, bool predicate) {
99
+ #ifdef FLASHINFER_CP_ASYNC_ENABLED
100
+ uint32_t smem_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
101
+ if constexpr (fill_mode == SharedMemFillMode::kFillZero) {
102
+ int src_in_bytes = predicate ? 16 : 0;
103
+ if constexpr (prefetch_mode == PrefetchMode::kPrefetch) {
104
+ asm volatile("cp.async.cg.shared.global.L2::128B [%0], [%1], %2, %3;\n" ::"r"(smem_int_ptr),
105
+ "l"(gmem_ptr), "n"(16), "r"(src_in_bytes));
106
+ } else {
107
+ asm volatile("cp.async.cg.shared.global [%0], [%1], %2, %3;\n" ::"r"(smem_int_ptr),
108
+ "l"(gmem_ptr), "n"(16), "r"(src_in_bytes));
109
+ }
110
+ } else {
111
+ if constexpr (prefetch_mode == PrefetchMode::kPrefetch) {
112
+ asm volatile(
113
+ "{\n"
114
+ " .reg .pred p;\n"
115
+ " setp.ne.b32 p, %0, 0;\n"
116
+ " @p cp.async.cg.shared.global.L2::128B [%1], [%2], %3;\n"
117
+ "}\n" ::"r"((int)predicate),
118
+ "r"(smem_int_ptr), "l"(gmem_ptr), "n"(16));
119
+ } else {
120
+ asm volatile(
121
+ "{\n"
122
+ " .reg .pred p;\n"
123
+ " setp.ne.b32 p, %0, 0;\n"
124
+ " @p cp.async.cg.shared.global [%1], [%2], %3;\n"
125
+ "}\n" ::"r"((int)predicate),
126
+ "r"(smem_int_ptr), "l"(gmem_ptr), "n"(16));
127
+ }
128
+ }
129
+ #else
130
+ if (predicate) {
131
+ *((uint4*)smem_ptr) = *((uint4*)gmem_ptr);
132
+ } else {
133
+ if constexpr (fill_mode == SharedMemFillMode::kFillZero) {
134
+ *((uint4*)smem_ptr) = make_uint4(0, 0, 0, 0);
135
+ }
136
+ }
137
+ #endif
138
+ }
139
+
140
+ /*!
141
+ * \brief Load specified number of bits per thread from global memory to shared memory
142
+ * \tparam num_bits Number of bits to load, must be 128 or 256
143
+ * \tparam prefetch_mode Whether to fetch additional data from global memory to L2
144
+ * \tparam T Data type
145
+ * \param smem_ptr Pointer to shared memory
146
+ * \param gmem_ptr Pointer to global memory
147
+ */
148
+ template <size_t num_bits, PrefetchMode prefetch_mode, typename T>
149
+ __device__ __forceinline__ void load(T* smem_ptr, const T* gmem_ptr) {
150
+ static_assert(num_bits == 128 || num_bits == 256, "num_bits must be 128 or 256");
151
+ if constexpr (num_bits == 128) {
152
+ load_128b<prefetch_mode>(smem_ptr, gmem_ptr);
153
+ } else {
154
+ load_128b<prefetch_mode>(smem_ptr, gmem_ptr);
155
+ load_128b<prefetch_mode>(smem_ptr + 16 / sizeof(T), gmem_ptr + 16 / sizeof(T));
156
+ }
157
+ }
158
+
159
+ /*!
160
+ * \brief Load specified number of bits per thread from global memory to shared memory with
161
+ * predicate
162
+ * \tparam num_bits Number of bits to load, must be 128 or 256
163
+ * \tparam prefetch_mode Whether to fetch additional data from global memory to L2
164
+ * \tparam fill_mode Whether to fill zero to shared memory when predicate is false
165
+ * \tparam T Data type
166
+ * \param smem_ptr Pointer to shared memory
167
+ * \param gmem_ptr Pointer to global memory
168
+ * \param predicate Predicate value
169
+ * \note fill zero is slower than not fill zero
170
+ */
171
+ template <size_t num_bits, PrefetchMode prefetch_mode, SharedMemFillMode fill_mode, typename T>
172
+ __device__ __forceinline__ void pred_load(T* smem_ptr, const T* gmem_ptr, bool predicate) {
173
+ static_assert(num_bits == 128 || num_bits == 256, "num_bits must be 128 or 256");
174
+ if constexpr (num_bits == 128) {
175
+ pred_load_128b<prefetch_mode, fill_mode>(smem_ptr, gmem_ptr, predicate);
176
+ } else {
177
+ pred_load_128b<prefetch_mode, fill_mode>(smem_ptr, gmem_ptr, predicate);
178
+ pred_load_128b<prefetch_mode, fill_mode>(smem_ptr + 16 / sizeof(T), gmem_ptr + 16 / sizeof(T),
179
+ predicate);
180
+ }
181
+ }
182
+
183
+ } // namespace cp_async
184
+
185
+ } // namespace flashinfer
186
+
187
+ #endif // FLASHINFER_CP_ASYNC_CUH_
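A minimal sketch of how these wrappers are typically combined (the surrounding kernel, buffer sizes, and the `stage_tile` name are illustrative assumptions, not from the repository): each thread asynchronously stages one 128-bit chunk into shared memory with zero-fill for out-of-range lanes, commits the group, and waits before computing.

```cuda
#include <cuda_runtime.h>
#include "flashinfer/cp_async.cuh"

using namespace flashinfer::cp_async;

__global__ void stage_tile(const float* __restrict__ gmem, int n) {
  __shared__ float smem[512];
  int i = threadIdx.x * 4;  // 4 floats = 128 bits per thread
  // Predicated async copy: lanes past n are zero-filled in shared memory.
  pred_load<128, PrefetchMode::kPrefetch, SharedMemFillMode::kFillZero>(
      smem + i, gmem + i, i < n);
  commit_group();
  wait_group<0>();  // wait until all committed groups have landed
  __syncthreads();
  // ... compute on smem ...
}
```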
flashinfer/mma.cuh ADDED
@@ -0,0 +1,410 @@
1
+ /*
2
+ * Copyright (c) 2023 by FlashInfer team.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+ #ifndef FLASHINFER_MMA_CUH_
17
+ #define FLASHINFER_MMA_CUH_
18
+
19
+ #include <cuda_bf16.h>
20
+ #include <cuda_fp16.h>
21
+ #include <cuda_runtime.h>
22
+
23
+ #include <type_traits>
24
+
25
+ namespace flashinfer {
26
+
27
+ namespace mma {
28
+
29
+ #if (__CUDACC_VER_MAJOR__ >= 11)
30
+ #if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 900))
31
+ #define FLASHINFER_STMATRIX_M8N8X4_ENABLED
32
+ #endif
33
+ #if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))
34
+ #define FLASHINFER_MMA_F16F16F32_M16N8K16_ENABLED
35
+ #define FLASHINFER_MMA_F16F16F16_M16N8K16_ENABLED
36
+ #endif
37
+ #if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 750))
38
+ #define FLASHINFER_MMA_F16F16F32_M16N8K8_ENABLED
39
+ #define FLASHINFER_MMA_F16F16F16_M16N8K8_ENABLED
40
+ #define FLASHINFER_LDMATRIX_M8N8X4_ENABLED
41
+ #endif
42
+ #endif
43
+
44
+ enum class MMAMode {
45
+ kInit = 0U,
46
+ kInplaceUpdate = 1U,
47
+ };
48
+
49
+ /*!
50
+ * \brief Wrapper of PTX ldmatrix m8n8.x4 instruction, loads data from shared memory
51
+ * to fragment
52
+ * \tparam T data type of the fragment
53
+ * \param R pointer to the fragment
54
+ * \param smem_ptr pointer to the shared memory
55
+ */
56
+ template <typename T>
57
+ __device__ __forceinline__ void ldmatrix_m8n8x4(uint32_t* R, T* smem_ptr) {
58
+ #ifdef FLASHINFER_LDMATRIX_M8N8X4_ENABLED
59
+ uint32_t smem_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
60
+ asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0, %1, %2, %3}, [%4];\n"
61
+ : "=r"(R[0]), "=r"(R[1]), "=r"(R[2]), "=r"(R[3])
62
+ : "r"(smem_int_ptr));
63
+ #else
64
+ #error "Unsupported CUDA architecture for ldmatrix instruction"
65
+ #endif
66
+ }
67
+
68
+ /*!
69
+ * \brief Wrapper of PTX ldmatrix m8n8.x4 transposed instruction, loads data from
70
+ * shared memory to fragment and transposes the fragment
71
+ * \tparam T data type of the fragment
72
+ * \param R pointer to the fragment
73
+ * \param smem_ptr pointer to the shared memory
74
+ */
75
+ template <typename T>
76
+ __device__ __forceinline__ void ldmatrix_m8n8x4_trans(uint32_t* R, T* smem_ptr) {
77
+ #ifdef FLASHINFER_LDMATRIX_M8N8X4_ENABLED
78
+ uint32_t smem_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
79
+ asm volatile("ldmatrix.sync.aligned.trans.m8n8.x4.shared.b16 {%0, %1, %2, %3}, [%4];\n"
80
+ : "=r"(R[0]), "=r"(R[1]), "=r"(R[2]), "=r"(R[3])
81
+ : "r"(smem_int_ptr));
82
+ #else
83
+ #error "Unsupported CUDA architecture for ldmatrix instruction"
84
+ #endif
85
+ }
86
+
87
+ /*!
88
+ * \brief Wrapper of PTX stmatrix m8n8.x4 instruction, stores data from fragment
89
+ * to shared memory
90
+ * \tparam T data type of the fragment
91
+ * \param R pointer to the fragment
92
+ * \param smem_ptr pointer to the shared memory
93
+ */
94
+ template <typename T>
95
+ __device__ __forceinline__ void stmatrix_m8n8x4(uint32_t* R, T* smem_ptr) {
96
+ #ifdef FLASHINFER_STMATRIX_M8N8X4_ENABLED
97
+ uint32_t smem_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
98
+ asm volatile("stmatrix.sync.aligned.m8n8.x4.shared.b16 [%0], {%1, %2, %3, %4};\n"
99
+ :
100
+ : "r"(smem_int_ptr), "r"(R[0]), "r"(R[1]), "r"(R[2]), "r"(R[3]));
101
+ #else
102
+ // Fallback implementation, slower than PTX instruction
103
+ const uint32_t tx = threadIdx.x;
104
+ uint4 word;
105
+ #pragma unroll
106
+ for (uint32_t reg_id = 0; reg_id < 4; ++reg_id) {
107
+ word.x = __shfl_sync(0xffffffff, R[reg_id], (tx % 8) * 4);
108
+ word.y = __shfl_sync(0xffffffff, R[reg_id], (tx % 8) * 4 + 1);
109
+ word.z = __shfl_sync(0xffffffff, R[reg_id], (tx % 8) * 4 + 2);
110
+ word.w = __shfl_sync(0xffffffff, R[reg_id], (tx % 8) * 4 + 3);
111
+ if (tx / 8 == reg_id) {
112
+ *(uint4*)smem_ptr = word;
113
+ }
114
+ }
115
+ #endif
116
+ }
117
+
118
+ /*!
119
+ * \brief Wrapper of two mma m16n8k16 instructions for row major and column major f16 matrix
120
+ * multiplication, accumulated in f32.
121
+ * \tparam T data type of the fragment
122
+ * \tparam mma_mode whether we are initializing the accumulator or updating it
123
+ * \param C pointer to the accumulator
124
+ * \param A pointer to the fragment of matrix A
125
+ * \param B pointer to the fragment of matrix B
126
+ */
127
+ template <typename T, MMAMode mma_mode = MMAMode::kInplaceUpdate>
128
+ __device__ __forceinline__ void mma_sync_m16n16k16_row_col_f16f16f32(float* C, uint32_t* A,
129
+ uint32_t* B) {
130
+ #if defined(FLASHINFER_MMA_F16F16F32_M16N8K16_ENABLED)
131
+ if constexpr (mma_mode == MMAMode::kInit) {
132
+ if constexpr (std::is_same<T, half>::value) {
133
+ asm volatile(
134
+ "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
135
+ "{%0, %1, %2, %3},"
136
+ "{%4, %5, %6, %7},"
137
+ "{%8, %9},"
138
+ "{%10, %11, %12, %13};\n"
139
+ : "=f"(C[0]), "=f"(C[1]), "=f"(C[2]), "=f"(C[3])
140
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "f"(0.f), "f"(0.f),
141
+ "f"(0.f), "f"(0.f));
142
+ asm volatile(
143
+ "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
144
+ "{%0, %1, %2, %3},"
145
+ "{%4, %5, %6, %7},"
146
+ "{%8, %9},"
147
+ "{%10, %11, %12, %13};\n"
148
+ : "=f"(C[4]), "=f"(C[5]), "=f"(C[6]), "=f"(C[7])
149
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[2]), "r"(B[3]), "f"(0.f), "f"(0.f),
150
+ "f"(0.f), "f"(0.f));
151
+ } else {
152
+ asm volatile(
153
+ "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
154
+ "{%0, %1, %2, %3},"
155
+ "{%4, %5, %6, %7},"
156
+ "{%8, %9},"
157
+ "{%10, %11, %12, %13};\n"
158
+ : "=f"(C[0]), "=f"(C[1]), "=f"(C[2]), "=f"(C[3])
159
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "f"(0.f), "f"(0.f),
160
+ "f"(0.f), "f"(0.f));
161
+ asm volatile(
162
+ "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
163
+ "{%0, %1, %2, %3},"
164
+ "{%4, %5, %6, %7},"
165
+ "{%8, %9},"
166
+ "{%10, %11, %12, %13};\n"
167
+ : "=f"(C[4]), "=f"(C[5]), "=f"(C[6]), "=f"(C[7])
168
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[2]), "r"(B[3]), "f"(0.f), "f"(0.f),
169
+ "f"(0.f), "f"(0.f));
170
+ }
171
+ } else {
172
+ if constexpr (std::is_same<T, half>::value) {
173
+ asm volatile(
174
+ "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
175
+ "{%0, %1, %2, %3},"
176
+ "{%4, %5, %6, %7},"
177
+ "{%8, %9},"
178
+ "{%10, %11, %12, %13};\n"
179
+ : "=f"(C[0]), "=f"(C[1]), "=f"(C[2]), "=f"(C[3])
180
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "f"(C[0]), "f"(C[1]),
181
+ "f"(C[2]), "f"(C[3]));
182
+ asm volatile(
183
+ "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
184
+ "{%0, %1, %2, %3},"
185
+ "{%4, %5, %6, %7},"
186
+ "{%8, %9},"
187
+ "{%10, %11, %12, %13};\n"
188
+ : "=f"(C[4]), "=f"(C[5]), "=f"(C[6]), "=f"(C[7])
189
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[2]), "r"(B[3]), "f"(C[4]), "f"(C[5]),
190
+ "f"(C[6]), "f"(C[7]));
191
+ } else {
192
+ asm volatile(
193
+ "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
194
+ "{%0, %1, %2, %3},"
195
+ "{%4, %5, %6, %7},"
196
+ "{%8, %9},"
197
+ "{%10, %11, %12, %13};\n"
198
+ : "=f"(C[0]), "=f"(C[1]), "=f"(C[2]), "=f"(C[3])
199
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "f"(C[0]), "f"(C[1]),
200
+ "f"(C[2]), "f"(C[3]));
201
+ asm volatile(
202
+ "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
203
+ "{%0, %1, %2, %3},"
204
+ "{%4, %5, %6, %7},"
205
+ "{%8, %9},"
206
+ "{%10, %11, %12, %13};\n"
207
+ : "=f"(C[4]), "=f"(C[5]), "=f"(C[6]), "=f"(C[7])
208
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[2]), "r"(B[3]), "f"(C[4]), "f"(C[5]),
209
+ "f"(C[6]), "f"(C[7]));
210
+ }
211
+ }
212
+ #elif defined(FLASHINFER_MMA_F16F16F32_M16N8K8_ENABLED)
213
+ static_assert(std::is_same<T, half>::value, "bf16 mma instruction is not supported on sm_75");
214
+ if constexpr (mma_mode == MMAMode::kInit) {
215
+ asm volatile(
216
+ "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
217
+ "{%0, %1, %2, %3},"
218
+ "{%4, %5},"
219
+ "{%6},"
220
+ "{%7, %8, %9, %10};\n"
221
+ : "=f"(C[0]), "=f"(C[1]), "=f"(C[2]), "=f"(C[3])
222
+ : "r"(A[0]), "r"(A[1]), "r"(B[0]), "f"(0.f), "f"(0.f), "f"(0.f), "f"(0.f));
223
+ asm volatile(
224
+ "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
225
+ "{%0, %1, %2, %3},"
226
+ "{%4, %5},"
227
+ "{%6},"
228
+ "{%7, %8, %9, %10};\n"
229
+ : "=f"(C[0]), "=f"(C[1]), "=f"(C[2]), "=f"(C[3])
230
+ : "r"(A[2]), "r"(A[3]), "r"(B[1]), "f"(0.f), "f"(0.f), "f"(0.f), "f"(0.f));
231
+ asm volatile(
232
+ "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
233
+ "{%0, %1, %2, %3},"
234
+ "{%4, %5},"
235
+ "{%6},"
236
+ "{%7, %8, %9, %10};\n"
237
+ : "=f"(C[4]), "=f"(C[5]), "=f"(C[6]), "=f"(C[7])
238
+ : "r"(A[0]), "r"(A[1]), "r"(B[2]), "f"(0.f), "f"(0.f), "f"(0.f), "f"(0.f));
239
+ asm volatile(
240
+ "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
241
+ "{%0, %1, %2, %3},"
242
+ "{%4, %5},"
243
+ "{%6},"
244
+ "{%7, %8, %9, %10};\n"
245
+ : "=f"(C[4]), "=f"(C[5]), "=f"(C[6]), "=f"(C[7])
246
+ : "r"(A[2]), "r"(A[3]), "r"(B[3]), "f"(0.f), "f"(0.f), "f"(0.f), "f"(0.f));
247
+ } else {
248
+ asm volatile(
249
+ "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
250
+ "{%0, %1, %2, %3},"
251
+ "{%4, %5},"
252
+ "{%6},"
253
+ "{%7, %8, %9, %10};\n"
254
+ : "=f"(C[0]), "=f"(C[1]), "=f"(C[2]), "=f"(C[3])
255
+ : "r"(A[0]), "r"(A[1]), "r"(B[0]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
256
+ asm volatile(
257
+ "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
258
+ "{%0, %1, %2, %3},"
259
+ "{%4, %5},"
260
+ "{%6},"
261
+ "{%7, %8, %9, %10};\n"
262
+ : "=f"(C[0]), "=f"(C[1]), "=f"(C[2]), "=f"(C[3])
263
+ : "r"(A[2]), "r"(A[3]), "r"(B[1]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
264
+ asm volatile(
265
+ "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
266
+ "{%0, %1, %2, %3},"
267
+ "{%4, %5},"
268
+ "{%6},"
269
+ "{%7, %8, %9, %10};\n"
270
+ : "=f"(C[4]), "=f"(C[5]), "=f"(C[6]), "=f"(C[7])
271
+ : "r"(A[0]), "r"(A[1]), "r"(B[2]), "f"(C[4]), "f"(C[5]), "f"(C[6]), "f"(C[7]));
272
+ asm volatile(
273
+ "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
274
+ "{%0, %1, %2, %3},"
275
+ "{%4, %5},"
276
+ "{%6},"
277
+ "{%7, %8, %9, %10};\n"
278
+ : "=f"(C[4]), "=f"(C[5]), "=f"(C[6]), "=f"(C[7])
279
+ : "r"(A[2]), "r"(A[3]), "r"(B[3]), "f"(C[4]), "f"(C[5]), "f"(C[6]), "f"(C[7]));
280
+ }
281
+ #else
282
+ #error "Unsupported CUDA architecture for mma instruction"
283
+ #endif
284
+ }
285
+
286
+ /*!
287
+ * \brief Wrapper of two mma m16n8k16 instructions for row major and column major f16 matrix
288
+ * multiplication, accumulated in f16.
289
+ * \tparam mma_mode whether we are initializing the accumulator or updating it
290
+ * \param C pointer to the accumulator
291
+ * \param A pointer to the fragment of matrix A
292
+ * \param B pointer to the fragment of matrix B
293
+ */
294
+ template <MMAMode mma_mode = MMAMode::kInplaceUpdate>
295
+ __device__ __forceinline__ void mma_sync_m16n16k16_row_col_f16f16f16(uint32_t* C, uint32_t* A,
296
+ uint32_t* B) {
297
+ #if defined(FLASHINFER_MMA_F16F16F16_M16N8K16_ENABLED)
298
+ if constexpr (mma_mode == MMAMode::kInit) {
299
+ asm volatile(
300
+ "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 "
301
+ "{%0, %1},"
302
+ "{%2, %3, %4, %5},"
303
+ "{%6, %7},"
304
+ "{%8, %9};\n"
305
+ : "=r"(C[0]), "=r"(C[1])
306
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(0), "r"(0));
307
+ asm volatile(
308
+ "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 "
309
+ "{%0, %1},"
310
+ "{%2, %3, %4, %5},"
311
+ "{%6, %7},"
312
+ "{%8, %9};\n"
313
+ : "=r"(C[2]), "=r"(C[3])
314
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[2]), "r"(B[3]), "r"(0), "r"(0));
315
+ } else {
316
+ asm volatile(
317
+ "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 "
318
+ "{%0, %1},"
319
+ "{%2, %3, %4, %5},"
320
+ "{%6, %7},"
321
+ "{%8, %9};\n"
322
+ : "=r"(C[0]), "=r"(C[1])
323
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]));
324
+ asm volatile(
325
+ "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 "
326
+ "{%0, %1},"
327
+ "{%2, %3, %4, %5},"
328
+ "{%6, %7},"
329
+ "{%8, %9};\n"
330
+ : "=r"(C[2]), "=r"(C[3])
331
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[2]), "r"(B[3]), "r"(C[2]), "r"(C[3]));
332
+ }
333
+ #elif defined(FLASHINFER_MMA_F16F16F16_M16N8K8_ENABLED)
334
+ if constexpr (mma_mode == MMAMode::kInit) {
335
+ asm volatile(
336
+ "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
337
+ "{%0, %1},"
338
+ "{%2, %3},"
339
+ "{%4},"
340
+ "{%5, %6};\n"
341
+ : "=r"(C[0]), "=r"(C[1])
342
+ : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(0), "r"(0));
343
+ asm volatile(
344
+ "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
345
+ "{%0, %1},"
346
+ "{%2, %3},"
347
+ "{%4},"
348
+ "{%5, %6};\n"
349
+ : "=r"(C[0]), "=r"(C[1])
350
+ : "r"(A[2]), "r"(A[3]), "r"(B[1]), "r"(0), "r"(0));
351
+ asm volatile(
352
+ "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
353
+ "{%0, %1},"
354
+ "{%2, %3},"
355
+ "{%4},"
356
+ "{%5, %6};\n"
357
+ : "=r"(C[2]), "=r"(C[3])
358
+ : "r"(A[0]), "r"(A[1]), "r"(B[2]), "r"(0), "r"(0));
359
+ asm volatile(
360
+ "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
361
+ "{%0, %1},"
362
+ "{%2, %3},"
363
+ "{%4},"
364
+ "{%5, %6};\n"
365
+ : "=r"(C[2]), "=r"(C[3])
366
+ : "r"(A[2]), "r"(A[3]), "r"(B[3]), "r"(0), "r"(0));
367
+ } else {
368
+ asm volatile(
369
+ "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
370
+ "{%0, %1},"
371
+ "{%2, %3},"
372
+ "{%4},"
373
+ "{%5, %6};\n"
374
+ : "=r"(C[0]), "=r"(C[1])
375
+ : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(C[0]), "r"(C[1]));
376
+ asm volatile(
377
+ "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
378
+ "{%0, %1},"
379
+ "{%2, %3},"
380
+ "{%4},"
381
+ "{%5, %6};\n"
382
+ : "=r"(C[0]), "=r"(C[1])
383
+ : "r"(A[2]), "r"(A[3]), "r"(B[1]), "r"(C[0]), "r"(C[1]));
384
+ asm volatile(
385
+ "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
386
+ "{%0, %1},"
387
+ "{%2, %3},"
388
+ "{%4},"
389
+ "{%5, %6};\n"
390
+ : "=r"(C[2]), "=r"(C[3])
391
+ : "r"(A[0]), "r"(A[1]), "r"(B[2]), "r"(C[2]), "r"(C[3]));
392
+ asm volatile(
393
+ "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
394
+ "{%0, %1},"
395
+ "{%2, %3},"
396
+ "{%4},"
397
+ "{%5, %6};\n"
398
+ : "=r"(C[2]), "=r"(C[3])
399
+ : "r"(A[2]), "r"(A[3]), "r"(B[3]), "r"(C[2]), "r"(C[3]));
400
+ }
401
+ #else
402
+ #error "Unsupported CUDA architecture for mma instruction"
403
+ #endif
404
+ }
405
+
406
+ } // namespace mma
407
+
408
+ } // namespace flashinfer
409
+
410
+ #endif // FLASHINFER_MMA_CUH_
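A minimal sketch of the intended call pattern, under stated assumptions: the helper name is hypothetical, the shared-memory layout and lane-to-fragment mapping follow the PTX m16n8k16 conventions and are elided here, and whether the transposed `ldmatrix` variant is needed for B depends on how the tile was written.

```cuda
#include <cuda_fp16.h>
#include "flashinfer/mma.cuh"

using namespace flashinfer;

// a_smem / b_smem point at 16x16 half tiles already staged in shared memory;
// C holds the 8 f32 accumulator registers of the 16x16 output tile.
__device__ void mma_16x16x16_f32(half* a_smem, half* b_smem, float* C) {
  uint32_t A[4], B[4];
  mma::ldmatrix_m8n8x4(A, a_smem);
  mma::ldmatrix_m8n8x4(B, b_smem);
  mma::mma_sync_m16n16k16_row_col_f16f16f32<half, mma::MMAMode::kInit>(C, A, B);
}
```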
flashinfer/permuted_smem.cuh ADDED
@@ -0,0 +1,95 @@
1
+ /*
2
+ * Copyright (c) 2023 by FlashInfer team.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+ #ifndef FLASHINFER_PERMUTED_SMEM_CUH_
17
+ #define FLASHINFER_PERMUTED_SMEM_CUH_
18
+
19
+ #include <cuda_bf16.h>
20
+ #include <cuda_fp16.h>
21
+ #include <cuda_runtime.h>
22
+
23
+ #include <cuda/pipeline>
24
+
25
+ #include "cp_async.cuh"
26
+ #include "mma.cuh"
27
+
28
+ namespace flashinfer {
29
+
30
+ // Each cell is 4 bytes.
31
+ using cell_t = uint4;
32
+
33
+ /*!
34
+ * \brief Compute the number of elements that can be stored in a cell.
35
+ * \tparam T The data type of the elements.
36
+ */
37
+ template <typename T>
38
+ constexpr __host__ __device__ __forceinline__ uint32_t cell_capacity() {
39
+ return sizeof(cell_t) / sizeof(T);
40
+ }
41
+
42
+ /*!
43
+ * \brief The shared memory wrapper.
44
+ */
45
+ struct smem_t {
46
+ // The base pointer.
47
+ cell_t* base;
48
+ __device__ __forceinline__ smem_t() : base(nullptr) {}
49
+ template <typename T>
50
+ __device__ __forceinline__ smem_t(T* base) : base((cell_t*)base) {}
51
+
52
+ /*!
53
+ * \brief Compute the element offset given coordinates in a permuted shared memory.
54
+ * \tparam stride The stride (in terms of cells) in the permuted shared memory.
55
+ * \param i The row index.
56
+ * \param j The column index.
57
+ */
58
+ template <uint32_t stride>
59
+ static __device__ __forceinline__ uint32_t get_permuted_offset(uint32_t i, uint32_t j) {
60
+ return (i / 2) * stride * 2 + (j / 4) * 8 + (i % 2) * 4 + ((j % 4) ^ ((i / 2) % 4));
61
+ }
62
+
63
+ __device__ __forceinline__ void ldmatrix_m8n8x4(uint32_t offset, uint32_t* R) {
64
+ cell_t* smem_ptr = base + offset;
65
+ mma::ldmatrix_m8n8x4(R, smem_ptr);
66
+ }
67
+ __device__ __forceinline__ void stmatrix_m8n8x4(uint32_t offset, uint32_t* R) {
68
+ cell_t* smem_ptr = base + offset;
69
+ mma::stmatrix_m8n8x4(R, smem_ptr);
70
+ }
71
+ __device__ __forceinline__ void ldmatrix_m8n8x4_trans(uint32_t offset, uint32_t* R) {
72
+ cell_t* smem_ptr = base + offset;
73
+ mma::ldmatrix_m8n8x4_trans(R, smem_ptr);
74
+ }
75
+ template <cp_async::SharedMemFillMode fill_mode, typename T>
76
+ __device__ __forceinline__ void load_128b_async(uint32_t offset, const T* gptr, bool predicate) {
77
+ cell_t* smem_ptr = base + offset;
78
+ cp_async::pred_load_128b<cp_async::PrefetchMode::kPrefetch, fill_mode>(
79
+ smem_ptr, reinterpret_cast<const cell_t*>(gptr), predicate);
80
+ }
81
+ template <typename T>
82
+ __device__ __forceinline__ void load_128b_async(uint32_t offset, const T* gptr) {
83
+ cell_t* smem_ptr = base + offset;
84
+ cp_async::load_128b<cp_async::PrefetchMode::kPrefetch>(smem_ptr,
85
+ reinterpret_cast<const cell_t*>(gptr));
86
+ }
87
+ template <typename T>
88
+ __device__ __forceinline__ void store_128b(uint32_t offset, T* gptr) {
89
+ *reinterpret_cast<cell_t*>(gptr) = *(base + offset);
90
+ }
91
+ };
92
+
93
+ } // namespace flashinfer
94
+
95
+ #endif // FLASHINFER_PERMUTED_SMEM_CUH_
flashinfer/vec_dtypes.cuh ADDED
@@ -0,0 +1,1262 @@
1
+ /*
2
+ * Copyright (c) 2023 by FlashInfer team.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+ #ifndef VEC_DTYPES_CUH_
17
+ #define VEC_DTYPES_CUH_
18
+
19
+ #include <cuda_bf16.h>
20
+ #include <cuda_fp16.h>
21
+ #ifdef FLASHINFER_ENABLE_FP8
22
+ #include <cuda_fp8.h>
23
+ #endif
24
+ #include <cuda_runtime.h>
25
+
26
+ #include <type_traits>
27
+
28
+ namespace flashinfer {
29
+
30
+ #define FLASHINFER_INLINE inline __attribute__((always_inline)) __device__ __host__
31
+
32
+ template <typename float_t, size_t vec_size>
33
+ struct vec_t {
34
+ FLASHINFER_INLINE float_t& operator[](size_t i);
35
+ FLASHINFER_INLINE const float_t& operator[](size_t i) const;
36
+ FLASHINFER_INLINE void fill(float_t val);
37
+ FLASHINFER_INLINE void load(const float_t* ptr);
38
+ FLASHINFER_INLINE void store(float_t* ptr) const;
39
+ template <typename T>
40
+ FLASHINFER_INLINE void cast_from(const vec_t<T, vec_size>& src);
41
+ template <typename T>
42
+ FLASHINFER_INLINE void cast_load(const T* ptr);
43
+ template <typename T>
44
+ FLASHINFER_INLINE void cast_store(T* ptr) const;
45
+ FLASHINFER_INLINE static void memcpy(float_t* dst, const float_t* src);
46
+ FLASHINFER_INLINE float_t* ptr();
47
+ };
48
+
49
+ template <typename src_float_t, typename tgt_float_t, size_t vec_size>
50
+ FLASHINFER_INLINE void cast_from_impl(vec_t<tgt_float_t, vec_size>& dst,
51
+ const vec_t<src_float_t, vec_size>& src) {
52
+ #pragma unroll
53
+ for (size_t i = 0; i < vec_size; ++i) {
54
+ dst[i] = tgt_float_t(src[i]);
55
+ }
56
+ }
57
+
58
+ template <typename src_float_t, typename tgt_float_t, size_t vec_size>
59
+ FLASHINFER_INLINE void cast_load_impl(vec_t<tgt_float_t, vec_size>& dst,
60
+ const src_float_t* src_ptr) {
61
+ if constexpr (std::is_same<src_float_t, tgt_float_t>::value) {
62
+ dst.load(src_ptr);
63
+ } else {
64
+ vec_t<src_float_t, vec_size> tmp;
65
+ tmp.load(src_ptr);
66
+ dst.cast_from(tmp);
67
+ }
68
+ }
69
+
70
+ template <typename src_float_t, typename tgt_float_t, size_t vec_size>
71
+ FLASHINFER_INLINE void cast_store_impl(tgt_float_t* dst_ptr,
72
+ const vec_t<src_float_t, vec_size>& src) {
73
+ if constexpr (std::is_same<src_float_t, tgt_float_t>::value) {
74
+ src.store(dst_ptr);
75
+ } else {
76
+ vec_t<tgt_float_t, vec_size> tmp;
77
+ tmp.cast_from(src);
78
+ tmp.store(dst_ptr);
79
+ }
80
+ }
81
+
82
+ #ifdef FLASHINFER_ENABLE_FP8
83
+ /******************* vec_t<__nv_fp8_e4m3> *******************/
84
+
85
+ // __nv_fp8_e4m3 x 1
86
+ template <>
87
+ struct vec_t<__nv_fp8_e4m3, 1> {
88
+ __nv_fp8_e4m3 data;
89
+
90
+ FLASHINFER_INLINE __nv_fp8_e4m3& operator[](size_t i) { return ((__nv_fp8_e4m3*)(&data))[i]; }
91
+ FLASHINFER_INLINE const __nv_fp8_e4m3& operator[](size_t i) const {
92
+ return ((const __nv_fp8_e4m3*)(&data))[i];
93
+ }
94
+ FLASHINFER_INLINE __nv_fp8_e4m3* ptr() { return reinterpret_cast<__nv_fp8_e4m3*>(&data); }
95
+ FLASHINFER_INLINE void fill(__nv_fp8_e4m3 val);
96
+ FLASHINFER_INLINE void load(const __nv_fp8_e4m3* ptr);
97
+ FLASHINFER_INLINE void store(__nv_fp8_e4m3* ptr) const;
98
+ template <typename T>
99
+ FLASHINFER_INLINE void cast_from(const vec_t<T, 1>& src) {
100
+ cast_from_impl(*this, src);
101
+ }
102
+ template <typename T>
103
+ FLASHINFER_INLINE void cast_load(const T* ptr) {
104
+ cast_load_impl(*this, ptr);
105
+ }
106
+ template <typename T>
107
+ FLASHINFER_INLINE void cast_store(T* ptr) const {
108
+ cast_store_impl(ptr, *this);
109
+ }
110
+
111
+ FLASHINFER_INLINE static void memcpy(__nv_fp8_e4m3* dst, const __nv_fp8_e4m3* src);
112
+ };
113
+
114
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 1>::fill(__nv_fp8_e4m3 val) { data = val; }
115
+
116
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 1>::load(const __nv_fp8_e4m3* ptr) { data = *ptr; }
117
+
118
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 1>::store(__nv_fp8_e4m3* ptr) const { *ptr = data; }
119
+
120
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 1>::memcpy(__nv_fp8_e4m3* dst,
121
+ const __nv_fp8_e4m3* src) {
122
+ *dst = *src;
123
+ }
124
+
125
+ // __nv_fp8_e4m3 x 2
126
+ template <>
127
+ struct vec_t<__nv_fp8_e4m3, 2> {
128
+ __nv_fp8x2_e4m3 data;
129
+
130
+ FLASHINFER_INLINE __nv_fp8_e4m3& operator[](size_t i) { return ((__nv_fp8_e4m3*)(&data))[i]; }
131
+ FLASHINFER_INLINE const __nv_fp8_e4m3& operator[](size_t i) const {
132
+ return ((const __nv_fp8_e4m3*)(&data))[i];
133
+ }
134
+ FLASHINFER_INLINE __nv_fp8_e4m3* ptr() { return reinterpret_cast<__nv_fp8_e4m3*>(&data); }
135
+ FLASHINFER_INLINE void fill(__nv_fp8_e4m3 val);
136
+ FLASHINFER_INLINE void load(const __nv_fp8_e4m3* ptr);
137
+ FLASHINFER_INLINE void store(__nv_fp8_e4m3* ptr) const;
138
+ template <typename T>
139
+ FLASHINFER_INLINE void cast_from(const vec_t<T, 2>& src) {
140
+ cast_from_impl(*this, src);
141
+ }
142
+ template <typename T>
143
+ FLASHINFER_INLINE void cast_load(const T* ptr) {
144
+ cast_load_impl(*this, ptr);
145
+ }
146
+ template <typename T>
147
+ FLASHINFER_INLINE void cast_store(T* ptr) const {
148
+ cast_store_impl(ptr, *this);
149
+ }
150
+ FLASHINFER_INLINE static void memcpy(__nv_fp8_e4m3* dst, const __nv_fp8_e4m3* src);
151
+ };
152
+
153
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 2>::fill(__nv_fp8_e4m3 val) {
154
+ data.__x = (__nv_fp8x2_storage_t(val.__x) << 8) | __nv_fp8x2_storage_t(val.__x);
155
+ }
156
+
157
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 2>::load(const __nv_fp8_e4m3* ptr) {
158
+ data = *((__nv_fp8x2_e4m3*)ptr);
159
+ }
160
+
161
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 2>::store(__nv_fp8_e4m3* ptr) const {
162
+ *((__nv_fp8x2_e4m3*)ptr) = data;
163
+ }
164
+
165
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 2>::memcpy(__nv_fp8_e4m3* dst,
166
+ const __nv_fp8_e4m3* src) {
167
+ *((__nv_fp8x2_e4m3*)dst) = *((__nv_fp8x2_e4m3*)src);
168
+ }
169
+
170
+ // __nv_fp8_e4m3 x 4
171
+
172
+ template <>
173
+ struct vec_t<__nv_fp8_e4m3, 4> {
174
+ __nv_fp8x4_e4m3 data;
175
+
176
+ FLASHINFER_INLINE __nv_fp8_e4m3& operator[](size_t i) { return ((__nv_fp8_e4m3*)(&data))[i]; }
177
+ FLASHINFER_INLINE const __nv_fp8_e4m3& operator[](size_t i) const {
178
+ return ((const __nv_fp8_e4m3*)(&data))[i];
179
+ }
180
+ FLASHINFER_INLINE __nv_fp8_e4m3* ptr() { return reinterpret_cast<__nv_fp8_e4m3*>(&data); }
181
+ FLASHINFER_INLINE void fill(__nv_fp8_e4m3 val);
182
+ FLASHINFER_INLINE void load(const __nv_fp8_e4m3* ptr);
183
+ FLASHINFER_INLINE void store(__nv_fp8_e4m3* ptr) const;
184
+ template <typename T>
185
+ FLASHINFER_INLINE void cast_from(const vec_t<T, 4>& src) {
186
+ cast_from_impl(*this, src);
187
+ }
188
+ template <typename T>
189
+ FLASHINFER_INLINE void cast_load(const T* ptr) {
190
+ cast_load_impl(*this, ptr);
191
+ }
192
+ template <typename T>
193
+ FLASHINFER_INLINE void cast_store(T* ptr) const {
194
+ cast_store_impl(ptr, *this);
195
+ }
196
+
197
+ FLASHINFER_INLINE static void memcpy(__nv_fp8_e4m3* dst, const __nv_fp8_e4m3* src);
198
+ };
199
+
200
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 4>::fill(__nv_fp8_e4m3 val) {
201
+ data.__x = (__nv_fp8x4_storage_t(val.__x) << 24) | (__nv_fp8x4_storage_t(val.__x) << 16) |
202
+ (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
203
+ }
204
+
205
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 4>::load(const __nv_fp8_e4m3* ptr) {
206
+ data = *((__nv_fp8x4_e4m3*)ptr);
207
+ }
208
+
209
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 4>::store(__nv_fp8_e4m3* ptr) const {
210
+ *((__nv_fp8x4_e4m3*)ptr) = data;
211
+ }
212
+
213
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 4>::memcpy(__nv_fp8_e4m3* dst,
214
+ const __nv_fp8_e4m3* src) {
215
+ *((__nv_fp8x4_e4m3*)dst) = *((__nv_fp8x4_e4m3*)src);
216
+ }
217
+
218
+ // __nv_fp8_e4m3 x 8
219
+
220
+ template <>
221
+ struct vec_t<__nv_fp8_e4m3, 8> {
222
+ uint2 data;
223
+
224
+ FLASHINFER_INLINE __nv_fp8_e4m3& operator[](size_t i) { return ((__nv_fp8_e4m3*)(&data))[i]; }
225
+ FLASHINFER_INLINE const __nv_fp8_e4m3& operator[](size_t i) const {
226
+ return ((const __nv_fp8_e4m3*)(&data))[i];
227
+ }
228
+ FLASHINFER_INLINE __nv_fp8_e4m3* ptr() { return reinterpret_cast<__nv_fp8_e4m3*>(&data); }
229
+ FLASHINFER_INLINE void fill(__nv_fp8_e4m3 val);
230
+ FLASHINFER_INLINE void load(const __nv_fp8_e4m3* ptr);
231
+ FLASHINFER_INLINE void store(__nv_fp8_e4m3* ptr) const;
232
+ template <typename T>
233
+ FLASHINFER_INLINE void cast_from(const vec_t<T, 8>& src) {
234
+ cast_from_impl(*this, src);
235
+ }
236
+ template <typename T>
237
+ FLASHINFER_INLINE void cast_load(const T* ptr) {
238
+ cast_load_impl(*this, ptr);
239
+ }
240
+ template <typename T>
241
+ FLASHINFER_INLINE void cast_store(T* ptr) const {
242
+ cast_store_impl(ptr, *this);
243
+ }
244
+
245
+ FLASHINFER_INLINE static void memcpy(__nv_fp8_e4m3* dst, const __nv_fp8_e4m3* src);
246
+ };
247
+
248
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 8>::fill(__nv_fp8_e4m3 val) {
249
+ ((__nv_fp8x4_e4m3*)(&data.x))->__x =
250
+ (__nv_fp8x4_storage_t(val.__x) << 24) | (__nv_fp8x4_storage_t(val.__x) << 16) |
251
+ (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
252
+ ((__nv_fp8x4_e4m3*)(&data.y))->__x =
253
+ (__nv_fp8x4_storage_t(val.__x) << 24) | (__nv_fp8x4_storage_t(val.__x) << 16) |
254
+ (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
255
+ }
256
+
257
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 8>::load(const __nv_fp8_e4m3* ptr) {
258
+ data = *((uint2*)ptr);
259
+ }
260
+
261
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 8>::store(__nv_fp8_e4m3* ptr) const {
262
+ *((uint2*)ptr) = data;
263
+ }
264
+
265
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 8>::memcpy(__nv_fp8_e4m3* dst,
266
+ const __nv_fp8_e4m3* src) {
267
+ *((uint2*)dst) = *((uint2*)src);
268
+ }
269
+
270
+ // __nv_fp8_e4m3 x 16 or more
271
+ template <size_t vec_size>
272
+ struct vec_t<__nv_fp8_e4m3, vec_size> {
273
+ uint4 data[vec_size / 16];
274
+
275
+ FLASHINFER_INLINE __nv_fp8_e4m3& operator[](size_t i) { return ((__nv_fp8_e4m3*)data)[i]; }
276
+ FLASHINFER_INLINE const __nv_fp8_e4m3& operator[](size_t i) const {
277
+ return ((const __nv_fp8_e4m3*)data)[i];
278
+ }
279
+ FLASHINFER_INLINE __nv_fp8_e4m3* ptr() { return reinterpret_cast<__nv_fp8_e4m3*>(&data); }
280
+ FLASHINFER_INLINE void fill(__nv_fp8_e4m3 val) {
281
+ #pragma unroll
282
+ for (size_t i = 0; i < vec_size / 16; ++i) {
283
+ ((__nv_fp8x4_e4m3*)(&(data[i].x)))->__x =
284
+ (__nv_fp8x4_storage_t(val.__x) << 24) | (__nv_fp8x4_storage_t(val.__x) << 16) |
285
+ (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
286
+ ((__nv_fp8x4_e4m3*)(&(data[i].y)))->__x =
287
+ (__nv_fp8x4_storage_t(val.__x) << 24) | (__nv_fp8x4_storage_t(val.__x) << 16) |
288
+ (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
289
+ ((__nv_fp8x4_e4m3*)(&(data[i].z)))->__x =
290
+ (__nv_fp8x4_storage_t(val.__x) << 24) | (__nv_fp8x4_storage_t(val.__x) << 16) |
291
+ (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
292
+ ((__nv_fp8x4_e4m3*)(&(data[i].w)))->__x =
293
+ (__nv_fp8x4_storage_t(val.__x) << 24) | (__nv_fp8x4_storage_t(val.__x) << 16) |
294
+ (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
295
+ }
296
+ }
297
+ FLASHINFER_INLINE void load(const __nv_fp8_e4m3* ptr) {
298
+ #pragma unroll
299
+ for (size_t i = 0; i < vec_size / 16; ++i) {
300
+ data[i] = ((uint4*)ptr)[i];
301
+ }
302
+ }
303
+ FLASHINFER_INLINE void store(__nv_fp8_e4m3* ptr) const {
304
+ #pragma unroll
305
+ for (size_t i = 0; i < vec_size / 16; ++i) {
306
+ ((uint4*)ptr)[i] = data[i];
307
+ }
308
+ }
309
+ template <typename T>
310
+ FLASHINFER_INLINE void cast_from(const vec_t<T, vec_size>& src) {
311
+ cast_from_impl(*this, src);
312
+ }
313
+ template <typename T>
314
+ FLASHINFER_INLINE void cast_load(const T* ptr) {
315
+ cast_load_impl(*this, ptr);
316
+ }
317
+ template <typename T>
318
+ FLASHINFER_INLINE void cast_store(T* ptr) const {
319
+ cast_store_impl(ptr, *this);
320
+ }
321
+
322
+ FLASHINFER_INLINE static void memcpy(__nv_fp8_e4m3* dst, const __nv_fp8_e4m3* src) {
323
+ #pragma unroll
324
+ for (size_t i = 0; i < vec_size / 16; ++i) {
325
+ ((uint4*)dst)[i] = ((uint4*)src)[i];
326
+ }
327
+ }
328
+ };
329
+
330
+ /******************* vec_t<__nv_fp8_e5m2> *******************/
331
+
332
+ // __nv_fp8_e5m2 x 1
333
+ template <>
334
+ struct vec_t<__nv_fp8_e5m2, 1> {
335
+ __nv_fp8_e5m2 data;
336
+
337
+ FLASHINFER_INLINE __nv_fp8_e5m2& operator[](size_t i) { return ((__nv_fp8_e5m2*)(&data))[i]; }
338
+ FLASHINFER_INLINE const __nv_fp8_e5m2& operator[](size_t i) const {
339
+ return ((const __nv_fp8_e5m2*)(&data))[i];
340
+ }
341
+ FLASHINFER_INLINE __nv_fp8_e5m2* ptr() { return reinterpret_cast<__nv_fp8_e5m2*>(&data); }
342
+ FLASHINFER_INLINE void fill(__nv_fp8_e5m2 val);
343
+ FLASHINFER_INLINE void load(const __nv_fp8_e5m2* ptr);
344
+ FLASHINFER_INLINE void store(__nv_fp8_e5m2* ptr) const;
345
+ template <typename T>
346
+ FLASHINFER_INLINE void cast_from(const vec_t<T, 1>& src) {
347
+ cast_from_impl(*this, src);
348
+ }
349
+ template <typename T>
350
+ FLASHINFER_INLINE void cast_load(const T* ptr) {
351
+ cast_load_impl(*this, ptr);
352
+ }
353
+ template <typename T>
354
+ FLASHINFER_INLINE void cast_store(T* ptr) const {
355
+ cast_store_impl(ptr, *this);
356
+ }
357
+
358
+ FLASHINFER_INLINE static void memcpy(__nv_fp8_e5m2* dst, const __nv_fp8_e5m2* src);
359
+ };
360
+
361
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 1>::fill(__nv_fp8_e5m2 val) { data = val; }
362
+
363
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 1>::load(const __nv_fp8_e5m2* ptr) { data = *ptr; }
364
+
365
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 1>::store(__nv_fp8_e5m2* ptr) const { *ptr = data; }
366
+
367
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 1>::memcpy(__nv_fp8_e5m2* dst,
368
+ const __nv_fp8_e5m2* src) {
369
+ *dst = *src;
370
+ }
371
+
372
+ // __nv_fp8_e5m2 x 2
373
+ template <>
374
+ struct vec_t<__nv_fp8_e5m2, 2> {
375
+ __nv_fp8x2_e5m2 data;
376
+
377
+ FLASHINFER_INLINE __nv_fp8_e5m2& operator[](size_t i) { return ((__nv_fp8_e5m2*)(&data))[i]; }
378
+ FLASHINFER_INLINE const __nv_fp8_e5m2& operator[](size_t i) const {
379
+ return ((const __nv_fp8_e5m2*)(&data))[i];
380
+ }
381
+ FLASHINFER_INLINE __nv_fp8_e5m2* ptr() { return reinterpret_cast<__nv_fp8_e5m2*>(&data); }
382
+ FLASHINFER_INLINE void fill(__nv_fp8_e5m2 val);
383
+ FLASHINFER_INLINE void load(const __nv_fp8_e5m2* ptr);
384
+ FLASHINFER_INLINE void store(__nv_fp8_e5m2* ptr) const;
385
+ template <typename T>
386
+ FLASHINFER_INLINE void cast_from(const vec_t<T, 2>& src) {
387
+ cast_from_impl(*this, src);
388
+ }
389
+ template <typename T>
390
+ FLASHINFER_INLINE void cast_load(const T* ptr) {
391
+ cast_load_impl(*this, ptr);
392
+ }
393
+ template <typename T>
394
+ FLASHINFER_INLINE void cast_store(T* ptr) const {
395
+ cast_store_impl(ptr, *this);
396
+ }
397
+
398
+ FLASHINFER_INLINE static void memcpy(__nv_fp8_e5m2* dst, const __nv_fp8_e5m2* src);
399
+ };
400
+
401
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 2>::fill(__nv_fp8_e5m2 val) {
402
+ data.__x = (__nv_fp8x2_storage_t(val.__x) << 8) | __nv_fp8x2_storage_t(val.__x);
403
+ }
404
+
405
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 2>::load(const __nv_fp8_e5m2* ptr) {
406
+ data = *((__nv_fp8x2_e5m2*)ptr);
407
+ }
408
+
409
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 2>::store(__nv_fp8_e5m2* ptr) const {
410
+ *((__nv_fp8x2_e5m2*)ptr) = data;
411
+ }
412
+
413
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 2>::memcpy(__nv_fp8_e5m2* dst,
414
+ const __nv_fp8_e5m2* src) {
415
+ *((__nv_fp8x2_e5m2*)dst) = *((__nv_fp8x2_e5m2*)src);
416
+ }
417
+
418
+ // __nv_fp8_e5m2 x 4
419
+
420
+ template <>
421
+ struct vec_t<__nv_fp8_e5m2, 4> {
422
+ __nv_fp8x4_e5m2 data;
423
+
424
+ FLASHINFER_INLINE __nv_fp8_e5m2& operator[](size_t i) { return ((__nv_fp8_e5m2*)(&data))[i]; }
425
+ FLASHINFER_INLINE const __nv_fp8_e5m2& operator[](size_t i) const {
426
+ return ((const __nv_fp8_e5m2*)(&data))[i];
427
+ }
428
+ FLASHINFER_INLINE __nv_fp8_e5m2* ptr() { return reinterpret_cast<__nv_fp8_e5m2*>(&data); }
429
+ FLASHINFER_INLINE void fill(__nv_fp8_e5m2 val);
430
+ FLASHINFER_INLINE void load(const __nv_fp8_e5m2* ptr);
431
+ FLASHINFER_INLINE void store(__nv_fp8_e5m2* ptr) const;
432
+ template <typename T>
433
+ FLASHINFER_INLINE void cast_from(const vec_t<T, 4>& src) {
434
+ cast_from_impl(*this, src);
435
+ }
436
+ template <typename T>
437
+ FLASHINFER_INLINE void cast_load(const T* ptr) {
438
+ cast_load_impl(*this, ptr);
439
+ }
440
+ template <typename T>
441
+ FLASHINFER_INLINE void cast_store(T* ptr) const {
442
+ cast_store_impl(ptr, *this);
443
+ }
444
+
445
+ FLASHINFER_INLINE static void memcpy(__nv_fp8_e5m2* dst, const __nv_fp8_e5m2* src);
446
+ };
447
+
448
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 4>::fill(__nv_fp8_e5m2 val) {
449
+ data.__x = (__nv_fp8x4_storage_t(val.__x) << 24) | (__nv_fp8x4_storage_t(val.__x) << 16) |
450
+ (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
451
+ }
452
+
453
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 4>::load(const __nv_fp8_e5m2* ptr) {
454
+ data = *((__nv_fp8x4_e5m2*)ptr);
455
+ }
456
+
457
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 4>::store(__nv_fp8_e5m2* ptr) const {
458
+ *((__nv_fp8x4_e5m2*)ptr) = data;
459
+ }
460
+
461
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 4>::memcpy(__nv_fp8_e5m2* dst,
462
+ const __nv_fp8_e5m2* src) {
463
+ *((__nv_fp8x4_e5m2*)dst) = *((__nv_fp8x4_e5m2*)src);
464
+ }
465
+
466
+ // __nv_fp8_e5m2 x 8
467
+
468
+ template <>
469
+ struct vec_t<__nv_fp8_e5m2, 8> {
470
+ uint2 data;
471
+
472
+ FLASHINFER_INLINE __nv_fp8_e5m2& operator[](size_t i) { return ((__nv_fp8_e5m2*)(&data))[i]; }
473
+ FLASHINFER_INLINE const __nv_fp8_e5m2& operator[](size_t i) const {
474
+ return ((const __nv_fp8_e5m2*)(&data))[i];
475
+ }
476
+ FLASHINFER_INLINE __nv_fp8_e5m2* ptr() { return reinterpret_cast<__nv_fp8_e5m2*>(&data); }
477
+ FLASHINFER_INLINE void fill(__nv_fp8_e5m2 val);
478
+ FLASHINFER_INLINE void load(const __nv_fp8_e5m2* ptr);
479
+ FLASHINFER_INLINE void store(__nv_fp8_e5m2* ptr) const;
480
+ template <typename T>
481
+ FLASHINFER_INLINE void cast_from(const vec_t<T, 8>& src) {
482
+ cast_from_impl(*this, src);
483
+ }
484
+ template <typename T>
485
+ FLASHINFER_INLINE void cast_load(const T* ptr) {
486
+ cast_load_impl(*this, ptr);
487
+ }
488
+ template <typename T>
489
+ FLASHINFER_INLINE void cast_store(T* ptr) const {
490
+ cast_store_impl(ptr, *this);
491
+ }
492
+ FLASHINFER_INLINE static void memcpy(__nv_fp8_e5m2* dst, const __nv_fp8_e5m2* src);
493
+ };
494
+
495
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 8>::fill(__nv_fp8_e5m2 val) {
496
+ ((__nv_fp8x4_e5m2*)(&data.x))->__x =
497
+ (__nv_fp8x4_storage_t(val.__x) << 24) | (__nv_fp8x4_storage_t(val.__x) << 16) |
498
+ (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
499
+ ((__nv_fp8x4_e5m2*)(&data.y))->__x =
500
+ (__nv_fp8x4_storage_t(val.__x) << 24) | (__nv_fp8x4_storage_t(val.__x) << 16) |
501
+ (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
502
+ }
503
+
504
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 8>::load(const __nv_fp8_e5m2* ptr) {
505
+ data = *((uint2*)ptr);
506
+ }
507
+
508
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 8>::store(__nv_fp8_e5m2* ptr) const {
509
+ *((uint2*)ptr) = data;
510
+ }
511
+
512
+ FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 8>::memcpy(__nv_fp8_e5m2* dst,
513
+ const __nv_fp8_e5m2* src) {
514
+ *((uint2*)dst) = *((uint2*)src);
515
+ }
516
+
517
+ // __nv_fp8_e5m2 x 16 or more
518
+
519
+ template <size_t vec_size>
520
+ struct vec_t<__nv_fp8_e5m2, vec_size> {
521
+ uint4 data[vec_size / 16];
522
+
523
+ FLASHINFER_INLINE __nv_fp8_e5m2& operator[](size_t i) { return ((__nv_fp8_e5m2*)data)[i]; }
524
+ FLASHINFER_INLINE const __nv_fp8_e5m2& operator[](size_t i) const {
525
+ return ((const __nv_fp8_e5m2*)data)[i];
526
+ }
527
+ FLASHINFER_INLINE __nv_fp8_e5m2* ptr() { return reinterpret_cast<__nv_fp8_e5m2*>(&data); }
528
+ FLASHINFER_INLINE void fill(__nv_fp8_e5m2 val) {
529
+ #pragma unroll
530
+ for (size_t i = 0; i < vec_size / 16; ++i) {
531
+ ((__nv_fp8x4_e5m2*)(&(data[i].x)))->__x =
532
+ (__nv_fp8x4_storage_t(val.__x) << 24) | (__nv_fp8x4_storage_t(val.__x) << 16) |
533
+ (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
534
+ ((__nv_fp8x4_e5m2*)(&(data[i].y)))->__x =
535
+ (__nv_fp8x4_storage_t(val.__x) << 24) | (__nv_fp8x4_storage_t(val.__x) << 16) |
536
+ (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
537
+ ((__nv_fp8x4_e5m2*)(&(data[i].z)))->__x =
538
+ (__nv_fp8x4_storage_t(val.__x) << 24) | (__nv_fp8x4_storage_t(val.__x) << 16) |
539
+ (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
540
+ ((__nv_fp8x4_e5m2*)(&(data[i].w)))->__x =
541
+ (__nv_fp8x4_storage_t(val.__x) << 24) | (__nv_fp8x4_storage_t(val.__x) << 16) |
542
+ (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x);
543
+ }
544
+ }
545
+ FLASHINFER_INLINE void load(const __nv_fp8_e5m2* ptr) {
546
+ #pragma unroll
547
+ for (size_t i = 0; i < vec_size / 16; ++i) {
548
+ data[i] = ((uint4*)ptr)[i];
549
+ }
550
+ }
551
+ FLASHINFER_INLINE void store(__nv_fp8_e5m2* ptr) const {
552
+ #pragma unroll
553
+ for (size_t i = 0; i < vec_size / 16; ++i) {
554
+ ((uint4*)ptr)[i] = data[i];
555
+ }
556
+ }
557
+ template <typename T>
558
+ FLASHINFER_INLINE void cast_from(const vec_t<T, vec_size>& src) {
559
+ cast_from_impl(*this, src);
560
+ }
561
+ template <typename T>
562
+ FLASHINFER_INLINE void cast_load(const T* ptr) {
563
+ cast_load_impl(*this, ptr);
564
+ }
565
+ template <typename T>
566
+ FLASHINFER_INLINE void cast_store(T* ptr) const {
567
+ cast_store_impl(ptr, *this);
568
+ }
569
+ FLASHINFER_INLINE static void memcpy(__nv_fp8_e5m2* dst, const __nv_fp8_e5m2* src) {
570
+ #pragma unroll
571
+ for (size_t i = 0; i < vec_size / 16; ++i) {
572
+ ((uint4*)dst)[i] = ((uint4*)src)[i];
573
+ }
574
+ }
575
+ };
576
+ #endif
577
+
578
+ /******************* vec_t<half> *******************/
579
+
580
+ // half x 1
581
+ template <>
582
+ struct vec_t<half, 1> {
583
+ half data;
584
+
585
+ FLASHINFER_INLINE half& operator[](size_t i) { return ((half*)(&data))[i]; }
586
+ FLASHINFER_INLINE const half& operator[](size_t i) const { return ((const half*)(&data))[i]; }
587
+ FLASHINFER_INLINE half* ptr() { return reinterpret_cast<half*>(&data); }
588
+ FLASHINFER_INLINE void fill(half val);
589
+ FLASHINFER_INLINE void load(const half* ptr);
590
+ FLASHINFER_INLINE void store(half* ptr) const;
591
+ template <typename T>
592
+ FLASHINFER_INLINE void cast_from(const vec_t<T, 1>& src) {
593
+ cast_from_impl(*this, src);
594
+ }
595
+ template <typename T>
596
+ FLASHINFER_INLINE void cast_load(const T* ptr) {
597
+ cast_load_impl(*this, ptr);
598
+ }
599
+ template <typename T>
600
+ FLASHINFER_INLINE void cast_store(T* ptr) const {
601
+ cast_store_impl(ptr, *this);
602
+ }
603
+
604
+ FLASHINFER_INLINE static void memcpy(half* dst, const half* src);
605
+ };
606
+
607
+ FLASHINFER_INLINE void vec_t<half, 1>::fill(half val) { data = val; }
608
+
609
+ FLASHINFER_INLINE void vec_t<half, 1>::load(const half* ptr) { data = *ptr; }
610
+
611
+ FLASHINFER_INLINE void vec_t<half, 1>::store(half* ptr) const { *ptr = data; }
612
+
613
+ FLASHINFER_INLINE void vec_t<half, 1>::memcpy(half* dst, const half* src) { *dst = *src; }
614
+
615
+ // half x 2
616
+ template <>
617
+ struct vec_t<half, 2> {
618
+ half2 data;
619
+
620
+ FLASHINFER_INLINE half& operator[](size_t i) { return ((half*)(&data))[i]; }
621
+ FLASHINFER_INLINE const half& operator[](size_t i) const { return ((const half*)(&data))[i]; }
622
+ FLASHINFER_INLINE half* ptr() { return reinterpret_cast<half*>(&data); }
623
+ FLASHINFER_INLINE void fill(half val);
624
+ FLASHINFER_INLINE void load(const half* ptr);
625
+ FLASHINFER_INLINE void store(half* ptr) const;
626
+ template <typename T>
627
+ FLASHINFER_INLINE void cast_from(const vec_t<T, 2>& src) {
628
+ cast_from_impl(*this, src);
629
+ }
630
+ template <typename T>
631
+ FLASHINFER_INLINE void cast_load(const T* ptr) {
632
+ cast_load_impl(*this, ptr);
633
+ }
634
+ template <typename T>
635
+ FLASHINFER_INLINE void cast_store(T* ptr) const {
636
+ cast_store_impl(ptr, *this);
637
+ }
638
+
639
+ FLASHINFER_INLINE static void memcpy(half* dst, const half* src);
640
+ };
641
+
642
+ FLASHINFER_INLINE void vec_t<half, 2>::fill(half val) { data = make_half2(val, val); }
643
+
644
+ FLASHINFER_INLINE void vec_t<half, 2>::load(const half* ptr) { data = *((half2*)ptr); }
645
+
646
+ FLASHINFER_INLINE void vec_t<half, 2>::store(half* ptr) const { *((half2*)ptr) = data; }
647
+
648
+ FLASHINFER_INLINE void vec_t<half, 2>::memcpy(half* dst, const half* src) {
649
+ *((half2*)dst) = *((half2*)src);
650
+ }
651
+
652
+ // half x 4
653
+
654
+ template <>
655
+ struct vec_t<half, 4> {
656
+ uint2 data;
657
+
658
+ FLASHINFER_INLINE half& operator[](size_t i) { return ((half*)(&data))[i]; }
659
+ FLASHINFER_INLINE const half& operator[](size_t i) const { return ((const half*)(&data))[i]; }
660
+ FLASHINFER_INLINE half* ptr() { return reinterpret_cast<half*>(&data); }
661
+ FLASHINFER_INLINE void fill(half val);
662
+ FLASHINFER_INLINE void load(const half* ptr);
663
+ FLASHINFER_INLINE void store(half* ptr) const;
664
+ template <typename T>
665
+ FLASHINFER_INLINE void cast_from(const vec_t<T, 4>& src) {
666
+ cast_from_impl(*this, src);
667
+ }
668
+ template <typename T>
669
+ FLASHINFER_INLINE void cast_load(const T* ptr) {
670
+ cast_load_impl(*this, ptr);
671
+ }
672
+ template <typename T>
673
+ FLASHINFER_INLINE void cast_store(T* ptr) const {
674
+ cast_store_impl(ptr, *this);
675
+ }
676
+ FLASHINFER_INLINE static void memcpy(half* dst, const half* src);
677
+ };
678
+
679
+ FLASHINFER_INLINE void vec_t<half, 4>::fill(half val) {
680
+ *(half2*)(&data.x) = make_half2(val, val);
681
+ *(half2*)(&data.y) = make_half2(val, val);
682
+ }
683
+
684
+ FLASHINFER_INLINE void vec_t<half, 4>::load(const half* ptr) { data = *((uint2*)ptr); }
685
+
686
+ FLASHINFER_INLINE void vec_t<half, 4>::store(half* ptr) const { *((uint2*)ptr) = data; }
687
+
688
+ FLASHINFER_INLINE void vec_t<half, 4>::memcpy(half* dst, const half* src) {
689
+ *((uint2*)dst) = *((uint2*)src);
690
+ }
691
+
692
+ // half x 8 or more
693
+
694
+ template <size_t vec_size>
695
+ struct vec_t<half, vec_size> {
696
+ uint4 data[vec_size / 8];
697
+ FLASHINFER_INLINE half& operator[](size_t i) { return ((half*)data)[i]; }
698
+ FLASHINFER_INLINE const half& operator[](size_t i) const { return ((const half*)data)[i]; }
699
+ FLASHINFER_INLINE half* ptr() { return reinterpret_cast<half*>(&data); }
700
+ FLASHINFER_INLINE void fill(half val) {
701
+ #pragma unroll
702
+ for (size_t i = 0; i < vec_size / 8; ++i) {
703
+ *(half2*)(&(data[i].x)) = make_half2(val, val);
704
+ *(half2*)(&(data[i].y)) = make_half2(val, val);
705
+ *(half2*)(&(data[i].z)) = make_half2(val, val);
706
+ *(half2*)(&(data[i].w)) = make_half2(val, val);
707
+ }
708
+ }
709
+ FLASHINFER_INLINE void load(const half* ptr) {
710
+ #pragma unroll
711
+ for (size_t i = 0; i < vec_size / 8; ++i) {
712
+ data[i] = ((uint4*)ptr)[i];
713
+ }
714
+ }
715
+ FLASHINFER_INLINE void store(half* ptr) const {
716
+ #pragma unroll
717
+ for (size_t i = 0; i < vec_size / 8; ++i) {
718
+ ((uint4*)ptr)[i] = data[i];
719
+ }
720
+ }
721
+ template <typename T>
722
+ FLASHINFER_INLINE void cast_from(const vec_t<T, vec_size>& src) {
723
+ cast_from_impl(*this, src);
724
+ }
725
+ template <typename T>
726
+ FLASHINFER_INLINE void cast_load(const T* ptr) {
727
+ cast_load_impl(*this, ptr);
728
+ }
729
+ template <typename T>
730
+ FLASHINFER_INLINE void cast_store(T* ptr) const {
731
+ cast_store_impl(ptr, *this);
732
+ }
733
+ FLASHINFER_INLINE static void memcpy(half* dst, const half* src) {
734
+ #pragma unroll
735
+ for (size_t i = 0; i < vec_size / 8; ++i) {
736
+ ((uint4*)dst)[i] = ((uint4*)src)[i];
737
+ }
738
+ }
739
+ };
740
+
741
+ /******************* vec_t<nv_bfloat16> *******************/
742
+
743
+ // nv_bfloat16 x 1
744
+ template <>
745
+ struct vec_t<nv_bfloat16, 1> {
746
+ nv_bfloat16 data;
747
+ FLASHINFER_INLINE nv_bfloat16& operator[](size_t i) { return ((nv_bfloat16*)(&data))[i]; }
748
+ FLASHINFER_INLINE const nv_bfloat16& operator[](size_t i) const {
749
+ return ((const nv_bfloat16*)(&data))[i];
750
+ }
751
+ FLASHINFER_INLINE nv_bfloat16* ptr() { return reinterpret_cast<nv_bfloat16*>(&data); }
752
+ FLASHINFER_INLINE void fill(nv_bfloat16 val);
753
+ FLASHINFER_INLINE void load(const nv_bfloat16* ptr);
754
+ FLASHINFER_INLINE void store(nv_bfloat16* ptr) const;
755
+ template <typename T>
756
+ FLASHINFER_INLINE void cast_from(const vec_t<T, 1>& src) {
757
+ cast_from_impl(*this, src);
758
+ }
759
+ template <typename T>
760
+ FLASHINFER_INLINE void cast_load(const T* ptr) {
761
+ cast_load_impl(*this, ptr);
762
+ }
763
+ template <typename T>
764
+ FLASHINFER_INLINE void cast_store(T* ptr) const {
765
+ cast_store_impl(ptr, *this);
766
+ }
767
+ FLASHINFER_INLINE static void memcpy(nv_bfloat16* dst, const nv_bfloat16* src);
768
+ };
769
+
770
+ FLASHINFER_INLINE void vec_t<nv_bfloat16, 1>::fill(nv_bfloat16 val) { data = val; }
771
+
772
+ FLASHINFER_INLINE void vec_t<nv_bfloat16, 1>::load(const nv_bfloat16* ptr) { data = *ptr; }
773
+
774
+ FLASHINFER_INLINE void vec_t<nv_bfloat16, 1>::store(nv_bfloat16* ptr) const { *ptr = data; }
775
+
776
+ FLASHINFER_INLINE void vec_t<nv_bfloat16, 1>::memcpy(nv_bfloat16* dst, const nv_bfloat16* src) {
777
+ *dst = *src;
778
+ }
779
+
780
+ // nv_bfloat16 x 2
781
+ template <>
782
+ struct vec_t<nv_bfloat16, 2> {
783
+ nv_bfloat162 data;
784
+
785
+ FLASHINFER_INLINE nv_bfloat16& operator[](size_t i) { return ((nv_bfloat16*)(&data))[i]; }
786
+ FLASHINFER_INLINE const nv_bfloat16& operator[](size_t i) const {
787
+ return ((const nv_bfloat16*)(&data))[i];
788
+ }
789
+ FLASHINFER_INLINE nv_bfloat16* ptr() { return reinterpret_cast<nv_bfloat16*>(&data); }
790
+ FLASHINFER_INLINE void fill(nv_bfloat16 val);
791
+ FLASHINFER_INLINE void load(const nv_bfloat16* ptr);
792
+ FLASHINFER_INLINE void store(nv_bfloat16* ptr) const;
793
+ template <typename T>
794
+ FLASHINFER_INLINE void cast_from(const vec_t<T, 2>& src) {
795
+ cast_from_impl(*this, src);
796
+ }
797
+ template <typename T>
798
+ FLASHINFER_INLINE void cast_load(const T* ptr) {
799
+ cast_load_impl(*this, ptr);
800
+ }
801
+ template <typename T>
802
+ FLASHINFER_INLINE void cast_store(T* ptr) const {
803
+ cast_store_impl(ptr, *this);
804
+ }
805
+ FLASHINFER_INLINE static void memcpy(nv_bfloat16* dst, const nv_bfloat16* src);
806
+ };
807
+
808
+ FLASHINFER_INLINE void vec_t<nv_bfloat16, 2>::fill(nv_bfloat16 val) {
809
+ data = make_bfloat162(val, val);
810
+ }
811
+
812
+ FLASHINFER_INLINE void vec_t<nv_bfloat16, 2>::load(const nv_bfloat16* ptr) {
813
+ data = *((nv_bfloat162*)ptr);
814
+ }
815
+
816
+ FLASHINFER_INLINE void vec_t<nv_bfloat16, 2>::store(nv_bfloat16* ptr) const {
817
+ *((nv_bfloat162*)ptr) = data;
818
+ }
819
+
820
+ FLASHINFER_INLINE void vec_t<nv_bfloat16, 2>::memcpy(nv_bfloat16* dst, const nv_bfloat16* src) {
821
+ *((nv_bfloat162*)dst) = *((nv_bfloat162*)src);
822
+ }
823
+
824
+ // nv_bfloat16 x 4
825
+
826
+ template <>
827
+ struct vec_t<nv_bfloat16, 4> {
828
+ uint2 data;
829
+
830
+ FLASHINFER_INLINE nv_bfloat16& operator[](size_t i) { return ((nv_bfloat16*)(&data))[i]; }
831
+ FLASHINFER_INLINE const nv_bfloat16& operator[](size_t i) const {
832
+ return ((const nv_bfloat16*)(&data))[i];
833
+ }
834
+ FLASHINFER_INLINE nv_bfloat16* ptr() { return reinterpret_cast<nv_bfloat16*>(&data); }
835
+ FLASHINFER_INLINE void fill(nv_bfloat16 val);
836
+ FLASHINFER_INLINE void load(const nv_bfloat16* ptr);
837
+ FLASHINFER_INLINE void store(nv_bfloat16* ptr) const;
838
+ template <typename T>
839
+ FLASHINFER_INLINE void cast_from(const vec_t<T, 4>& src) {
840
+ cast_from_impl(*this, src);
841
+ }
842
+ template <typename T>
843
+ FLASHINFER_INLINE void cast_load(const T* ptr) {
844
+ cast_load_impl(*this, ptr);
845
+ }
846
+ template <typename T>
847
+ FLASHINFER_INLINE void cast_store(T* ptr) const {
848
+ cast_store_impl(ptr, *this);
849
+ }
850
+ FLASHINFER_INLINE static void memcpy(nv_bfloat16* dst, const nv_bfloat16* src);
851
+ };
852
+
853
+ FLASHINFER_INLINE void vec_t<nv_bfloat16, 4>::fill(nv_bfloat16 val) {
854
+ *(nv_bfloat162*)(&data.x) = make_bfloat162(val, val);
855
+ *(nv_bfloat162*)(&data.y) = make_bfloat162(val, val);
856
+ }
857
+
858
+ FLASHINFER_INLINE void vec_t<nv_bfloat16, 4>::load(const nv_bfloat16* ptr) {
859
+ data = *((uint2*)ptr);
860
+ }
861
+
862
+ FLASHINFER_INLINE void vec_t<nv_bfloat16, 4>::store(nv_bfloat16* ptr) const {
863
+ *((uint2*)ptr) = data;
864
+ }
865
+
866
+ FLASHINFER_INLINE void vec_t<nv_bfloat16, 4>::memcpy(nv_bfloat16* dst, const nv_bfloat16* src) {
867
+ *((uint2*)dst) = *((uint2*)src);
868
+ }
869
+
870
+ // nv_bfloat16 x 8 or more
871
+
872
+ template <size_t vec_size>
873
+ struct vec_t<nv_bfloat16, vec_size> {
874
+ uint4 data[vec_size / 8];
875
+
876
+ FLASHINFER_INLINE nv_bfloat16& operator[](size_t i) { return ((nv_bfloat16*)data)[i]; }
877
+ FLASHINFER_INLINE const nv_bfloat16& operator[](size_t i) const {
878
+ return ((const nv_bfloat16*)data)[i];
879
+ }
880
+ FLASHINFER_INLINE nv_bfloat16* ptr() { return reinterpret_cast<nv_bfloat16*>(&data); }
881
+ FLASHINFER_INLINE void fill(nv_bfloat16 val) {
882
+ #pragma unroll
883
+ for (size_t i = 0; i < vec_size / 8; ++i) {
884
+ *(nv_bfloat162*)(&(data[i].x)) = make_bfloat162(val, val);
885
+ *(nv_bfloat162*)(&(data[i].y)) = make_bfloat162(val, val);
886
+ *(nv_bfloat162*)(&(data[i].z)) = make_bfloat162(val, val);
887
+ *(nv_bfloat162*)(&(data[i].w)) = make_bfloat162(val, val);
888
+ }
889
+ }
890
+ FLASHINFER_INLINE void load(const nv_bfloat16* ptr) {
891
+ #pragma unroll
892
+ for (size_t i = 0; i < vec_size / 8; ++i) {
893
+ data[i] = ((uint4*)ptr)[i];
894
+ }
895
+ }
896
+ FLASHINFER_INLINE void store(nv_bfloat16* ptr) const {
897
+ #pragma unroll
898
+ for (size_t i = 0; i < vec_size / 8; ++i) {
899
+ ((uint4*)ptr)[i] = data[i];
900
+ }
901
+ }
902
+ template <typename T>
903
+ FLASHINFER_INLINE void cast_from(const vec_t<T, vec_size>& src) {
904
+ cast_from_impl(*this, src);
905
+ }
906
+ template <typename T>
907
+ FLASHINFER_INLINE void cast_load(const T* ptr) {
908
+ cast_load_impl(*this, ptr);
909
+ }
910
+ template <typename T>
911
+ FLASHINFER_INLINE void cast_store(T* ptr) const {
912
+ cast_store_impl(ptr, *this);
913
+ }
914
+ FLASHINFER_INLINE static void memcpy(nv_bfloat16* dst, const nv_bfloat16* src) {
915
+ #pragma unroll
916
+ for (size_t i = 0; i < vec_size / 8; ++i) {
917
+ ((uint4*)dst)[i] = ((uint4*)src)[i];
918
+ }
919
+ }
920
+ };
921
+
922
+ /******************* vec_t<float> *******************/
923
+
924
+ // float x 1
925
+
926
+ template <>
927
+ struct vec_t<float, 1> {
928
+ float data;
929
+
930
+ FLASHINFER_INLINE float& operator[](size_t i) { return ((float*)(&data))[i]; }
931
+ FLASHINFER_INLINE const float& operator[](size_t i) const { return ((const float*)(&data))[i]; }
932
+ FLASHINFER_INLINE float* ptr() { return reinterpret_cast<float*>(&data); }
933
+ FLASHINFER_INLINE void fill(float val);
934
+ FLASHINFER_INLINE void load(const float* ptr);
935
+ FLASHINFER_INLINE void store(float* ptr) const;
936
+ template <typename T>
937
+ FLASHINFER_INLINE void cast_from(const vec_t<T, 1>& src) {
938
+ cast_from_impl(*this, src);
939
+ }
940
+ template <typename T>
941
+ FLASHINFER_INLINE void cast_load(const T* ptr) {
942
+ cast_load_impl(*this, ptr);
943
+ }
944
+ template <typename T>
945
+ FLASHINFER_INLINE void cast_store(T* ptr) const {
946
+ cast_store_impl(ptr, *this);
947
+ }
948
+ FLASHINFER_INLINE static void memcpy(float* dst, const float* src);
949
+ };
950
+
951
+ FLASHINFER_INLINE void vec_t<float, 1>::fill(float val) { data = val; }
952
+
953
+ FLASHINFER_INLINE void vec_t<float, 1>::load(const float* ptr) { data = *ptr; }
954
+
955
+ FLASHINFER_INLINE void vec_t<float, 1>::store(float* ptr) const { *ptr = data; }
956
+
957
+ FLASHINFER_INLINE void vec_t<float, 1>::memcpy(float* dst, const float* src) { *dst = *src; }
958
+
959
+ // float x 2
960
+
961
+ template <>
962
+ struct vec_t<float, 2> {
963
+ float2 data;
964
+
965
+ FLASHINFER_INLINE float& operator[](size_t i) { return ((float*)(&data))[i]; }
966
+ FLASHINFER_INLINE const float& operator[](size_t i) const { return ((const float*)(&data))[i]; }
967
+ FLASHINFER_INLINE float* ptr() { return reinterpret_cast<float*>(&data); }
968
+ FLASHINFER_INLINE void fill(float val);
969
+ FLASHINFER_INLINE void load(const float* ptr);
970
+ FLASHINFER_INLINE void store(float* ptr) const;
971
+ template <typename T>
972
+ FLASHINFER_INLINE void cast_from(const vec_t<T, 2>& src) {
973
+ cast_from_impl(*this, src);
974
+ }
975
+ template <typename T>
976
+ FLASHINFER_INLINE void cast_load(const T* ptr) {
977
+ cast_load_impl(*this, ptr);
978
+ }
979
+ template <typename T>
980
+ FLASHINFER_INLINE void cast_store(T* ptr) const {
981
+ cast_store_impl(ptr, *this);
982
+ }
983
+ FLASHINFER_INLINE static void memcpy(float* dst, const float* src);
984
+ };
985
+
986
+ FLASHINFER_INLINE void vec_t<float, 2>::fill(float val) { data = make_float2(val, val); }
987
+
988
+ FLASHINFER_INLINE void vec_t<float, 2>::load(const float* ptr) { data = *((float2*)ptr); }
989
+
990
+ FLASHINFER_INLINE void vec_t<float, 2>::store(float* ptr) const { *((float2*)ptr) = data; }
991
+
992
+ FLASHINFER_INLINE void vec_t<float, 2>::memcpy(float* dst, const float* src) {
993
+ *((float2*)dst) = *((float2*)src);
994
+ }
995
+
996
+ // float x 4 or more
997
+ template <size_t vec_size>
998
+ struct vec_t<float, vec_size> {
999
+ float4 data[vec_size / 4];
1000
+
1001
+ FLASHINFER_INLINE float& operator[](size_t i) { return ((float*)(data))[i]; }
1002
+ FLASHINFER_INLINE const float& operator[](size_t i) const { return ((const float*)(data))[i]; }
1003
+ FLASHINFER_INLINE float* ptr() { return reinterpret_cast<float*>(&data); }
1004
+ FLASHINFER_INLINE void fill(float val) {
1005
+ #pragma unroll
1006
+ for (size_t i = 0; i < vec_size / 4; ++i) {
1007
+ data[i] = make_float4(val, val, val, val);
1008
+ }
1009
+ }
1010
+ FLASHINFER_INLINE void load(const float* ptr) {
1011
+ #pragma unroll
1012
+ for (size_t i = 0; i < vec_size / 4; ++i) {
1013
+ data[i] = ((float4*)ptr)[i];
1014
+ }
1015
+ }
1016
+ FLASHINFER_INLINE void store(float* ptr) const {
1017
+ #pragma unroll
1018
+ for (size_t i = 0; i < vec_size / 4; ++i) {
1019
+ ((float4*)ptr)[i] = data[i];
1020
+ }
1021
+ }
1022
+ template <typename T>
1023
+ FLASHINFER_INLINE void cast_from(const vec_t<T, vec_size>& src) {
1024
+ cast_from_impl(*this, src);
1025
+ }
1026
+ template <typename T>
1027
+ FLASHINFER_INLINE void cast_load(const T* ptr) {
1028
+ cast_load_impl(*this, ptr);
1029
+ }
1030
+ template <typename T>
1031
+ FLASHINFER_INLINE void cast_store(T* ptr) const {
1032
+ cast_store_impl(ptr, *this);
1033
+ }
1034
+ FLASHINFER_INLINE static void memcpy(float* dst, const float* src) {
1035
+ #pragma unroll
1036
+ for (size_t i = 0; i < vec_size / 4; ++i) {
1037
+ ((float4*)dst)[i] = ((float4*)src)[i];
1038
+ }
1039
+ }
1040
+ };
1041
+
1042
+ /******************* vec_t type cast *******************/
1043
+
1044
+ template <typename dst_t, typename src_t, size_t vec_size>
1045
+ FLASHINFER_INLINE void vec_cast(dst_t* dst, const src_t* src) {
1046
+ #pragma unroll
1047
+ for (size_t i = 0; i < vec_size; ++i) {
1048
+ dst[i] = src[i];
1049
+ }
1050
+ }
1051
+
1052
+ template <size_t vec_size>
1053
+ FLASHINFER_INLINE void vec_cast<float, half>(float* dst, const half* src) {
1054
+ #pragma unroll
1055
+ for (size_t i = 0; i < vec_size / 2; ++i) {
1056
+ ((float2*)dst)[i] = __half22float2(((half2*)src)[i]);
1057
+ }
1058
+ }
1059
+
1060
+ template <size_t vec_size>
1061
+ FLASHINFER_INLINE void vec_cast<half, float>(half* dst, const float* src) {
1062
+ #pragma unroll
1063
+ for (size_t i = 0; i < vec_size / 2; ++i) {
1064
+ ((half2*)dst)[i] = __float22half2_rn(((float2*)src)[i]);
1065
+ }
1066
+ }
1067
+
1068
+ template <size_t vec_size>
1069
+ FLASHINFER_INLINE void vec_cast<float, nv_bfloat16>(float* dst, const nv_bfloat16* src) {
1070
+ #pragma unroll
1071
+ for (size_t i = 0; i < vec_size / 2; ++i) {
1072
+ ((float2*)dst)[i] = __bfloat1622float2(((nv_bfloat162*)src)[i]);
1073
+ }
1074
+ }
1075
+
1076
+ template <size_t vec_size>
1077
+ FLASHINFER_INLINE void vec_cast<nv_bfloat16, float>(nv_bfloat16* dst, const float* src) {
1078
+ #pragma unroll
1079
+ for (size_t i = 0; i < vec_size / 2; ++i) {
1080
+ ((nv_bfloat162*)dst)[i] = __float22bfloat162_rn(((float2*)src)[i]);
1081
+ }
1082
+ }
1083
+
1084
+ template <size_t vec_size>
1085
+ FLASHINFER_INLINE void cast_from_impl(vec_t<float, vec_size>& dst,
1086
+ const vec_t<half, vec_size>& src) {
1087
+ if constexpr (vec_size == 1) {
1088
+ dst.data = float(src.data);
1089
+ } else {
1090
+ #pragma unroll
1091
+ for (size_t i = 0; i < vec_size / 2; ++i) {
1092
+ ((float2*)(&dst.data))[i] = __half22float2(((half2*)(&src.data))[i]);
1093
+ }
1094
+ }
1095
+ }
1096
+
1097
+ template <size_t vec_size>
1098
+ FLASHINFER_INLINE void cast_from_impl(vec_t<half, vec_size>& dst,
1099
+ const vec_t<float, vec_size>& src) {
1100
+ if constexpr (vec_size == 1) {
1101
+ dst.data = half(src.data);
1102
+ } else {
1103
+ #pragma unroll
1104
+ for (size_t i = 0; i < vec_size / 2; ++i) {
1105
+ ((half2*)(&dst.data))[i] = __float22half2_rn(((float2*)(&src.data))[i]);
1106
+ }
1107
+ }
1108
+ }
1109
+
1110
+ template <size_t vec_size>
1111
+ FLASHINFER_INLINE void cast_from_impl(vec_t<float, vec_size>& dst,
1112
+ const vec_t<nv_bfloat16, vec_size>& src) {
1113
+ if constexpr (vec_size == 1) {
1114
+ dst.data = float(src.data);
1115
+ } else {
1116
+ #pragma unroll
1117
+ for (size_t i = 0; i < vec_size / 2; ++i) {
1118
+ ((float2*)(&dst.data))[i] = __bfloat1622float2(((nv_bfloat162*)(&src.data))[i]);
1119
+ }
1120
+ }
1121
+ }
1122
+
1123
+ template <size_t vec_size>
1124
+ FLASHINFER_INLINE void cast_from_impl(vec_t<nv_bfloat16, vec_size>& dst,
1125
+ const vec_t<float, vec_size>& src) {
1126
+ if constexpr (vec_size == 1) {
1127
+ dst.data = nv_bfloat16(src.data);
1128
+ } else {
1129
+ #pragma unroll
1130
+ for (size_t i = 0; i < vec_size / 2; ++i) {
1131
+ ((nv_bfloat162*)(&dst.data))[i] = __float22bfloat162_rn(((float2*)(&src.data))[i]);
1132
+ }
1133
+ }
1134
+ }
1135
+
1136
+ #ifdef FLASHINFER_ENABLE_FP8
1137
+
1138
+ template <size_t vec_size>
1139
+ FLASHINFER_INLINE void cast_from_impl(vec_t<float, vec_size>& dst,
1140
+ const vec_t<__nv_fp8_e4m3, vec_size>& src) {
1141
+ if constexpr (vec_size == 1) {
1142
+ dst.data = float(src.data);
1143
+ } else if constexpr (vec_size == 2) {
1144
+ *(float2*)(&dst.data) = float2(*(__nv_fp8x2_e4m3*)(&src.data));
1145
+ } else {
1146
+ #pragma unroll
1147
+ for (size_t i = 0; i < vec_size / 4; ++i) {
1148
+ ((float4*)(&dst.data))[i] = float4(((__nv_fp8x4_e4m3*)(&src.data))[i]);
1149
+ }
1150
+ }
1151
+ }
1152
+
1153
+ template <size_t vec_size>
1154
+ FLASHINFER_INLINE void cast_from_impl(vec_t<half, vec_size>& dst,
1155
+ const vec_t<__nv_fp8_e4m3, vec_size>& src) {
1156
+ if constexpr (vec_size == 1) {
1157
+ dst.data = float(src.data);
1158
+ } else {
1159
+ #pragma unroll
1160
+ for (size_t i = 0; i < vec_size / 2; ++i) {
1161
+ ((half2*)(&dst.data))[i] = half2(((__nv_fp8x2_e4m3*)(&src.data))[i]);
1162
+ }
1163
+ }
1164
+ }
1165
+
1166
+ template <size_t vec_size>
1167
+ FLASHINFER_INLINE void cast_from_impl(vec_t<__nv_fp8_e4m3, vec_size>& dst,
1168
+ const vec_t<float, vec_size>& src) {
1169
+ if constexpr (vec_size == 1) {
1170
+ dst.data = __nv_fp8_e4m3(src.data);
1171
+ } else if constexpr (vec_size == 2) {
1172
+ *(__nv_fp8x2_e4m3*)(&dst.data) = __nv_fp8x2_e4m3(*(float2*)(&src.data));
1173
+ } else {
1174
+ #pragma unroll
1175
+ for (size_t i = 0; i < vec_size / 4; ++i) {
1176
+ ((__nv_fp8x4_e4m3*)(&dst.data))[i] = __nv_fp8x4_e4m3(((float4*)(&src.data))[i]);
1177
+ }
1178
+ }
1179
+ }
1180
+
1181
+ template <size_t vec_size>
1182
+ FLASHINFER_INLINE void cast_from_impl(vec_t<__nv_fp8_e4m3, vec_size>& dst,
1183
+ const vec_t<half, vec_size>& src) {
1184
+ if constexpr (vec_size == 1) {
1185
+ dst.data = __nv_fp8_e4m3(src.data);
1186
+ } else if constexpr (vec_size == 2) {
1187
+ *(__nv_fp8x2_e4m3*)(&dst.data) = __nv_fp8x2_e4m3(*(half2*)(&src.data));
1188
+ } else {
1189
+ #pragma unroll
1190
+ for (size_t i = 0; i < vec_size / 4; ++i) {
1191
+ // NOTE(Zihao): need to double check if we properly handle flo and fhi
1192
+ ((__nv_fp8x4_e4m3*)(&dst.data))[i] =
1193
+ __nv_fp8x4_e4m3(((half2*)(&src.data))[i * 2], ((half2*)(&src.data))[i * 2 + 1]);
1194
+ }
1195
+ }
1196
+ }
1197
+
1198
+ template <size_t vec_size>
1199
+ FLASHINFER_INLINE void cast_from_impl(vec_t<float, vec_size>& dst,
1200
+ const vec_t<__nv_fp8_e5m2, vec_size>& src) {
1201
+ if constexpr (vec_size == 1) {
1202
+ dst.data = float(src.data);
1203
+ } else if constexpr (vec_size == 2) {
1204
+ *(float2*)(&dst.data) = float2(*(__nv_fp8x2_e5m2*)(&src.data));
1205
+ } else {
1206
+ #pragma unroll
1207
+ for (size_t i = 0; i < vec_size / 4; ++i) {
1208
+ ((float4*)(&dst.data))[i] = float4(((__nv_fp8x4_e5m2*)(&src.data))[i]);
1209
+ }
1210
+ }
1211
+ }
1212
+
1213
+ template <size_t vec_size>
1214
+ FLASHINFER_INLINE void cast_from_impl(vec_t<half, vec_size>& dst,
1215
+ const vec_t<__nv_fp8_e5m2, vec_size>& src) {
1216
+ if constexpr (vec_size == 1) {
1217
+ dst.data = float(src.data);
1218
+ } else {
1219
+ #pragma unroll
1220
+ for (size_t i = 0; i < vec_size / 2; ++i) {
1221
+ ((half2*)(&dst.data))[i] = half2(((__nv_fp8x2_e5m2*)(&src.data))[i]);
1222
+ }
1223
+ }
1224
+ }
1225
+
1226
+ template <size_t vec_size>
1227
+ FLASHINFER_INLINE void cast_from_impl(vec_t<__nv_fp8_e5m2, vec_size>& dst,
1228
+ const vec_t<float, vec_size>& src) {
1229
+ if constexpr (vec_size == 1) {
1230
+ dst.data = __nv_fp8_e5m2(src.data);
1231
+ } else if constexpr (vec_size == 2) {
1232
+ *(__nv_fp8x2_e5m2*)(&dst.data) = __nv_fp8x2_e5m2(*(float2*)(&src.data));
1233
+ } else {
1234
+ #pragma unroll
1235
+ for (size_t i = 0; i < vec_size / 4; ++i) {
1236
+ ((__nv_fp8x4_e5m2*)(&dst.data))[i] = __nv_fp8x4_e5m2(((float4*)(&src.data))[i]);
1237
+ }
1238
+ }
1239
+ }
1240
+
1241
+ template <size_t vec_size>
1242
+ FLASHINFER_INLINE void cast_from_impl(vec_t<__nv_fp8_e5m2, vec_size>& dst,
1243
+ const vec_t<half, vec_size>& src) {
1244
+ if constexpr (vec_size == 1) {
1245
+ dst.data = __nv_fp8_e5m2(src.data);
1246
+ } else if constexpr (vec_size == 2) {
1247
+ *(__nv_fp8x2_e5m2*)(&dst.data) = __nv_fp8x2_e5m2(*(half2*)(&src.data));
1248
+ } else {
1249
+ #pragma unroll
1250
+ for (size_t i = 0; i < vec_size / 4; ++i) {
1251
+ // NOTE(Zihao): need to double check if we properly handle flo and fhi
1252
+ ((__nv_fp8x4_e5m2*)(&dst.data))[i] =
1253
+ __nv_fp8x4_e5m2(((half2*)(&src.data))[i * 2], ((half2*)(&src.data))[i * 2 + 1]);
1254
+ }
1255
+ }
1256
+ }
1257
+
1258
+ #endif // FLASHINFER_ENABLE_FP8
1259
+
1260
+ } // namespace flashinfer
1261
+
1262
+ #endif // VEC_DTYPES_CUH_
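
The vec_t specializations above give the kernels a uniform way to do vectorized loads and stores with on-the-fly dtype conversion (cast_load, cast_store, fill). A minimal sketch of the intended usage pattern, assuming a hypothetical elementwise kernel that is not part of this commit:

    // Hypothetical example (not in this commit): cast-load a vector of halves,
    // scale in fp32, and cast-store the result, using the vec_t API above.
    #include <cuda_fp16.h>
    #include "flashinfer/vec_dtypes.cuh"

    template <uint32_t vec_size>
    __global__ void scale_cast_kernel(half* __restrict__ out,
                                      const half* __restrict__ in,
                                      float scale, size_t n) {
      size_t i = (blockIdx.x * size_t(blockDim.x) + threadIdx.x) * vec_size;
      if (i + vec_size <= n) {
        flashinfer::vec_t<float, vec_size> v;
        v.cast_load(in + i);    // half -> float on load
    #pragma unroll
        for (uint32_t j = 0; j < vec_size; ++j) {
          v[j] *= scale;        // compute in fp32
        }
        v.cast_store(out + i);  // float -> half on store
      }
    }

The sgmv/bgmv kernels below follow the same pattern, with vec_size typically chosen so that each access maps onto one of the wide, uint4-backed specializations above.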
punica_kernels/punica_ops.cc ADDED
@@ -0,0 +1,220 @@
1
+ #include <c10/cuda/CUDAStream.h>
2
+ #include <cuda_bf16.h>
3
+ #include <cuda_fp16.h>
4
+ #include <torch/all.h>
5
+
6
+ #include <cstdint>
7
+
8
+ #include "bgmv/bgmv_config.h"
9
+ #include "sgmv/sgmv.h"
10
+ #include "sgmv_flashinfer/sgmv_config.h"
11
+
12
+ //namespace
13
+ //{
14
+
15
+ //====== utils ======
16
+
17
+ inline constexpr uint64_t pack_u32(uint32_t a, uint32_t b)
18
+ {
19
+ return (uint64_t(a) << 32) | uint64_t(b);
20
+ }
21
+
22
+ #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
23
+
24
+ #define CHECK_CONTIGUOUS(x) \
25
+ TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
26
+
27
+ #define CHECK_INPUT(x) \
28
+ CHECK_CUDA(x); \
29
+ CHECK_CONTIGUOUS(x)
30
+
31
+ #define CHECK_DIM(d, x) \
32
+ TORCH_CHECK(x.dim() == d, #x " must be a " #d "D tensor")
33
+
34
+ #define CHECK_EQ(a, b) \
35
+ TORCH_CHECK((a) == (b), "CHECK_EQ(" #a ", " #b ") failed. ", a, " vs ", b)
36
+
37
+ #define CHECK_GE(a, b) \
38
+ TORCH_CHECK((a) >= (b), "CHECK_GE(" #a ", " #b ") failed. ", a, " vs ", b)
39
+
40
+ //====== dispatch pytorch dtype ======
41
+
42
+ #define _DISPATCH_SWITCH(cond, ...) \
43
+ [&]() -> bool { \
44
+ switch (cond) \
45
+ { \
46
+ __VA_ARGS__ \
47
+ default: \
48
+ return false; \
49
+ } \
50
+ }()
51
+
52
+ #define _DISPATCH_DTYPE_CASE(enum_type, c_type_, ...) \
53
+ case enum_type: \
54
+ { \
55
+ using c_type = c_type_; \
56
+ return __VA_ARGS__(); \
57
+ }
58
+
59
+ #define _DISPATCH_DTYPE_CASES(...) \
60
+ _DISPATCH_DTYPE_CASE(at::ScalarType::Half, nv_half, __VA_ARGS__) \
61
+ _DISPATCH_DTYPE_CASE(at::ScalarType::BFloat16, nv_bfloat16, __VA_ARGS__)
62
+
63
+ #define DISPATCH_TORCH_DTYPE(scalar_type, ...) \
64
+ _DISPATCH_SWITCH(scalar_type, _DISPATCH_DTYPE_CASES(__VA_ARGS__))
65
+
66
+ //====== bgmv ======
67
+
68
+ template <typename T>
69
+ inline bool launch_bgmv_kernel(T *Y, const T *X, T **W,
70
+ const int64_t *lora_indices,
71
+ uint16_t in_features, uint16_t out_features,
72
+ int64_t y_offset, int64_t full_y_size,
73
+ int64_t batch_size,
74
+ int64_t layer_idx, float scale)
75
+ {
76
+ switch (pack_u32(in_features, out_features))
77
+ {
78
+ #define CASE_ONESIDE(_T, feat_in, feat_out) \
79
+ case pack_u32(feat_in, feat_out): \
80
+ bgmv_kernel<feat_in, feat_out>(Y, X, W, lora_indices, y_offset, \
81
+ full_y_size, batch_size, \
82
+ layer_idx, scale); \
83
+ break;
84
+ #define CASE(_T, narrow, wide) \
85
+ CASE_ONESIDE(T, narrow, wide) \
86
+ CASE_ONESIDE(T, wide, narrow)
87
+
88
+ FOR_BGMV_WIDE_NARROW(CASE, _)
89
+ #undef CASE
90
+ #undef CASE_ONESIDE
91
+ default:
92
+ return false;
93
+ }
94
+
95
+ return true;
96
+ }
97
+
98
+ void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w_ptr,
99
+ torch::Tensor indicies, int64_t layer_idx, double scale)
100
+ {
101
+ CHECK_INPUT(y);
102
+ CHECK_INPUT(x);
103
+ CHECK_INPUT(w_ptr);
104
+ CHECK_INPUT(indicies);
105
+
106
+ CHECK_DIM(2, y);
107
+ CHECK_DIM(2, x);
108
+ CHECK_DIM(1, w_ptr);
109
+ CHECK_DIM(1, indicies);
110
+
111
+ int64_t B = x.size(0);
112
+ int64_t h_in = x.size(1);
113
+ int64_t h_out = y.size(1);
114
+ CHECK_EQ(indicies.size(0), x.size(0));
115
+ CHECK_EQ(y.size(0), x.size(0));
116
+ bool ok = false;
117
+ if (h_in < 65536 && h_out < 65536)
118
+ {
119
+ switch (x.scalar_type())
120
+ {
121
+ case at::ScalarType::Half:
122
+ ok = launch_bgmv_kernel(static_cast<nv_half *>(y.data_ptr()),
123
+ static_cast<nv_half *>(x.data_ptr()),
124
+ static_cast<nv_half **>(w_ptr.data_ptr()),
125
+ indicies.data_ptr<int64_t>(), h_in, h_out, 0, h_out, B,
126
+ layer_idx, scale);
127
+ break;
128
+ case at::ScalarType::BFloat16:
129
+ ok = launch_bgmv_kernel(static_cast<nv_bfloat16 *>(y.data_ptr()),
130
+ static_cast<nv_bfloat16 *>(x.data_ptr()),
131
+ static_cast<nv_bfloat16 **>(w_ptr.data_ptr()),
132
+ indicies.data_ptr<int64_t>(), h_in, h_out, 0, h_out, B,
133
+ layer_idx, scale);
134
+ break;
135
+ default:
136
+ break;
137
+ }
138
+ }
139
+ TORCH_CHECK(ok, "No suitable kernel.", " h_in=", h_in, " h_out=", h_out,
140
+ " dtype=", x.scalar_type());
141
+ }
142
+
143
+ //====== sgmv ======
144
+
145
+ void dispatch_sgmv_cutlass(torch::Tensor y, torch::Tensor x, torch::Tensor w_ptr,
146
+ torch::Tensor s_start, torch::Tensor s_end,
147
+ torch::Tensor tmp, int64_t layer_idx)
148
+ {
149
+ CHECK_INPUT(y);
150
+ CHECK_INPUT(x);
151
+ CHECK_INPUT(w_ptr);
152
+ CHECK_INPUT(s_start);
153
+ CHECK_INPUT(s_end);
154
+ CHECK_INPUT(tmp);
155
+
156
+ CHECK_DIM(2, y);
157
+ CHECK_DIM(2, x);
158
+ CHECK_DIM(1, w_ptr);
159
+ CHECK_DIM(1, s_start);
160
+ CHECK_DIM(1, s_end);
161
+ CHECK_DIM(1, tmp);
162
+
163
+ int num_problems = s_start.size(0);
164
+ int d_in = x.size(1);
165
+ int d_out = y.size(1);
166
+ CHECK_EQ(tmp.size(0), static_cast<int64_t>(sgmv_tmp_size(num_problems)));
167
+ cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
168
+ bool ok = DISPATCH_TORCH_DTYPE(x.scalar_type(), [&]
169
+ { return sgmv<c_type>((c_type *)y.data_ptr(), (c_type *)x.data_ptr(), (c_type **)w_ptr.data_ptr(),
170
+ s_start.data_ptr<int32_t>(), s_end.data_ptr<int32_t>(),
171
+ tmp.data_ptr<uint8_t>(), num_problems, d_in, d_out,
172
+ layer_idx, stream); });
173
+ TORCH_CHECK(ok, "No suitable kernel.", " dtype=", x.scalar_type());
174
+ }
175
+
176
+ void dispatch_sgmv_shrink(torch::Tensor y, torch::Tensor x, torch::Tensor w_ptr,
177
+ torch::Tensor s_start, torch::Tensor s_end, torch::Tensor tmp, int64_t layer_idx)
178
+ {
179
+ CHECK_INPUT(y);
180
+ CHECK_INPUT(x);
181
+ CHECK_INPUT(w_ptr);
182
+ CHECK_INPUT(s_start);
183
+ CHECK_INPUT(s_end);
184
+ CHECK_INPUT(tmp);
185
+
186
+ CHECK_DIM(2, y);
187
+ CHECK_DIM(2, x);
188
+ CHECK_DIM(1, w_ptr);
189
+ CHECK_DIM(1, s_start);
190
+ CHECK_DIM(1, s_end);
191
+ CHECK_DIM(1, tmp);
192
+
193
+ uint32_t num_problems = s_start.size(0);
194
+ uint32_t d_in = x.size(1);
195
+ uint32_t d_out = y.size(1);
196
+ CHECK_EQ(tmp.scalar_type(), at::ScalarType::Byte);
197
+ CHECK_EQ(tmp.size(0), 8 * 1024 * 1024);
198
+ cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
199
+
200
+ #define CASE(_T, D_OUT) \
201
+ case D_OUT: \
202
+ return sgmv_shrink<c_type, D_OUT>( \
203
+ (c_type *)y.data_ptr(), (c_type *)x.data_ptr(), \
204
+ (c_type **)w_ptr.data_ptr(), s_start.data_ptr<int32_t>(), s_end.data_ptr<int32_t>(), \
205
+ tmp.data_ptr<uint8_t>(), num_problems, d_in, layer_idx, stream);
206
+
207
+ bool ok = DISPATCH_TORCH_DTYPE(x.scalar_type(), [&]
208
+ {
209
+ switch (d_out) {
210
+ FOR_SGMV_NARROW(CASE, c_type);
211
+ default:
212
+ return false;
213
+ } });
214
+
215
+ #undef CASE
216
+ TORCH_CHECK(ok, "No suitable kernel.", " dtype=", x.scalar_type(),
217
+ " d_out=", d_out);
218
+ }
219
+ //} // namespace
220
+
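
punica_ops.cc only defines the host-side dispatch functions; the Torch binding that exposes them to Python lives elsewhere in the build. A minimal sketch of such a binding, assuming a conventional pybind11 torch extension (the layout and docstrings here are illustrative, not the repository's actual binding file):

    // Hypothetical binding sketch, not part of punica_ops.cc.
    #include <torch/extension.h>

    PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
      m.def("dispatch_bgmv", &dispatch_bgmv,
            "Batched gather matrix-vector multiply (bgmv)");
      m.def("dispatch_sgmv_cutlass", &dispatch_sgmv_cutlass,
            "Segmented gather matrix-vector multiply via CUTLASS grouped GEMM");
      m.def("dispatch_sgmv_shrink", &dispatch_sgmv_shrink,
            "Segmented gather matrix-vector shrink (flashinfer path)");
    }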
sgmv/sgmv.h ADDED
@@ -0,0 +1,10 @@
1
+ #pragma once
2
+ #include <cuda_runtime.h>
3
+
4
+ #include <cstdint>
5
+
6
+ template <typename DType>
7
+ bool sgmv(DType *y, DType *x, DType **w, int32_t *s_start, int32_t *s_end,
8
+ void *tmp_d, int num_problems, int d_in, int d_out, int layer_idx, cudaStream_t stream);
9
+
10
+ int64_t sgmv_tmp_size(int64_t num_problems);
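
The caller owns the scratch buffer: sgmv_tmp_size reports how many bytes the grouped-GEMM path needs for a given number of problems, and that buffer is passed in as tmp_d. A minimal host-side sketch, assuming raw CUDA allocation and device pointers already prepared by the caller (run_sgmv is an illustrative helper, not part of this header):

    // Hypothetical usage of the sgmv API declared above.
    #include <cuda_fp16.h>
    #include <cuda_runtime.h>
    #include "sgmv/sgmv.h"

    bool run_sgmv(nv_half* y, nv_half* x, nv_half** w,
                  int32_t* s_start, int32_t* s_end,
                  int num_problems, int d_in, int d_out,
                  int layer_idx, cudaStream_t stream) {
      void* tmp_d = nullptr;  // scratch for the grouped-GEMM argument arrays
      cudaMalloc(&tmp_d, sgmv_tmp_size(num_problems));
      bool ok = sgmv<nv_half>(y, x, w, s_start, s_end, tmp_d,
                              num_problems, d_in, d_out, layer_idx, stream);
      cudaStreamSynchronize(stream);  // tmp_d is read asynchronously by the launched kernels
      cudaFree(tmp_d);
      return ok;
    }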
sgmv/sgmv_cutlass.cu ADDED
@@ -0,0 +1,14 @@
1
+ #include <cuda_bf16.h>
2
+ #include <cuda_fp16.h>
3
+
4
+ #include "sgmv_cutlass.cuh"
5
+
6
+ template bool sgmv<nv_half>(nv_half *y, nv_half *x, nv_half **w,
7
+ int32_t *s_start, int32_t *s_end,
8
+ void *tmp_d, int num_problems, int d_in, int d_out,
9
+ int layer_idx, cudaStream_t stream);
10
+
11
+ template bool sgmv<nv_bfloat16>(nv_bfloat16 *y, nv_bfloat16 *x, nv_bfloat16 **w,
12
+ int32_t *s_start, int32_t *s_end,
13
+ void *tmp_d, int num_problems, int d_in, int d_out,
14
+ int layer_idx, cudaStream_t stream);
sgmv/sgmv_cutlass.cuh ADDED
@@ -0,0 +1,180 @@
1
+ #pragma once
2
+ #include <cuda_bf16.h>
3
+ #include <cuda_fp16.h>
4
+ #include <cuda_runtime.h>
5
+
6
+ #include <cstdint>
7
+ #include <cstdio>
8
+
9
+ #include "cutlass/cutlass.h"
10
+ #include "cutlass/gemm/device/gemm_grouped.h"
11
+ #include "cutlass/gemm/kernel/default_gemm_grouped.h"
12
+ #include "cutlass/layout/matrix.h"
13
+ #include "cutlass/numeric_types.h"
14
+
15
+ template <typename T>
16
+ struct cutlass_dtype {
17
+ using type = T;
18
+ };
19
+
20
+ template <>
21
+ struct cutlass_dtype<half> {
22
+ using type = cutlass::half_t;
23
+ };
24
+
25
+ template <>
26
+ struct cutlass_dtype<nv_bfloat16> {
27
+ using type = cutlass::bfloat16_t;
28
+ };
29
+
30
+ template <typename T>
31
+ __global__ void precompute_sgmv_args(cutlass::gemm::GemmCoord *all_problems,
32
+ T **ptr_y, T **ptr_x, T **ptr_w,
33
+ int64_t *ld_y, int64_t *ld_x,
34
+ int64_t *ld_w, T *y, T *x, T **w,
35
+ int32_t *s_start, int32_t *s_end,
36
+ int d_in, int d_out,
37
+ int layer_idx) {
38
+ int i = blockIdx.x;
39
+ int m = s_end[i] - s_start[i], k = d_in, n = d_out;
40
+ if (m <= 0) {
41
+ m = 0;
42
+ n = 0;
43
+ k = 0;
44
+ }
45
+ all_problems[i] = cutlass::gemm::GemmCoord(m, n, k);
46
+ ptr_w[i] = w[i] + layer_idx * d_in * d_out;
47
+ ptr_x[i] = x + s_start[i] * d_in;
48
+ ptr_y[i] = y + s_start[i] * d_out;
49
+ ld_x[i] = k;
50
+ ld_w[i] = n;
51
+ ld_y[i] = n;
52
+ }
53
+
54
+ int64_t sgmv_tmp_size(int64_t num_problems) {
55
+ constexpr auto sz = sizeof(void *) * 3 + sizeof(int64_t) * 3 +
56
+ sizeof(cutlass::gemm::GemmCoord);
57
+ return sz * num_problems;
58
+ }
59
+
60
+ template <typename T>
61
+ inline T *alloc_from_buf(void **buf, int n) {
62
+ auto *p = (T *)*buf;
63
+ *buf = (void *)(p + n);
64
+ return p;
65
+ }
66
+
67
+ template <typename DType>
68
+ bool sgmv(DType *y, DType *x, DType **w, int32_t *s_start, int32_t *s_end,
69
+ void *tmp_d, int num_problems, int d_in, int d_out, int layer_idx,
70
+ cudaStream_t stream) {
71
+ using cutlass_t = typename cutlass_dtype<DType>::type;
72
+
73
+ auto ptr_Y = alloc_from_buf<cutlass_t *>(&tmp_d, num_problems);
74
+ auto ptr_X = alloc_from_buf<cutlass_t *>(&tmp_d, num_problems);
75
+ auto ptr_W = alloc_from_buf<cutlass_t *>(&tmp_d, num_problems);
76
+ auto ld_Y = alloc_from_buf<int64_t>(&tmp_d, num_problems);
77
+ auto ld_X = alloc_from_buf<int64_t>(&tmp_d, num_problems);
78
+ auto ld_W = alloc_from_buf<int64_t>(&tmp_d, num_problems);
79
+ auto all_problems =
80
+ alloc_from_buf<cutlass::gemm::GemmCoord>(&tmp_d, num_problems);
81
+
82
+ precompute_sgmv_args<<<num_problems, 1, 0, stream>>>(
83
+ all_problems, ptr_Y, ptr_X, ptr_W, ld_Y, ld_X, ld_W, (cutlass_t *)y,
84
+ (cutlass_t *)x, (cutlass_t **)w, s_start, s_end, d_in, d_out, layer_idx);
85
+
86
+ using cutlass::epilogue::thread::LinearCombination;
87
+ using cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle;
88
+ if (d_in < d_out) {
89
+ // Expand
90
+ using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmGrouped<
91
+ cutlass_t, // Element A
92
+ cutlass::layout::RowMajor, // Layout A
93
+ cutlass::ComplexTransform::kNone, //
94
+ 8, // Granularity A
95
+ cutlass_t, // Element B
96
+ cutlass::layout::RowMajor, // Layout B
97
+ cutlass::ComplexTransform::kNone, //
98
+ 8, // Granularity B
99
+ cutlass_t, // Element C&D
100
+ cutlass::layout::RowMajor, // Layout C&D
101
+ float, // Element Accumulator
102
+ cutlass::arch::OpClassTensorOp, // Operator Class Tag
103
+ cutlass::arch::Sm80, // Architecture
104
+ cutlass::gemm::GemmShape<32, 128, 16>, // Thread Block Shape
105
+ cutlass::gemm::GemmShape<32, 64, 16>, // Warp Shape
106
+ cutlass::gemm::GemmShape<16, 8, 8>, // Instruction Shape
107
+ LinearCombination<cutlass_t, 8, float, float>, // Epilogue
108
+ GemmIdentityThreadblockSwizzle<1>, // Swizzling Operator
109
+ 2 // Stages
110
+ >::GemmKernel;
111
+
112
+ using EpilogueOutputOp = typename GemmKernel::Epilogue::OutputOp;
113
+ typename EpilogueOutputOp::Params epilogue_op(1.0, 1.0);
114
+
115
+ using GemmGrouped = cutlass::gemm::device::GemmGrouped<GemmKernel>;
116
+ typename GemmGrouped::Arguments args(all_problems, num_problems, 512,
117
+ epilogue_op, ptr_X, ptr_W, ptr_Y,
118
+ ptr_Y, ld_X, ld_W, ld_Y, ld_Y);
119
+
120
+ GemmGrouped gemm;
121
+ auto status = gemm.initialize(args, nullptr, stream);
122
+ if (status != cutlass::Status::kSuccess) {
123
+ fprintf(stderr, "sgmv_cutlass gemm.initialize failed: %s\n",
124
+ cutlassGetStatusString(status));
125
+ return false;
126
+ }
127
+ status = gemm.run(stream);
128
+ if (status != cutlass::Status::kSuccess) {
129
+ fprintf(stderr, "sgmv_cutlass gemm.run failed: %s\n",
130
+ cutlassGetStatusString(status));
131
+ return false;
132
+ }
133
+ } else {
134
+ // Shrink
135
+ using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmGrouped<
136
+ cutlass_t, // Element A
137
+ cutlass::layout::RowMajor, // Layout A
138
+ cutlass::ComplexTransform::kNone, //
139
+ 8, // Granularity A
140
+ cutlass_t, // Element B
141
+ cutlass::layout::RowMajor, // Layout B
142
+ cutlass::ComplexTransform::kNone, //
143
+ 8, // Granularity B
144
+ cutlass_t, // Element C&D
145
+ cutlass::layout::RowMajor, // Layout C&D
146
+ float, // Element Accumulator
147
+ cutlass::arch::OpClassTensorOp, // Operator Class Tag
148
+ cutlass::arch::Sm80, // Architecture
149
+ cutlass::gemm::GemmShape<16, 64, 64>, // Thread Block Shape
150
+ cutlass::gemm::GemmShape<16, 16, 64>, // Warp Shape
151
+ cutlass::gemm::GemmShape<16, 8, 16>, // Instruction Shape
152
+ LinearCombination<cutlass_t, 4, float, float>, // Epilogue
153
+ GemmIdentityThreadblockSwizzle<2>, // Swizzling Operator
154
+ 2 // Stages
155
+ >::GemmKernel;
156
+
157
+ using EpilogueOutputOp = typename GemmKernel::Epilogue::OutputOp;
158
+ typename EpilogueOutputOp::Params epilogue_op(1.0, 1.0);
159
+
160
+ using GemmGrouped = cutlass::gemm::device::GemmGrouped<GemmKernel>;
161
+ typename GemmGrouped::Arguments args(all_problems, num_problems, 512,
162
+ epilogue_op, ptr_X, ptr_W, ptr_Y,
163
+ ptr_Y, ld_X, ld_W, ld_Y, ld_Y);
164
+
165
+ GemmGrouped gemm;
166
+ auto status = gemm.initialize(args, nullptr, stream);
167
+ if (status != cutlass::Status::kSuccess) {
168
+ fprintf(stderr, "sgmv_cutlass gemm.initialize failed: %s\n",
169
+ cutlassGetStatusString(status));
170
+ return false;
171
+ }
172
+ status = gemm.run(stream);
173
+ if (status != cutlass::Status::kSuccess) {
174
+ fprintf(stderr, "sgmv_cutlass gemm.run failed: %s\n",
175
+ cutlassGetStatusString(status));
176
+ return false;
177
+ }
178
+ }
179
+ return true;
180
+ }
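
alloc_from_buf carves the caller-provided scratch buffer in exactly the order sgmv_tmp_size accounts for it: three pointer arrays, three leading-dimension arrays and one GemmCoord array, each of length num_problems. Per problem that works out as follows (assuming 64-bit pointers and CUTLASS's three-int GemmCoord):

    // Per-problem scratch accounting, matching sgmv_tmp_size above.
    constexpr size_t kBytesPerProblem =
        3 * sizeof(void*)      // ptr_Y, ptr_X, ptr_W
        + 3 * sizeof(int64_t)  // ld_Y, ld_X, ld_W
        + 3 * sizeof(int);     // GemmCoord {m, n, k}
    static_assert(kBytesPerProblem == 60, "24 + 24 + 12 bytes per problem");

Placing the pointer and int64_t arrays first also keeps every sub-array 8-byte aligned without extra padding.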
sgmv_flashinfer/sgmv_all.cu ADDED
@@ -0,0 +1,73 @@
+ #include <cuda_bf16.h>
+ #include <cuda_fp16.h>
+ #include <cuda_runtime.h>
+
+ #include <cstdint>
+
+ #include "sgmv_config.h"
+ #include "sgmv_flashinfer.cuh"
+
+ template <typename T, uint32_t d_out>
+ bool sgmv_shrink(T* y, T* x, T** w, int32_t* s_start, int32_t* s_end, void* tmp,
+                  uint32_t num_problems, uint32_t d_in, uint32_t layer_idx, cudaStream_t stream) {
+   static_assert(d_out % 16 == 0);
+
+   constexpr uint32_t num_warps = 4;
+   constexpr uint32_t num_stages = 2;
+   constexpr uint32_t num_k_frags_per_stage = 8;
+   constexpr uint32_t num_blocks_n = d_out / 16;
+   uint32_t smem = num_stages * sizeof(T) * num_k_frags_per_stage * 16 * 16 *
+                   (num_warps + num_blocks_n);
+   auto cooperative_kernel =
+       flashinfer::sgmv::sgmv_shrink<true, T, int, num_warps, d_out>;
+   auto kernel = flashinfer::sgmv::sgmv_shrink<false, T, int, num_warps, d_out>;
+
+   int dev_id = 0;
+   int num_blocks_per_sm = 0;
+   int num_sm = 0;
+   bool use_cooperative = true;
+   cudaGetDevice(&dev_id);
+   cudaDeviceGetAttribute(&num_sm, cudaDevAttrMultiProcessorCount, dev_id);
+   cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+       &num_blocks_per_sm, cooperative_kernel, num_warps * 32, smem);
+
+   const uint32_t max_grid_size = num_sm * num_blocks_per_sm;
+
+   uint32_t chunk_size = 256;
+   uint32_t num_chunks = (d_in + chunk_size - 1) / chunk_size;
+   if (num_chunks * num_problems > max_grid_size) {
+     use_cooperative = false;
+     chunk_size = d_in;
+     num_chunks = 1;
+   }
+
+   dim3 nthrs(32, num_warps);
+   dim3 nblks(num_chunks, num_problems);
+
+   void* args[] = {(void*)&y,       (void*)&x,     (void*)&w,
+                   (void*)&s_start, (void*)&s_end, (void*)&tmp, (void*)&num_problems,
+                   (void*)&d_in,    (void*)&layer_idx, (void*)&chunk_size};
+
+   cudaError_t status;
+   if (use_cooperative) {
+     if (smem > 46 * 1024) {
+       cudaFuncSetAttribute(cooperative_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
+     }
+     status = cudaLaunchCooperativeKernel((void*)cooperative_kernel, nblks,
+                                          nthrs, args, smem, stream);
+   } else {
+     if (smem > 46 * 1024) {
+       cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
+     }
+     status = cudaLaunchKernel((void*)kernel, nblks, nthrs, args, smem, stream);
+   }
+   return status == cudaSuccess;
+ }
+
+ #define INST(T, d_out)                                                                           \
+   template bool sgmv_shrink<T, d_out>(T * y, T * x, T * *w, int32_t * s_start, int32_t * s_end, \
+                                       void* tmp, uint32_t num_problems,                         \
+                                       uint32_t d_in, uint32_t layer_idx, cudaStream_t stream);
+
+ FOR_SGMV_NARROW(INST, nv_half);
+ FOR_SGMV_NARROW(INST, nv_bfloat16);
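The launcher above prefers a cooperative launch that splits `d_in` into 256-wide chunks (one block per chunk per problem) and reduces the partial results with a grid-wide sync; when that grid would exceed the number of co-resident blocks reported by the occupancy query, it falls back to a single chunk per problem and a regular launch. A small Python sketch of that decision, illustrative only:

```python
def plan_sgmv_shrink_launch(d_in: int, num_problems: int, max_grid_size: int):
    """Mirror of the chunking logic above; returns (use_cooperative, chunk_size, num_chunks).

    max_grid_size stands in for num_sm * num_blocks_per_sm from the occupancy query.
    """
    chunk_size = 256
    num_chunks = (d_in + chunk_size - 1) // chunk_size
    if num_chunks * num_problems > max_grid_size:
        # The grid could not be co-resident, which a cooperative launch requires:
        # process the whole K dimension in one chunk with a regular launch instead.
        return False, d_in, 1
    return True, chunk_size, num_chunks
```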
sgmv_flashinfer/sgmv_config.h ADDED
@@ -0,0 +1,17 @@
+ #pragma once
+ #include <cstdint>
+
+ template <typename T, uint32_t d_out>
+ bool sgmv_shrink(T* y, T* x, T** w, int32_t* s_start, int32_t* s_end, void* tmp,
+                  uint32_t num_problems, uint32_t d_in, uint32_t layer_idx, cudaStream_t stream);
+
+ // clang-format off
+
+ #define FOR_SGMV_NARROW(f, T) \
+     f(T, 16) \
+     f(T, 32) \
+     f(T, 64) \
+     f(T, 96) \
+     f(T, 128)
+
+ // clang-format on
sgmv_flashinfer/sgmv_flashinfer.cuh ADDED
@@ -0,0 +1,356 @@
+ #pragma once
+ #include <cooperative_groups.h>
+
+ #include "flashinfer/cp_async.cuh"
+ #include "flashinfer/mma.cuh"
+ #include "flashinfer/permuted_smem.cuh"
+ #include "flashinfer/vec_dtypes.cuh"
+
+ namespace flashinfer {
+
+ namespace sgmv {
+
+ template <bool cooperative, typename T, typename IdType, uint32_t num_warps,
+           uint32_t d_out>
+ __global__ void sgmv_shrink(T* y, T* x, T** w, IdType* s_starts, IdType* s_ends, float* tmp,
+                             uint32_t num_problems, uint32_t d_in,
+                             uint32_t layer_idx, uint32_t chunk_size) {
+   auto block = cooperative_groups::this_thread_block();
+   auto grid = cooperative_groups::this_grid();
+   constexpr auto fill_mode = cp_async::SharedMemFillMode::kFillZero;
+   const uint32_t problem_id = blockIdx.y;
+   const uint32_t bx = blockIdx.x;
+
+   constexpr uint32_t num_stages = 2;
+   constexpr uint32_t num_k_frags = 8;
+   constexpr uint32_t num_cells_k = (num_k_frags * 16) / cell_capacity<T>();
+   constexpr uint32_t num_blocks_n = d_out / 16;
+   const uint32_t num_chunks = gridDim.x;
+   const uint32_t chunk_start = chunk_size * bx;
+   const uint32_t num_iterations =
+       (chunk_size + (num_k_frags * 16 - 1)) / (num_k_frags * 16);
+   constexpr uint32_t num_cells_n =
+       (d_out < 32 ? 32 : d_out) / cell_capacity<T>();
+   const uint32_t tx = threadIdx.x, ty = threadIdx.y;
+
+   extern __shared__ uint8_t smem[];
+
+   smem_t x_smem[2]{smem, smem + sizeof(T) * num_warps * 16 * 16 * num_k_frags};
+   smem_t w_smem[2]{smem + sizeof(T) * 2 * num_warps * 16 * 16 * num_k_frags,
+                    smem + sizeof(T) * 16 * 16 * num_k_frags *
+                               (2 * num_warps + num_blocks_n)};
+   smem_t y_smem(smem);
+
+   uint32_t x_frag[num_k_frags][4];
+   uint32_t w_frag[num_k_frags][num_blocks_n][4];
+   float y_frag[num_blocks_n][8];
+
+   const uint32_t s_start = s_starts[problem_id], s_end = s_ends[problem_id];
+   const uint32_t num_steps = (s_start < s_end) ? (s_end - s_start + (num_warps * 16 - 1)) / (num_warps * 16) : 0;
+   for (uint32_t i = 0; i < num_steps; ++i) {
+     // init y_frag
+     if (bx == 0) {
+       if constexpr (num_blocks_n == 1) {
+         uint32_t row_idx = s_start + (i * num_warps + ty) * 16 + tx / 2;
+         T* y_ptr = y + row_idx * d_out + (tx % 2) * cell_capacity<T>();
+         auto offset =
+             smem_t::get_permuted_offset<num_cells_n>(ty * 16 + tx / 2, tx % 2);
+         y_smem.load_128b_async<fill_mode>(offset, y_ptr, row_idx < s_end);
+       } else {
+         uint32_t row_idx = s_start + (i * num_warps + ty) * 16 + tx / 4;
+         T* y_ptr = y + row_idx * d_out + (tx % 4) * cell_capacity<T>();
+         auto offset =
+             smem_t::get_permuted_offset<num_cells_n>(ty * 16 + tx / 4, tx % 4);
+ #pragma unroll
+         for (uint32_t j = 0; j < 2; ++j) {
+ #pragma unroll
+           for (uint32_t fno = 0; fno < num_blocks_n / 2; ++fno) {
+             y_smem.load_128b_async<fill_mode>(offset, y_ptr, row_idx < s_end);
+             y_ptr += 4 * cell_capacity<T>();
+             offset += 8;
+           }
+           row_idx += 8;
+           y_ptr += 8 * d_out - 2 * num_blocks_n * cell_capacity<T>();
+           offset += 8 * num_cells_n - 4 * num_blocks_n;
+         }
+       }
+       cp_async::commit_group();
+       cp_async::wait_group<0>();
+       block.sync();
+
+       auto offset =
+           smem_t::get_permuted_offset<num_cells_n>(ty * 16 + tx % 16, tx / 16);
+ #pragma unroll
+       for (uint32_t fn = 0; fn < num_blocks_n; ++fn) {
+         uint32_t tmp[4];
+         y_smem.ldmatrix_m8n8x4(offset, tmp);
+         vec_cast<float, T, 8>(y_frag[fn], (T*)tmp);
+         offset = (offset ^ 0x2) + (fn & 0x1) * 8;
+       }
+     } else {
+ #pragma unroll
+       for (uint32_t fn = 0; fn < num_blocks_n; ++fn) {
+ #pragma unroll
+         for (uint32_t reg_id = 0; reg_id < 8; ++reg_id) {
+           y_frag[fn][reg_id] = 0.f;
+         }
+       }
+     }
+
+     // preload x_smem, w_smem
+ #pragma unroll
+     for (uint32_t iter = 0; iter < num_stages; ++iter) {
+       uint32_t row_idx = s_start + (i * num_warps + ty) * 16 + tx / 4;
+       T* x_ptr = x + row_idx * d_in + chunk_start +
+                  (2 * num_k_frags * iter + tx % 4) * cell_capacity<T>();
+       T* x_ptr_max = x + row_idx * d_in + min(d_in, chunk_start + chunk_size);
+       auto offset =
+           smem_t::get_permuted_offset<num_cells_k>(ty * 16 + tx / 4, tx % 4);
+       // pre-load x_smem, w_smem
+ #pragma unroll
+       for (uint32_t j = 0; j < 2; ++j) {
+ #pragma unroll
+         for (uint32_t fko = 0; fko < num_k_frags / 2; ++fko) {
+           x_smem[iter].load_128b_async<fill_mode>(
+               offset, x_ptr, row_idx < s_end && x_ptr < x_ptr_max);
+           x_ptr += 4 * cell_capacity<T>();
+           offset += 8;
+         }
+         row_idx += 8;
+         x_ptr += 8 * d_in - 2 * cell_capacity<T>() * num_k_frags;
+         x_ptr_max += 8 * d_in;
+         offset += 8 * num_cells_k - 4 * num_k_frags;
+       }
+       row_idx -= 8;
+
+       static_assert(num_k_frags % (num_warps * 2) == 0);
+       constexpr uint32_t num_fko_iters_per_warp = num_k_frags / (num_warps * 2);
+ #pragma unroll
+       for (uint32_t fn = 0; fn < num_blocks_n; ++fn) {
+         T* w_ptr = w[problem_id] + layer_idx * d_in * d_out +
+                    (fn * 16 + tx / 4) * d_in + chunk_start +
+                    (2 * num_k_frags * iter + ty * num_fko_iters_per_warp * 4 +
+                     tx % 4) *
+                        cell_capacity<T>();
+         T* w_ptr_max =
+             w[problem_id] + layer_idx * d_in * d_out +
+             min((fn * 16 + tx / 4 + 1) * d_in,
+                 (fn * 16 + tx / 4) * d_in + chunk_start + chunk_size);
+         auto offset = smem_t::get_permuted_offset<num_cells_k>(
+             fn * 16 + tx / 4, ty * num_fko_iters_per_warp * 4 + tx % 4);
+ #pragma unroll
+         for (uint32_t j = 0; j < 2; ++j) {
+ #pragma unroll
+           for (uint32_t fko = 0; fko < num_fko_iters_per_warp; ++fko) {
+             w_smem[iter].load_128b_async<fill_mode>(offset, w_ptr,
+                                                     w_ptr < w_ptr_max);
+             w_ptr += 4 * cell_capacity<T>();
+             offset += 8;
+           }
+           w_ptr += 8 * d_in - 4 * cell_capacity<T>() * num_fko_iters_per_warp;
+           w_ptr_max += 8 * d_in;
+           offset += 8 * num_cells_k - 8 * num_fko_iters_per_warp;
+         }
+       }
+       cp_async::commit_group();
+     }
+
+ #pragma unroll 1
+     for (uint32_t iter = 0; iter < num_iterations; ++iter) {
+       const uint32_t stage_idx = iter % 2;
+       cp_async::wait_group<1>();
+       block.sync();
+
+       auto offset =
+           smem_t::get_permuted_offset<num_cells_k>(ty * 16 + tx % 16, tx / 16);
+ #pragma unroll
+       for (uint32_t fk = 0; fk < num_k_frags; ++fk) {
+         x_smem[stage_idx].ldmatrix_m8n8x4(offset, x_frag[fk]);
+         offset = (offset ^ 0x2) + (fk & 0x1) * 8;
+       }
+
+ #pragma unroll
+       for (uint32_t fn = 0; fn < num_blocks_n; ++fn) {
+         auto offset = smem_t::get_permuted_offset<num_cells_k>(
+             fn * 16 + 8 * (tx / 16) + tx % 8, (tx % 16) / 8);
+ #pragma unroll
+         for (uint32_t fk = 0; fk < num_k_frags; ++fk) {
+           w_smem[stage_idx].ldmatrix_m8n8x4(offset, w_frag[fk][fn]);
+           offset = (offset ^ 0x2) + (fk & 0x1) * 8;
+         }
+         offset += 16 * num_cells_k - 4 * num_k_frags;
+       }
+
+       // compute y_frag
+ #pragma unroll
+       for (uint32_t fk = 0; fk < num_k_frags; ++fk) {
+ #pragma unroll
+         for (uint32_t fn = 0; fn < num_blocks_n; ++fn) {
+           mma::mma_sync_m16n16k16_row_col_f16f16f32<T>(y_frag[fn], x_frag[fk],
+                                                        w_frag[fk][fn]);
+         }
+       }
+       block.sync();
+
+       // load next stage
+       if (iter + num_stages < num_iterations) {
+         uint32_t row_idx = s_start + (i * num_warps + ty) * 16 + tx / 4;
+         T* x_ptr = x + row_idx * d_in + chunk_start +
+                    (2 * num_k_frags * (iter + num_stages) + tx % 4) *
+                        cell_capacity<T>();
+         T* x_ptr_max = x + row_idx * d_in + min(d_in, chunk_start + chunk_size);
+         auto offset =
+             smem_t::get_permuted_offset<num_cells_k>(ty * 16 + tx / 4, tx % 4);
+         // pre-load x_smem, w_smem
+ #pragma unroll
+         for (uint32_t j = 0; j < 2; ++j) {
+ #pragma unroll
+           for (uint32_t fko = 0; fko < num_k_frags / 2; ++fko) {
+             x_smem[stage_idx].load_128b_async<fill_mode>(
+                 offset, x_ptr, row_idx < s_end && x_ptr < x_ptr_max);
+             x_ptr += 4 * cell_capacity<T>();
+             offset += 8;
+           }
+           row_idx += 8;
+           x_ptr += 8 * d_in - 2 * cell_capacity<T>() * num_k_frags;
+           x_ptr_max += 8 * d_in;
+           offset += 8 * num_cells_k - 4 * num_k_frags;
+         }
+         row_idx -= 8;
+
+         constexpr uint32_t num_fko_iters_per_warp =
+             num_k_frags / (num_warps * 2);
+ #pragma unroll
+         for (uint32_t fn = 0; fn < num_blocks_n; ++fn) {
+           T* w_ptr = w[problem_id] + layer_idx * d_in * d_out +
+                      (fn * 16 + tx / 4) * d_in + chunk_start +
+                      (2 * num_k_frags * (iter + num_stages) +
+                       ty * num_fko_iters_per_warp * 4 + tx % 4) *
+                          cell_capacity<T>();
+           T* w_ptr_max =
+               w[problem_id] + layer_idx * d_in * d_out +
+               min((fn * 16 + tx / 4 + 1) * d_in,
+                   (fn * 16 + tx / 4) * d_in + chunk_start + chunk_size);
+           auto offset = smem_t::get_permuted_offset<num_cells_k>(
+               fn * 16 + tx / 4, ty * num_fko_iters_per_warp * 4 + tx % 4);
+ #pragma unroll
+           for (uint32_t j = 0; j < 2; ++j) {
+ #pragma unroll
+             for (uint32_t fko = 0; fko < num_fko_iters_per_warp; ++fko) {
+               w_smem[stage_idx].load_128b_async<fill_mode>(offset, w_ptr,
+                                                            w_ptr < w_ptr_max);
+               w_ptr += 4 * cell_capacity<T>();
+               offset += 8;
+             }
+             w_ptr += 8 * d_in - 4 * cell_capacity<T>() * num_fko_iters_per_warp;
+             w_ptr_max += 8 * d_in;
+             offset += 8 * num_cells_k - 8 * num_fko_iters_per_warp;
+           }
+         }
+       }
+       cp_async::commit_group();
+     }
+     cp_async::wait_group<0>();
+     block.sync();
+
+     if constexpr (cooperative) {
+ #pragma unroll
+       for (uint32_t fn = 0; fn < num_blocks_n; ++fn) {
+         vec_t<float, 8>::memcpy(
+             tmp + (fn * grid.size() +
+                    (problem_id * num_chunks + bx) * block.num_threads() +
+                    block.thread_rank()) *
+                       8,
+             y_frag[fn]);
+       }
+       grid.sync();
+
+ #pragma unroll
+       for (uint32_t fn = 0; fn < num_blocks_n; ++fn) {
+ #pragma unroll
+         for (uint32_t reg_id = 0; reg_id < 8; ++reg_id) {
+           y_frag[fn][reg_id] = 0.f;
+         }
+         for (uint32_t chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
+           vec_t<float, 8> y_other;
+           y_other.load(tmp + (fn * grid.size() +
+                               (problem_id * num_chunks + chunk_idx) *
+                                   block.num_threads() +
+                               block.thread_rank()) *
+                                  8);
+ #pragma unroll
+           for (uint32_t reg_id = 0; reg_id < 8; ++reg_id) {
+             y_frag[fn][reg_id] += y_other[reg_id];
+           }
+         }
+       }
+     }
+
+     if (bx == 0) {
+       // store y_frag
+       auto offset =
+           smem_t::get_permuted_offset<num_cells_n>(ty * 16 + tx / 4, 0);
+ #pragma unroll
+       for (uint32_t fn = 0; fn < num_blocks_n; ++fn) {
+         vec_cast<T, float, 2>((T*)(y_smem.base + offset) + (tx % 4) * 2,
+                               &y_frag[fn][0]);
+         vec_cast<T, float, 2>(
+             (T*)(y_smem.base + offset + 8 * num_cells_n) + (tx % 4) * 2,
+             &y_frag[fn][2]);
+         vec_cast<T, float, 2>((T*)(y_smem.base + (offset ^ 0x1)) + (tx % 4) * 2,
+                               &y_frag[fn][4]);
+         vec_cast<T, float, 2>(
+             (T*)(y_smem.base + (offset ^ 0x1) + 8 * num_cells_n) + (tx % 4) * 2,
+             &y_frag[fn][6]);
+         offset = (offset ^ 0x2) + (fn & 0x1) * 8;
+       }
+
+       // store y
+       if constexpr (num_blocks_n == 1) {
+         uint32_t row_idx = s_start + (i * num_warps + ty) * 16 + tx / 2;
+         T* y_ptr = y + row_idx * d_out + (tx % 2) * cell_capacity<T>();
+         auto offset =
+             smem_t::get_permuted_offset<num_cells_n>(ty * 16 + tx / 2, tx % 2);
+         if (row_idx < s_end) {
+           y_smem.store_128b(offset, y_ptr);
+         }
+       } else {
+         uint32_t row_idx = s_start + (i * num_warps + ty) * 16 + tx / 4;
+         T* y_ptr = y + row_idx * d_out + (tx % 4) * cell_capacity<T>();
+         auto offset =
+             smem_t::get_permuted_offset<num_cells_n>(ty * 16 + tx / 4, tx % 4);
+ #pragma unroll
+         for (uint32_t j = 0; j < 2; ++j) {
+ #pragma unroll
+           for (uint32_t fno = 0; fno < num_blocks_n / 2; ++fno) {
+             if (row_idx < s_end) {
+               y_smem.store_128b(offset, y_ptr);
+             }
+             y_ptr += 4 * cell_capacity<T>();
+             offset += 8;
+           }
+           row_idx += 8;
+           y_ptr += 8 * d_out - 2 * num_blocks_n * cell_capacity<T>();
+           offset += 8 * num_cells_n - 4 * num_blocks_n;
+         }
+       }
+     }
+   }
+
+   // handle the case where one of the segments needs more steps than this one
+   // to avoid deadlock
+   if constexpr (cooperative) {
+     uint32_t max_segment_size = 0;
+     for (uint32_t i = 0; i < num_problems; ++i) {
+       max_segment_size = max(max_segment_size, s_ends[i] - s_starts[i]);
+     }
+
+     const uint32_t max_steps = (max_segment_size + (num_warps * 16 - 1)) / (num_warps * 16);
+     for (uint32_t i = 0; i < max_steps - num_steps; ++i) {
+       grid.sync();
+     }
+   }
+ }
+
+ }  // namespace sgmv
+ }  // namespace flashinfer
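In the cooperative path, each block computes the partial product for its K chunk, spills its `y` fragments to the float32 `tmp` buffer, grid-syncs, and then sums the partials of every chunk belonging to its problem; the trailing loop of extra `grid.sync()` calls keeps short segments participating in as many barriers as the longest segment, which is what the deadlock comment above refers to. A rough NumPy sketch of the split-K reduction semantics only (layouts simplified; the kernel itself stores W as `[d_out, d_in]` and accumulates in fp32):

```python
import numpy as np

def split_k_reference(x_seg, w, chunk_size=256):
    """Chunked (split-K) matmul: per-chunk partials are summed after a barrier.

    x_seg: [rows, d_in] segment of the input
    w:     [d_in, d_out] weight slice for this problem (simplified layout)
    """
    d_in = x_seg.shape[1]
    partials = []
    for start in range(0, d_in, chunk_size):
        stop = min(start + chunk_size, d_in)
        partials.append(
            x_seg[:, start:stop].astype(np.float32) @ w[start:stop].astype(np.float32)
        )
    return sum(partials)  # numerically equivalent to x_seg @ w accumulated in fp32
```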
tests/test_sgmv.py ADDED
@@ -0,0 +1,125 @@
+ from typing import List, Tuple
+
+ import pytest
+ import torch
+
+ from punica_sgmv import (
+     get_tmp_tensors,
+     lora_a_sgmv_cutlass,
+     lora_b_sgmv_cutlass,
+     pad_rank,
+     use_cutlass_shrink,
+ )
+
+
+ def lora_ref_impl(
+     y: torch.Tensor,
+     x: torch.Tensor,
+     wa: List[torch.Tensor],
+     wb: List[torch.Tensor],
+     s_start: torch.IntTensor,
+     s_end: torch.IntTensor,
+     layer_idx: int,
+     lora_rank: int,
+ ):
+     for i in range(len(wa)):
+         if s_end[i] - s_start[i] <= 0:
+             continue
+
+         xi = x[s_start[i]:s_end[i]]
+         wai = wa[i][layer_idx, :, :]
+         wbi = wb[i][layer_idx, :, :]
+
+         if not use_cutlass_shrink(lora_rank):
+             wai = wai.t()
+
+         yi = y[s_start[i]:s_end[i]]
+         tmp = xi @ wai
+         y[s_start[i]:s_end[i]] = yi + tmp @ wbi
+
+
+ @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+ @pytest.mark.parametrize("segments", [
+     ([0, 2], [1, 3]),
+     ([0, -1], [1, -1]),
+ ])
+ @pytest.mark.parametrize("lora_rank", [8, 16, 32, 64, 128])
+ def test_add_lora_sgmv(lora_rank: int, segments: Tuple[List[int], List[int]]):
+     torch.manual_seed(42)
+
+     B = 3
+     H = 1024
+     r = lora_rank
+     nlayers = 2
+
+     device = torch.device("cuda:0")
+
+     y = torch.zeros((B, H), dtype=torch.float16, device=device)
+     x = torch.randn((B, H), dtype=torch.float16, device=device)
+     wa = torch.randn(nlayers, r, H, dtype=torch.float16, device=device)
+     if use_cutlass_shrink(r):
+         # cutlass uses (H, r) layout
+         wa = wa.transpose(1, 2).contiguous()
+
+     # TODO(travis): transpose (r, H) -> (H, r) when not using cutlass
+     wb = torch.randn(nlayers, r, H, dtype=torch.float16, device=device)
+
+     s1, s2 = segments
+     s_start = torch.tensor(s1, dtype=torch.int32, device=device)
+     s_end = torch.tensor(s2, dtype=torch.int32, device=device)
+
+     wa_list = [wa if end - start > 0 else None for start, end in zip(s1, s2)]
+     wb_list = [wb if end - start > 0 else None for start, end in zip(s1, s2)]
+
+     wa_ptr = torch.tensor([wa.data_ptr() if wa is not None else 0 for wa in wa_list], dtype=torch.int64, device=device)
+     wb_ptr = torch.tensor([wb.data_ptr() if wb is not None else 0 for wb in wb_list], dtype=torch.int64, device=device)
+
+     layer_idx = 0
+
+     y_ref = y.clone()
+     lora_ref_impl(y_ref, x, wa_list, wb_list, s_start, s_end, layer_idx, r)
+
+     tmp_shrink, tmp_expand = get_tmp_tensors(wa_ptr.size(0), r, x.device)
+     y_ours = torch.zeros((B, H), dtype=torch.float16, device=device)
+
+     v = lora_a_sgmv_cutlass(x, tmp_shrink, wa_ptr, s_start, s_end, layer_idx, r)
+     lora_b_sgmv_cutlass(y_ours, v, tmp_expand, wb_ptr, s_start, s_end, layer_idx)
+
+     assert torch.allclose(y_ref, y_ours, rtol=1e-2, atol=1e-3)
+
+     # graph trace
+     tmp_shrink, tmp_expand = get_tmp_tensors(wa_ptr.size(0), r, x.device)
+     y_ours_graph = torch.zeros((B, H), dtype=torch.float16, device=device)
+
+     torch.cuda.synchronize(device)
+     graph = torch.cuda.CUDAGraph()
+     with torch.cuda.graph(graph, pool=None):
+         v = lora_a_sgmv_cutlass(x, tmp_shrink, wa_ptr, s_start, s_end, layer_idx, r)
+         lora_b_sgmv_cutlass(y_ours_graph, v, tmp_expand, wb_ptr, s_start, s_end, layer_idx)
+
+     torch.cuda.synchronize(device)
+     graph.replay()
+
+     assert torch.allclose(y_ours, y_ours_graph, rtol=1e-2, atol=1e-3)
+
+
+ @pytest.mark.parametrize("world_size", [1, 2, 4, 8])
+ @pytest.mark.parametrize("lora_rank", [8, 16, 32, 64, 128])
+ def test_pad_rank(lora_rank: int, world_size: int):
+     bs = 8
+     h = 1024
+     x = torch.randn((bs, h), dtype=torch.float16)
+
+     lora_a = torch.randn((h, lora_rank), dtype=torch.float16)
+     lora_b = torch.randn((lora_rank, h), dtype=torch.float16)
+
+     lora_a_padded = pad_rank(lora_a, dim=1, world_size=world_size)
+     lora_b_padded = pad_rank(lora_b, dim=0, world_size=world_size)
+
+     assert lora_a_padded.size(1) == lora_b_padded.size(0)
+     assert lora_a_padded.size(1) >= lora_a.size(1)
+     assert lora_b_padded.size(0) >= lora_b.size(0)
+
+     expected = x @ lora_a @ lora_b
+     actual = x @ lora_a_padded @ lora_b_padded
+     assert torch.allclose(expected, actual)
torch-ext/punica_sgmv/__init__.py ADDED
@@ -0,0 +1,172 @@
+ from typing import Optional, Tuple
+ from functools import lru_cache
+
+ import torch
+ import torch.nn.functional as F
+
+ from ._ops import ops
+
+ MIN_SGMV_RANK = 8
+ MIN_RANK_CUSTOM = 16
+ MAX_RANK_CUSTOM = 128
+ SGMV_BLOCK_SIZE = 16
+ BGMV_MAX_RANK = 128
+
+
+ def orient_for_rank(t: torch.Tensor, rank: int) -> torch.Tensor:
+     if MIN_RANK_CUSTOM <= rank <= MAX_RANK_CUSTOM:
+         return t.transpose(0, 1)
+     return t
+
+
+ def add_lora_sgmv_cutlass(
+     y: torch.Tensor,
+     x: torch.Tensor,
+     wa_ptr: torch.Tensor,
+     wb_ptr: torch.Tensor,
+     s_start: torch.Tensor,
+     s_end: torch.Tensor,
+     layer_idx: int,
+     lora_rank: int,
+ ):
+     """
+     Semantics:
+         y[s_start[i]:s_end[i]] += x[s_start[i]:s_end[i]] @ deref(wa_ptr[i]).T @ deref(wb_ptr[i])
+
+     Args:
+         y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
+         x: Shape: `[B, H1]`. Input vectors.
+         wa_ptr: Shape: `[S]`. DType: torch.int64. Pointers to the LoRA A weight matrices.\
+             Weight matrix shape: `[num_layers, R, H1]`.
+         wb_ptr: Shape: `[S]`. DType: torch.int64. Pointers to the LoRA B weight matrices.\
+             Weight matrix shape: `[num_layers, R, H2]`.
+         s_start: Shape: `[S]`, DType: torch.int32. Start index of each segment in the batch.
+         s_end: Shape: `[S]`, DType: torch.int32. End index (exclusive) of each segment in the batch.
+         layer_idx: Layer index of the weight matrices.
+     """
+     if lora_rank < MIN_RANK_CUSTOM or lora_rank > MAX_RANK_CUSTOM:
+         # The custom SGMV shrink kernel only supports ranks 16, 32, 64, 96 and 128;
+         # fall back to the CUTLASS-only path for other ranks.
+         _add_lora_sgmv_cutlass_legacy(y, x, wa_ptr, wb_ptr, s_start, s_end, layer_idx, lora_rank)
+         return
+
+     tmp1 = torch.empty((8 * 1024 * 1024,), dtype=torch.uint8, device=x.device)
+     tmp2_size = ops.sgmv_cutlass_tmp_size(wa_ptr.size(0))
+     tmp2 = torch.empty((tmp2_size,), dtype=torch.uint8, device=x.device)
+     v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)
+     ops.sgmv_shrink(v, x, wa_ptr, s_start, s_end, tmp1, layer_idx)
+     ops.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp2, layer_idx)
+
+
+ def _add_lora_sgmv_cutlass_legacy(
+     y: torch.Tensor,
+     x: torch.Tensor,
+     wa_ptr: torch.Tensor,
+     wb_ptr: torch.Tensor,
+     s_start: torch.IntTensor,
+     s_end: torch.IntTensor,
+     layer_idx: int,
+     lora_rank: int,
+ ):
+     tmp_size = ops.sgmv_cutlass_tmp_size(wa_ptr.size(0))
+     tmp = torch.empty((tmp_size,), dtype=torch.uint8, device=x.device)
+     v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)
+     ops.sgmv_cutlass(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
+     ops.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp, layer_idx)
+
+
+ def lora_a_sgmv_cutlass(
+     x: torch.Tensor,
+     tmp: torch.Tensor,
+     wa_ptr: torch.Tensor,
+     s_start: torch.IntTensor,
+     s_end: torch.IntTensor,
+     layer_idx: int,
+     lora_rank: int,
+ ) -> torch.Tensor:
+     v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)
+     if MIN_RANK_CUSTOM <= lora_rank <= MAX_RANK_CUSTOM:
+         ops.sgmv_shrink(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
+     else:
+         ops.sgmv_cutlass(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
+     return v
+
+
+ def lora_b_sgmv_cutlass(
+     y: torch.Tensor,
+     v: torch.Tensor,
+     tmp: torch.Tensor,
+     wb_ptr: torch.Tensor,
+     s_start: torch.IntTensor,
+     s_end: torch.IntTensor,
+     layer_idx: int,
+ ):
+     ops.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp, layer_idx)
+
+
+ def add_lora_a_bgmv(
+     v: torch.Tensor,
+     x: torch.Tensor,
+     wa_T_all: torch.Tensor,
+     indicies: torch.LongTensor,
+     layer_idx: int,
+ ):
+     ops.dispatch_bgmv(v, x, wa_T_all, indicies, layer_idx, 1.0)
+
+
+ def add_lora_b_bgmv(
+     y: torch.Tensor,
+     v: torch.Tensor,
+     wb_T_all: torch.Tensor,
+     indicies: torch.LongTensor,
+     layer_idx: int,
+ ):
+     ops.dispatch_bgmv(y, v, wb_T_all, indicies, layer_idx, 1.0)
+
+
+ def pad_rank(t: torch.Tensor, dim: int, world_size: int) -> torch.Tensor:
+     """Pad a tensor to the minimum rank for SGMV and the nearest multiple of the SGMV block size."""
+     # Tensor parallelism divides the effective rank by world_size,
+     # so scale the minimum rank up to offset that effect.
+     min_rank = MIN_SGMV_RANK * world_size
+     return pad_to_min_rank(t, dim, min_rank)
+
+
+ def pad_to_min_rank(t: torch.Tensor, dim: int, min_rank: int) -> torch.Tensor:
+     # If we're at or below the min rank, pad up to the min rank;
+     # otherwise, pad to the nearest multiple of the block size.
+     current_rank = t.size(dim)
+     target_rank = (
+         min_rank
+         if current_rank <= min_rank
+         else (current_rank + SGMV_BLOCK_SIZE - 1) // SGMV_BLOCK_SIZE * SGMV_BLOCK_SIZE
+     )
+     if current_rank == target_rank:
+         return t
+
+     pad_size = target_rank - current_rank
+
+     # See the (somewhat complicated) pad syntax here:
+     # https://pytorch.org/docs/stable/generated/torch.nn.functional.pad.html
+     pad = [0, 0] * t.dim()
+     pad[(t.dim() - dim - 1) * 2 + 1] = pad_size
+     pad = tuple(pad)
+
+     return F.pad(t, pad, mode="constant", value=0.0)
+
+
+ def use_cutlass_shrink(lora_rank: int) -> bool:
+     return lora_rank < MIN_RANK_CUSTOM
+
+
+ @lru_cache(maxsize=1)
+ def get_tmp_tensor(device: torch.device) -> torch.Tensor:
+     return torch.empty((8 * 1024 * 1024,), dtype=torch.uint8, device=device)
+
+
+ @lru_cache(maxsize=32)
+ def get_tmp_tensor_for_size(size: int, device: torch.device) -> torch.Tensor:
+     tmp_size = ops.sgmv_cutlass_tmp_size(size)
+     return torch.empty((tmp_size,), dtype=torch.uint8, device=device)
+
+
+ def get_tmp_expand_size(size: int) -> int:
+     return ops.sgmv_cutlass_tmp_size(size)
+
+
+ def get_tmp_tensors(nsegments: int, lora_rank: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
+     if use_cutlass_shrink(lora_rank):
+         tmp = get_tmp_tensor_for_size(nsegments, device)
+         return tmp, tmp
+     else:
+         tmp_shrink = get_tmp_tensor(device)
+         tmp_expand = get_tmp_tensor_for_size(nsegments, device)
+         return tmp_shrink, tmp_expand
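Putting the pieces together, a typical call site allocates the temporary buffers once, runs the shrink (`x @ A`) into `v`, and then the expand (`v @ B`) into `y`. A minimal usage sketch mirroring `tests/test_sgmv.py`; it assumes a CUDA device and the built `punica_sgmv` extension:

```python
import torch
from punica_sgmv import (
    get_tmp_tensors,
    lora_a_sgmv_cutlass,
    lora_b_sgmv_cutlass,
    use_cutlass_shrink,
)

device = torch.device("cuda:0")
B, H, r, layer_idx = 3, 1024, 16, 0

x = torch.randn((B, H), dtype=torch.float16, device=device)
y = torch.zeros((B, H), dtype=torch.float16, device=device)

wa = torch.randn(2, r, H, dtype=torch.float16, device=device)  # [num_layers, r, H]
if use_cutlass_shrink(r):
    wa = wa.transpose(1, 2).contiguous()  # ranks < 16 use the cutlass (H, r) layout
wb = torch.randn(2, r, H, dtype=torch.float16, device=device)  # [num_layers, r, H]

# One adapter applied to rows [0, 2) of the batch; pointer tensors index per-segment weights.
wa_ptr = torch.tensor([wa.data_ptr()], dtype=torch.int64, device=device)
wb_ptr = torch.tensor([wb.data_ptr()], dtype=torch.int64, device=device)
s_start = torch.tensor([0], dtype=torch.int32, device=device)
s_end = torch.tensor([2], dtype=torch.int32, device=device)

tmp_shrink, tmp_expand = get_tmp_tensors(wa_ptr.size(0), r, device)
v = lora_a_sgmv_cutlass(x, tmp_shrink, wa_ptr, s_start, s_end, layer_idx, r)
lora_b_sgmv_cutlass(y, v, tmp_expand, wb_ptr, s_start, s_end, layer_idx)
```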
torch-ext/torch_binding.cpp ADDED
@@ -0,0 +1,23 @@
+ #include <torch/library.h>
+
+ #include "registration.h"
+ #include "torch_binding.h"
+
+ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
+   ops.def("sgmv_shrink(Tensor! y, Tensor x, Tensor w_ptr, Tensor s_start, "
+           "Tensor s_end, Tensor! tmp, int layer_idx) -> ()");
+   ops.impl("sgmv_shrink", torch::kCUDA, &dispatch_sgmv_shrink);
+
+   ops.def("sgmv_cutlass(Tensor! y, Tensor x, Tensor w_ptr, Tensor s_start, "
+           "Tensor s_end, Tensor! tmp, int layer_idx) -> ()");
+   ops.impl("sgmv_cutlass", torch::kCUDA, &dispatch_sgmv_cutlass);
+
+   ops.def("sgmv_cutlass_tmp_size(int num_problems) -> int");
+   ops.impl("sgmv_cutlass_tmp_size", &sgmv_tmp_size);
+
+   ops.def("dispatch_bgmv(Tensor! y, Tensor x, Tensor w_ptr, Tensor indices, "
+           "int layer_idx, float scale) -> ()");
+   ops.impl("dispatch_bgmv", torch::kCUDA, &dispatch_bgmv);
+ }
+
+ REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
torch-ext/torch_binding.h ADDED
@@ -0,0 +1,16 @@
+ #pragma once
+
+ #include <torch/torch.h>
+
+ void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w_ptr,
+                    torch::Tensor indicies, int64_t layer_idx, double scale);
+
+ void dispatch_sgmv_cutlass(torch::Tensor y, torch::Tensor x, torch::Tensor w_ptr,
+                            torch::Tensor s_start, torch::Tensor s_end,
+                            torch::Tensor tmp, int64_t layer_idx);
+
+ void dispatch_sgmv_shrink(torch::Tensor y, torch::Tensor x, torch::Tensor w_ptr,
+                           torch::Tensor s_start, torch::Tensor s_end,
+                           torch::Tensor tmp, int64_t layer_idx);
+
+ int64_t sgmv_tmp_size(int64_t num_problems);