danieldk (HF Staff) committed
Commit 9ae1b46 · 1 Parent(s): e3200fb

Build (aarch64)

build/torch26-cxx11-cu126-aarch64-linux/punica_sgmv/__init__.py ADDED
@@ -0,0 +1,172 @@
+ from typing import Optional, Tuple
+ from functools import lru_cache
+
+ import torch
+ import torch.nn.functional as F
+
+ from ._ops import ops
+
+ MIN_SGMV_RANK = 8
+ MIN_RANK_CUSTOM = 16
+ MAX_RANK_CUSTOM = 128
+ SGMV_BLOCK_SIZE = 16
+ BGMV_MAX_RANK = 128
+
+ def orient_for_rank(t: torch.Tensor, rank: int) -> torch.Tensor:
+     if MIN_RANK_CUSTOM <= rank <= MAX_RANK_CUSTOM:
+         return t.transpose(0, 1)
+     return t
+
+ def add_lora_sgmv_cutlass(
+     y: torch.Tensor,
+     x: torch.Tensor,
+     wa_ptr: torch.Tensor,
+     wb_ptr: torch.Tensor,
+     s_start: torch.Tensor,
+     s_end: torch.Tensor,
+     layer_idx: int,
+     lora_rank: int,
+ ):
+     """
+     Semantics:
+         y[s[i]:s[i+1]] += x[s[i]:s[i+1]] @ deref(wa_ptr[i]).T @ deref(wb_ptr[i])
+
+     Args:
+         y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
+         x: Shape: `[B, H1]`. Input vectors.
+         wa_ptr: Shape: `[S]`. DType: torch.int64. Pointers to the weight matrices.\
+             Weight matrix shape: `[num_layers, R, H1]`.
+         wb_ptr: Shape: `[S]`. DType: torch.int64. Pointers to the weight matrices.\
+             Weight matrix shape: `[num_layers, R, H2]`.
+         s_start: Shape: `[S]`, DType: torch.int32. Segment start indices (indptr).
+         s_end: Shape: `[S]`, DType: torch.int32. Segment end indices (indptr).
+         layer_idx: Layer index of the weight matrices.
+     """
+     if lora_rank < MIN_RANK_CUSTOM or lora_rank > MAX_RANK_CUSTOM:
+         # The custom SGMV shrink kernel only supports ranks 16, 32, 64 and 128.
+         _add_lora_sgmv_cutlass_legacy(y, x, wa_ptr, wb_ptr, s_start, s_end, layer_idx, lora_rank)
+         return
+
+     tmp1 = torch.empty((8 * 1024 * 1024,), dtype=torch.uint8, device=x.device)
+     tmp2_size = ops.sgmv_cutlass_tmp_size(wa_ptr.size(0))
+     tmp2 = torch.empty((tmp2_size,), dtype=torch.uint8, device=x.device)
+     v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)
+     ops.sgmv_shrink(v, x, wa_ptr, s_start, s_end, tmp1, layer_idx)
+     ops.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp2, layer_idx)
+
+ def _add_lora_sgmv_cutlass_legacy(
+     y: torch.Tensor,
+     x: torch.Tensor,
+     wa_ptr: torch.Tensor,
+     wb_ptr: torch.Tensor,
+     s_start: torch.IntTensor,
+     s_end: torch.IntTensor,
+     layer_idx: int,
+     lora_rank: int,
+ ):
+     tmp_size = ops.sgmv_cutlass_tmp_size(wa_ptr.size(0))
+     tmp = torch.empty((tmp_size,), dtype=torch.uint8, device=x.device)
+     v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)
+     ops.sgmv_cutlass(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
+     ops.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp, layer_idx)
+
+ def lora_a_sgmv_cutlass(
+     x: torch.Tensor,
+     tmp: torch.Tensor,
+     wa_ptr: torch.Tensor,
+     s_start: torch.IntTensor,
+     s_end: torch.IntTensor,
+     layer_idx: int,
+     lora_rank: int,
+ ) -> torch.Tensor:
+     v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)
+     if MIN_RANK_CUSTOM <= lora_rank <= MAX_RANK_CUSTOM:
+         ops.sgmv_shrink(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
+     else:
+         ops.sgmv_cutlass(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
+     return v
+
+
+ def lora_b_sgmv_cutlass(
+     y: torch.Tensor,
+     v: torch.Tensor,
+     tmp: torch.Tensor,
+     wb_ptr: torch.Tensor,
+     s_start: torch.IntTensor,
+     s_end: torch.IntTensor,
+     layer_idx: int,
+ ):
+     ops.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp, layer_idx)
+
+ def add_lora_a_bgmv(
+     v: torch.Tensor,
+     x: torch.Tensor,
+     wa_T_all: torch.Tensor,
+     indicies: torch.LongTensor,
+     layer_idx: int,
+ ):
+     ops.dispatch_bgmv(v, x, wa_T_all, indicies, layer_idx, 1.0)
+
+
+ def add_lora_b_bgmv(
+     y: torch.Tensor,
+     v: torch.Tensor,
+     wb_T_all: torch.Tensor,
+     indicies: torch.LongTensor,
+     layer_idx: int,
+ ):
+     ops.dispatch_bgmv(y, v, wb_T_all, indicies, layer_idx, 1.0)
+
+
+ def pad_rank(t: torch.Tensor, dim: int, world_size: int) -> torch.Tensor:
+     """Pad a tensor to the minimum rank for SGMV and the nearest multiple of the SGMV block size."""
+     # Tensor parallelism divides the effective rank by world_size,
+     # so scale the minimum rank to offset that effect.
+     min_rank = MIN_SGMV_RANK * world_size
+     return pad_to_min_rank(t, dim, min_rank)
+
+ def pad_to_min_rank(t: torch.Tensor, dim: int, min_rank: int) -> torch.Tensor:
+     # If we're at or below the minimum rank, pad up to the minimum rank;
+     # otherwise, pad to the nearest multiple of the block size.
+     current_rank = t.size(dim)
+     target_rank = (
+         min_rank
+         if current_rank <= min_rank
+         else (current_rank + SGMV_BLOCK_SIZE - 1) // SGMV_BLOCK_SIZE * SGMV_BLOCK_SIZE
+     )
+     if current_rank == target_rank:
+         return t
+
+     pad_size = target_rank - current_rank
+
+     # See the complicated pad syntax here: https://pytorch.org/docs/stable/generated/torch.nn.functional.pad.html
+     pad = [0, 0] * t.dim()
+     pad[(t.dim() - dim - 1) * 2 + 1] = pad_size
+     pad = tuple(pad)
+
+     return F.pad(t, pad, mode="constant", value=0.0)
+
+ def use_cutlass_shrink(lora_rank: int) -> bool:
+     return lora_rank < MIN_RANK_CUSTOM
+
+ @lru_cache(maxsize=1)
+ def get_tmp_tensor(device: torch.device) -> torch.Tensor:
+     return torch.empty((8 * 1024 * 1024,), dtype=torch.uint8, device=device)
+
+ @lru_cache(maxsize=32)
+ def get_tmp_tensor_for_size(size: int, device: torch.device) -> torch.Tensor:
+     tmp_size = ops.sgmv_cutlass_tmp_size(size)
+     return torch.empty((tmp_size,), dtype=torch.uint8, device=device)
+
+ def get_tmp_expand_size(size: int) -> int:
+     return ops.sgmv_cutlass_tmp_size(size)
+
+
+ def get_tmp_tensors(nsegments: int, lora_rank: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
+     if use_cutlass_shrink(lora_rank):
+         tmp = get_tmp_tensor_for_size(nsegments, device)
+         return tmp, tmp
+     else:
+         tmp_shrink = get_tmp_tensor(device)
+         tmp_expand = get_tmp_tensor_for_size(nsegments, device)
+         return tmp_shrink, tmp_expand
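
A minimal pure-PyTorch restatement of the semantics documented in the `add_lora_sgmv_cutlass` docstring above may help readers. This sketch replaces the device-pointer arguments (`wa_ptr`, `wb_ptr`) with hypothetical Python lists of weight tensors; it is illustrative only, not the kernel's actual implementation:

import torch

def add_lora_sgmv_reference(y, x, wa, wb, s_start, s_end, layer_idx):
    # Hypothetical reference semantics, with wa/wb as lists of S tensors of
    # shape [num_layers, R, H1] / [num_layers, R, H2] instead of raw pointers.
    for i in range(len(wa)):
        lo, hi = int(s_start[i]), int(s_end[i])
        if lo >= hi:
            continue  # empty segment
        a = wa[i][layer_idx]  # [R, H1]
        b = wb[i][layer_idx]  # [R, H2]
        # Shrink to rank R, then expand: y[lo:hi] += x[lo:hi] @ a.T @ b
        y[lo:hi] += (x[lo:hi] @ a.T) @ b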
build/torch26-cxx11-cu126-aarch64-linux/punica_sgmv/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _punica_sgmv_ad0ac7e_dirty
+ ops = torch.ops._punica_sgmv_ad0ac7e_dirty
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix the op name with the kernel's private namespace.
+     """
+     return f"_punica_sgmv_ad0ac7e_dirty::{op_name}"
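
A brief usage sketch for `add_op_namespace_prefix`, assuming the built `punica_sgmv` package is on the import path; the `register_fake` call is an assumed use case for such a qualified name, not something this commit contains:

import torch
from punica_sgmv._ops import ops, add_op_namespace_prefix

qualname = add_op_namespace_prefix("sgmv_shrink")
print(qualname)  # _punica_sgmv_ad0ac7e_dirty::sgmv_shrink

# A qualified name like this can be used, e.g., to attach a meta/fake kernel
# for torch.compile tracing (assumed usage):
# @torch.library.register_fake(qualname)
# def _sgmv_shrink_fake(v, x, wa_ptr, s_start, s_end, tmp, layer_idx):
#     return None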
build/torch26-cxx11-cu126-aarch64-linux/punica_sgmv/_punica_sgmv_ad0ac7e_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9fb84288c2a868d46ec95e015ef56c1c661b46f9c8158dde3809569b973062af
+ size 14311192
build/torch26-cxx98-cu126-aarch64-linux/punica_sgmv/__init__.py ADDED
@@ -0,0 +1,172 @@
+ from typing import Optional, Tuple
+ from functools import lru_cache
+
+ import torch
+ import torch.nn.functional as F
+
+ from ._ops import ops
+
+ MIN_SGMV_RANK = 8
+ MIN_RANK_CUSTOM = 16
+ MAX_RANK_CUSTOM = 128
+ SGMV_BLOCK_SIZE = 16
+ BGMV_MAX_RANK = 128
+
+ def orient_for_rank(t: torch.Tensor, rank: int) -> torch.Tensor:
+     if MIN_RANK_CUSTOM <= rank <= MAX_RANK_CUSTOM:
+         return t.transpose(0, 1)
+     return t
+
+ def add_lora_sgmv_cutlass(
+     y: torch.Tensor,
+     x: torch.Tensor,
+     wa_ptr: torch.Tensor,
+     wb_ptr: torch.Tensor,
+     s_start: torch.Tensor,
+     s_end: torch.Tensor,
+     layer_idx: int,
+     lora_rank: int,
+ ):
+     """
+     Semantics:
+         y[s[i]:s[i+1]] += x[s[i]:s[i+1]] @ deref(wa_ptr[i]).T @ deref(wb_ptr[i])
+
+     Args:
+         y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
+         x: Shape: `[B, H1]`. Input vectors.
+         wa_ptr: Shape: `[S]`. DType: torch.int64. Pointers to the weight matrices.\
+             Weight matrix shape: `[num_layers, R, H1]`.
+         wb_ptr: Shape: `[S]`. DType: torch.int64. Pointers to the weight matrices.\
+             Weight matrix shape: `[num_layers, R, H2]`.
+         s_start: Shape: `[S]`, DType: torch.int32. Segment start indices (indptr).
+         s_end: Shape: `[S]`, DType: torch.int32. Segment end indices (indptr).
+         layer_idx: Layer index of the weight matrices.
+     """
+     if lora_rank < MIN_RANK_CUSTOM or lora_rank > MAX_RANK_CUSTOM:
+         # The custom SGMV shrink kernel only supports ranks 16, 32, 64 and 128.
+         _add_lora_sgmv_cutlass_legacy(y, x, wa_ptr, wb_ptr, s_start, s_end, layer_idx, lora_rank)
+         return
+
+     tmp1 = torch.empty((8 * 1024 * 1024,), dtype=torch.uint8, device=x.device)
+     tmp2_size = ops.sgmv_cutlass_tmp_size(wa_ptr.size(0))
+     tmp2 = torch.empty((tmp2_size,), dtype=torch.uint8, device=x.device)
+     v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)
+     ops.sgmv_shrink(v, x, wa_ptr, s_start, s_end, tmp1, layer_idx)
+     ops.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp2, layer_idx)
+
+ def _add_lora_sgmv_cutlass_legacy(
+     y: torch.Tensor,
+     x: torch.Tensor,
+     wa_ptr: torch.Tensor,
+     wb_ptr: torch.Tensor,
+     s_start: torch.IntTensor,
+     s_end: torch.IntTensor,
+     layer_idx: int,
+     lora_rank: int,
+ ):
+     tmp_size = ops.sgmv_cutlass_tmp_size(wa_ptr.size(0))
+     tmp = torch.empty((tmp_size,), dtype=torch.uint8, device=x.device)
+     v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)
+     ops.sgmv_cutlass(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
+     ops.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp, layer_idx)
+
+ def lora_a_sgmv_cutlass(
+     x: torch.Tensor,
+     tmp: torch.Tensor,
+     wa_ptr: torch.Tensor,
+     s_start: torch.IntTensor,
+     s_end: torch.IntTensor,
+     layer_idx: int,
+     lora_rank: int,
+ ) -> torch.Tensor:
+     v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)
+     if MIN_RANK_CUSTOM <= lora_rank <= MAX_RANK_CUSTOM:
+         ops.sgmv_shrink(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
+     else:
+         ops.sgmv_cutlass(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
+     return v
+
+
+ def lora_b_sgmv_cutlass(
+     y: torch.Tensor,
+     v: torch.Tensor,
+     tmp: torch.Tensor,
+     wb_ptr: torch.Tensor,
+     s_start: torch.IntTensor,
+     s_end: torch.IntTensor,
+     layer_idx: int,
+ ):
+     ops.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp, layer_idx)
+
+ def add_lora_a_bgmv(
+     v: torch.Tensor,
+     x: torch.Tensor,
+     wa_T_all: torch.Tensor,
+     indicies: torch.LongTensor,
+     layer_idx: int,
+ ):
+     ops.dispatch_bgmv(v, x, wa_T_all, indicies, layer_idx, 1.0)
+
+
+ def add_lora_b_bgmv(
+     y: torch.Tensor,
+     v: torch.Tensor,
+     wb_T_all: torch.Tensor,
+     indicies: torch.LongTensor,
+     layer_idx: int,
+ ):
+     ops.dispatch_bgmv(y, v, wb_T_all, indicies, layer_idx, 1.0)
+
+
+ def pad_rank(t: torch.Tensor, dim: int, world_size: int) -> torch.Tensor:
+     """Pad a tensor to the minimum rank for SGMV and the nearest multiple of the SGMV block size."""
+     # Tensor parallelism divides the effective rank by world_size,
+     # so scale the minimum rank to offset that effect.
+     min_rank = MIN_SGMV_RANK * world_size
+     return pad_to_min_rank(t, dim, min_rank)
+
+ def pad_to_min_rank(t: torch.Tensor, dim: int, min_rank: int) -> torch.Tensor:
+     # If we're at or below the minimum rank, pad up to the minimum rank;
+     # otherwise, pad to the nearest multiple of the block size.
+     current_rank = t.size(dim)
+     target_rank = (
+         min_rank
+         if current_rank <= min_rank
+         else (current_rank + SGMV_BLOCK_SIZE - 1) // SGMV_BLOCK_SIZE * SGMV_BLOCK_SIZE
+     )
+     if current_rank == target_rank:
+         return t
+
+     pad_size = target_rank - current_rank
+
+     # See the complicated pad syntax here: https://pytorch.org/docs/stable/generated/torch.nn.functional.pad.html
+     pad = [0, 0] * t.dim()
+     pad[(t.dim() - dim - 1) * 2 + 1] = pad_size
+     pad = tuple(pad)
+
+     return F.pad(t, pad, mode="constant", value=0.0)
+
+ def use_cutlass_shrink(lora_rank: int) -> bool:
+     return lora_rank < MIN_RANK_CUSTOM
+
+ @lru_cache(maxsize=1)
+ def get_tmp_tensor(device: torch.device) -> torch.Tensor:
+     return torch.empty((8 * 1024 * 1024,), dtype=torch.uint8, device=device)
+
+ @lru_cache(maxsize=32)
+ def get_tmp_tensor_for_size(size: int, device: torch.device) -> torch.Tensor:
+     tmp_size = ops.sgmv_cutlass_tmp_size(size)
+     return torch.empty((tmp_size,), dtype=torch.uint8, device=device)
+
+ def get_tmp_expand_size(size: int) -> int:
+     return ops.sgmv_cutlass_tmp_size(size)
+
+
+ def get_tmp_tensors(nsegments: int, lora_rank: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
+     if use_cutlass_shrink(lora_rank):
+         tmp = get_tmp_tensor_for_size(nsegments, device)
+         return tmp, tmp
+     else:
+         tmp_shrink = get_tmp_tensor(device)
+         tmp_expand = get_tmp_tensor_for_size(nsegments, device)
+         return tmp_shrink, tmp_expand
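
The padding rule in `pad_rank`/`pad_to_min_rank` above is easiest to see with concrete numbers. A small sketch, assuming the built `punica_sgmv` package is importable:

import torch
from punica_sgmv import pad_rank

lora_a = torch.randn(10, 4096)  # rank 10 along dim 0
# 10 > MIN_SGMV_RANK (8), so round up to the next multiple of SGMV_BLOCK_SIZE (16):
print(pad_rank(lora_a, dim=0, world_size=1).shape)  # torch.Size([16, 4096])

small = torch.randn(6, 4096)
# 6 <= 8, so pad up to the minimum rank:
print(pad_rank(small, dim=0, world_size=1).shape)   # torch.Size([8, 4096])
# With tensor parallelism the minimum scales: 8 * world_size = 16:
print(pad_rank(small, dim=0, world_size=2).shape)   # torch.Size([16, 4096])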
build/torch26-cxx98-cu126-aarch64-linux/punica_sgmv/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _punica_sgmv_ad0ac7e_dirty
+ ops = torch.ops._punica_sgmv_ad0ac7e_dirty
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix the op name with the kernel's private namespace.
+     """
+     return f"_punica_sgmv_ad0ac7e_dirty::{op_name}"
build/torch26-cxx98-cu126-aarch64-linux/punica_sgmv/_punica_sgmv_ad0ac7e_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:530a51beb6f591c58e8fe13afd427204bbf39572648fbc2befd5d5d63358b4cb
+ size 14307968
build/torch27-cxx11-cu126-aarch64-linux/punica_sgmv/__init__.py ADDED
@@ -0,0 +1,172 @@
+ from typing import Optional, Tuple
+ from functools import lru_cache
+
+ import torch
+ import torch.nn.functional as F
+
+ from ._ops import ops
+
+ MIN_SGMV_RANK = 8
+ MIN_RANK_CUSTOM = 16
+ MAX_RANK_CUSTOM = 128
+ SGMV_BLOCK_SIZE = 16
+ BGMV_MAX_RANK = 128
+
+ def orient_for_rank(t: torch.Tensor, rank: int) -> torch.Tensor:
+     if MIN_RANK_CUSTOM <= rank <= MAX_RANK_CUSTOM:
+         return t.transpose(0, 1)
+     return t
+
+ def add_lora_sgmv_cutlass(
+     y: torch.Tensor,
+     x: torch.Tensor,
+     wa_ptr: torch.Tensor,
+     wb_ptr: torch.Tensor,
+     s_start: torch.Tensor,
+     s_end: torch.Tensor,
+     layer_idx: int,
+     lora_rank: int,
+ ):
+     """
+     Semantics:
+         y[s[i]:s[i+1]] += x[s[i]:s[i+1]] @ deref(wa_ptr[i]).T @ deref(wb_ptr[i])
+
+     Args:
+         y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
+         x: Shape: `[B, H1]`. Input vectors.
+         wa_ptr: Shape: `[S]`. DType: torch.int64. Pointers to the weight matrices.\
+             Weight matrix shape: `[num_layers, R, H1]`.
+         wb_ptr: Shape: `[S]`. DType: torch.int64. Pointers to the weight matrices.\
+             Weight matrix shape: `[num_layers, R, H2]`.
+         s_start: Shape: `[S]`, DType: torch.int32. Segment start indices (indptr).
+         s_end: Shape: `[S]`, DType: torch.int32. Segment end indices (indptr).
+         layer_idx: Layer index of the weight matrices.
+     """
+     if lora_rank < MIN_RANK_CUSTOM or lora_rank > MAX_RANK_CUSTOM:
+         # The custom SGMV shrink kernel only supports ranks 16, 32, 64 and 128.
+         _add_lora_sgmv_cutlass_legacy(y, x, wa_ptr, wb_ptr, s_start, s_end, layer_idx, lora_rank)
+         return
+
+     tmp1 = torch.empty((8 * 1024 * 1024,), dtype=torch.uint8, device=x.device)
+     tmp2_size = ops.sgmv_cutlass_tmp_size(wa_ptr.size(0))
+     tmp2 = torch.empty((tmp2_size,), dtype=torch.uint8, device=x.device)
+     v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)
+     ops.sgmv_shrink(v, x, wa_ptr, s_start, s_end, tmp1, layer_idx)
+     ops.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp2, layer_idx)
+
+ def _add_lora_sgmv_cutlass_legacy(
+     y: torch.Tensor,
+     x: torch.Tensor,
+     wa_ptr: torch.Tensor,
+     wb_ptr: torch.Tensor,
+     s_start: torch.IntTensor,
+     s_end: torch.IntTensor,
+     layer_idx: int,
+     lora_rank: int,
+ ):
+     tmp_size = ops.sgmv_cutlass_tmp_size(wa_ptr.size(0))
+     tmp = torch.empty((tmp_size,), dtype=torch.uint8, device=x.device)
+     v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)
+     ops.sgmv_cutlass(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
+     ops.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp, layer_idx)
+
+ def lora_a_sgmv_cutlass(
+     x: torch.Tensor,
+     tmp: torch.Tensor,
+     wa_ptr: torch.Tensor,
+     s_start: torch.IntTensor,
+     s_end: torch.IntTensor,
+     layer_idx: int,
+     lora_rank: int,
+ ) -> torch.Tensor:
+     v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)
+     if MIN_RANK_CUSTOM <= lora_rank <= MAX_RANK_CUSTOM:
+         ops.sgmv_shrink(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
+     else:
+         ops.sgmv_cutlass(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
+     return v
+
+
+ def lora_b_sgmv_cutlass(
+     y: torch.Tensor,
+     v: torch.Tensor,
+     tmp: torch.Tensor,
+     wb_ptr: torch.Tensor,
+     s_start: torch.IntTensor,
+     s_end: torch.IntTensor,
+     layer_idx: int,
+ ):
+     ops.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp, layer_idx)
+
+ def add_lora_a_bgmv(
+     v: torch.Tensor,
+     x: torch.Tensor,
+     wa_T_all: torch.Tensor,
+     indicies: torch.LongTensor,
+     layer_idx: int,
+ ):
+     ops.dispatch_bgmv(v, x, wa_T_all, indicies, layer_idx, 1.0)
+
+
+ def add_lora_b_bgmv(
+     y: torch.Tensor,
+     v: torch.Tensor,
+     wb_T_all: torch.Tensor,
+     indicies: torch.LongTensor,
+     layer_idx: int,
+ ):
+     ops.dispatch_bgmv(y, v, wb_T_all, indicies, layer_idx, 1.0)
+
+
+ def pad_rank(t: torch.Tensor, dim: int, world_size: int) -> torch.Tensor:
+     """Pad a tensor to the minimum rank for SGMV and the nearest multiple of the SGMV block size."""
+     # Tensor parallelism divides the effective rank by world_size,
+     # so scale the minimum rank to offset that effect.
+     min_rank = MIN_SGMV_RANK * world_size
+     return pad_to_min_rank(t, dim, min_rank)
+
+ def pad_to_min_rank(t: torch.Tensor, dim: int, min_rank: int) -> torch.Tensor:
+     # If we're at or below the minimum rank, pad up to the minimum rank;
+     # otherwise, pad to the nearest multiple of the block size.
+     current_rank = t.size(dim)
+     target_rank = (
+         min_rank
+         if current_rank <= min_rank
+         else (current_rank + SGMV_BLOCK_SIZE - 1) // SGMV_BLOCK_SIZE * SGMV_BLOCK_SIZE
+     )
+     if current_rank == target_rank:
+         return t
+
+     pad_size = target_rank - current_rank
+
+     # See the complicated pad syntax here: https://pytorch.org/docs/stable/generated/torch.nn.functional.pad.html
+     pad = [0, 0] * t.dim()
+     pad[(t.dim() - dim - 1) * 2 + 1] = pad_size
+     pad = tuple(pad)
+
+     return F.pad(t, pad, mode="constant", value=0.0)
+
+ def use_cutlass_shrink(lora_rank: int) -> bool:
+     return lora_rank < MIN_RANK_CUSTOM
+
+ @lru_cache(maxsize=1)
+ def get_tmp_tensor(device: torch.device) -> torch.Tensor:
+     return torch.empty((8 * 1024 * 1024,), dtype=torch.uint8, device=device)
+
+ @lru_cache(maxsize=32)
+ def get_tmp_tensor_for_size(size: int, device: torch.device) -> torch.Tensor:
+     tmp_size = ops.sgmv_cutlass_tmp_size(size)
+     return torch.empty((tmp_size,), dtype=torch.uint8, device=device)
+
+ def get_tmp_expand_size(size: int) -> int:
+     return ops.sgmv_cutlass_tmp_size(size)
+
+
+ def get_tmp_tensors(nsegments: int, lora_rank: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
+     if use_cutlass_shrink(lora_rank):
+         tmp = get_tmp_tensor_for_size(nsegments, device)
+         return tmp, tmp
+     else:
+         tmp_shrink = get_tmp_tensor(device)
+         tmp_expand = get_tmp_tensor_for_size(nsegments, device)
+         return tmp_shrink, tmp_expand
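
Taken together, the helpers above suggest a two-pass call pattern: pick temporary buffers with `get_tmp_tensors`, shrink with `lora_a_sgmv_cutlass`, then expand into the output with `lora_b_sgmv_cutlass`. A hedged sketch follows; the pointer and segment tensors (`wa_ptr`, `wb_ptr`, `s_start`, `s_end`) are assumed to be prepared by the caller from the adapter weights, which this commit does not show:

import torch
from punica_sgmv import (
    get_tmp_tensors,
    lora_a_sgmv_cutlass,
    lora_b_sgmv_cutlass,
    use_cutlass_shrink,
)

device = torch.device("cuda")
B, H1, H2, rank, nsegments, layer_idx = 8, 4096, 4096, 16, 2, 0

x = torch.randn(B, H1, dtype=torch.float16, device=device)
y = torch.zeros(B, H2, dtype=torch.float16, device=device)

# Rank 16 lies in [MIN_RANK_CUSTOM, MAX_RANK_CUSTOM], so the custom shrink
# kernel is used and the two buffers differ:
assert not use_cutlass_shrink(rank)
tmp_shrink, tmp_expand = get_tmp_tensors(nsegments, rank, device)

# wa_ptr, wb_ptr, s_start, s_end elided: they hold the weights' data
# pointers and the per-segment token ranges.
# v = lora_a_sgmv_cutlass(x, tmp_shrink, wa_ptr, s_start, s_end, layer_idx, rank)
# lora_b_sgmv_cutlass(y, v, tmp_expand, wb_ptr, s_start, s_end, layer_idx)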
build/torch27-cxx11-cu126-aarch64-linux/punica_sgmv/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _punica_sgmv_ad0ac7e_dirty
+ ops = torch.ops._punica_sgmv_ad0ac7e_dirty
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix the op name with the kernel's private namespace.
+     """
+     return f"_punica_sgmv_ad0ac7e_dirty::{op_name}"
build/torch27-cxx11-cu126-aarch64-linux/punica_sgmv/_punica_sgmv_ad0ac7e_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9b1526b236ab1acc48ece52360332f9e7fbf261e18e4f700b404aa6dbe45240e
+ size 14311416
build/torch27-cxx11-cu128-aarch64-linux/punica_sgmv/__init__.py ADDED
@@ -0,0 +1,172 @@
+ from typing import Optional, Tuple
+ from functools import lru_cache
+
+ import torch
+ import torch.nn.functional as F
+
+ from ._ops import ops
+
+ MIN_SGMV_RANK = 8
+ MIN_RANK_CUSTOM = 16
+ MAX_RANK_CUSTOM = 128
+ SGMV_BLOCK_SIZE = 16
+ BGMV_MAX_RANK = 128
+
+ def orient_for_rank(t: torch.Tensor, rank: int) -> torch.Tensor:
+     if MIN_RANK_CUSTOM <= rank <= MAX_RANK_CUSTOM:
+         return t.transpose(0, 1)
+     return t
+
+ def add_lora_sgmv_cutlass(
+     y: torch.Tensor,
+     x: torch.Tensor,
+     wa_ptr: torch.Tensor,
+     wb_ptr: torch.Tensor,
+     s_start: torch.Tensor,
+     s_end: torch.Tensor,
+     layer_idx: int,
+     lora_rank: int,
+ ):
+     """
+     Semantics:
+         y[s[i]:s[i+1]] += x[s[i]:s[i+1]] @ deref(wa_ptr[i]).T @ deref(wb_ptr[i])
+
+     Args:
+         y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
+         x: Shape: `[B, H1]`. Input vectors.
+         wa_ptr: Shape: `[S]`. DType: torch.int64. Pointers to the weight matrices.\
+             Weight matrix shape: `[num_layers, R, H1]`.
+         wb_ptr: Shape: `[S]`. DType: torch.int64. Pointers to the weight matrices.\
+             Weight matrix shape: `[num_layers, R, H2]`.
+         s_start: Shape: `[S]`, DType: torch.int32. Segment start indices (indptr).
+         s_end: Shape: `[S]`, DType: torch.int32. Segment end indices (indptr).
+         layer_idx: Layer index of the weight matrices.
+     """
+     if lora_rank < MIN_RANK_CUSTOM or lora_rank > MAX_RANK_CUSTOM:
+         # The custom SGMV shrink kernel only supports ranks 16, 32, 64 and 128.
+         _add_lora_sgmv_cutlass_legacy(y, x, wa_ptr, wb_ptr, s_start, s_end, layer_idx, lora_rank)
+         return
+
+     tmp1 = torch.empty((8 * 1024 * 1024,), dtype=torch.uint8, device=x.device)
+     tmp2_size = ops.sgmv_cutlass_tmp_size(wa_ptr.size(0))
+     tmp2 = torch.empty((tmp2_size,), dtype=torch.uint8, device=x.device)
+     v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)
+     ops.sgmv_shrink(v, x, wa_ptr, s_start, s_end, tmp1, layer_idx)
+     ops.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp2, layer_idx)
+
+ def _add_lora_sgmv_cutlass_legacy(
+     y: torch.Tensor,
+     x: torch.Tensor,
+     wa_ptr: torch.Tensor,
+     wb_ptr: torch.Tensor,
+     s_start: torch.IntTensor,
+     s_end: torch.IntTensor,
+     layer_idx: int,
+     lora_rank: int,
+ ):
+     tmp_size = ops.sgmv_cutlass_tmp_size(wa_ptr.size(0))
+     tmp = torch.empty((tmp_size,), dtype=torch.uint8, device=x.device)
+     v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)
+     ops.sgmv_cutlass(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
+     ops.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp, layer_idx)
+
+ def lora_a_sgmv_cutlass(
+     x: torch.Tensor,
+     tmp: torch.Tensor,
+     wa_ptr: torch.Tensor,
+     s_start: torch.IntTensor,
+     s_end: torch.IntTensor,
+     layer_idx: int,
+     lora_rank: int,
+ ) -> torch.Tensor:
+     v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)
+     if MIN_RANK_CUSTOM <= lora_rank <= MAX_RANK_CUSTOM:
+         ops.sgmv_shrink(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
+     else:
+         ops.sgmv_cutlass(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
+     return v
+
+
+ def lora_b_sgmv_cutlass(
+     y: torch.Tensor,
+     v: torch.Tensor,
+     tmp: torch.Tensor,
+     wb_ptr: torch.Tensor,
+     s_start: torch.IntTensor,
+     s_end: torch.IntTensor,
+     layer_idx: int,
+ ):
+     ops.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp, layer_idx)
+
+ def add_lora_a_bgmv(
+     v: torch.Tensor,
+     x: torch.Tensor,
+     wa_T_all: torch.Tensor,
+     indicies: torch.LongTensor,
+     layer_idx: int,
+ ):
+     ops.dispatch_bgmv(v, x, wa_T_all, indicies, layer_idx, 1.0)
+
+
+ def add_lora_b_bgmv(
+     y: torch.Tensor,
+     v: torch.Tensor,
+     wb_T_all: torch.Tensor,
+     indicies: torch.LongTensor,
+     layer_idx: int,
+ ):
+     ops.dispatch_bgmv(y, v, wb_T_all, indicies, layer_idx, 1.0)
+
+
+ def pad_rank(t: torch.Tensor, dim: int, world_size: int) -> torch.Tensor:
+     """Pad a tensor to the minimum rank for SGMV and the nearest multiple of the SGMV block size."""
+     # Tensor parallelism divides the effective rank by world_size,
+     # so scale the minimum rank to offset that effect.
+     min_rank = MIN_SGMV_RANK * world_size
+     return pad_to_min_rank(t, dim, min_rank)
+
+ def pad_to_min_rank(t: torch.Tensor, dim: int, min_rank: int) -> torch.Tensor:
+     # If we're at or below the minimum rank, pad up to the minimum rank;
+     # otherwise, pad to the nearest multiple of the block size.
+     current_rank = t.size(dim)
+     target_rank = (
+         min_rank
+         if current_rank <= min_rank
+         else (current_rank + SGMV_BLOCK_SIZE - 1) // SGMV_BLOCK_SIZE * SGMV_BLOCK_SIZE
+     )
+     if current_rank == target_rank:
+         return t
+
+     pad_size = target_rank - current_rank
+
+     # See the complicated pad syntax here: https://pytorch.org/docs/stable/generated/torch.nn.functional.pad.html
+     pad = [0, 0] * t.dim()
+     pad[(t.dim() - dim - 1) * 2 + 1] = pad_size
+     pad = tuple(pad)
+
+     return F.pad(t, pad, mode="constant", value=0.0)
+
+ def use_cutlass_shrink(lora_rank: int) -> bool:
+     return lora_rank < MIN_RANK_CUSTOM
+
+ @lru_cache(maxsize=1)
+ def get_tmp_tensor(device: torch.device) -> torch.Tensor:
+     return torch.empty((8 * 1024 * 1024,), dtype=torch.uint8, device=device)
+
+ @lru_cache(maxsize=32)
+ def get_tmp_tensor_for_size(size: int, device: torch.device) -> torch.Tensor:
+     tmp_size = ops.sgmv_cutlass_tmp_size(size)
+     return torch.empty((tmp_size,), dtype=torch.uint8, device=device)
+
+ def get_tmp_expand_size(size: int) -> int:
+     return ops.sgmv_cutlass_tmp_size(size)
+
+
+ def get_tmp_tensors(nsegments: int, lora_rank: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
+     if use_cutlass_shrink(lora_rank):
+         tmp = get_tmp_tensor_for_size(nsegments, device)
+         return tmp, tmp
+     else:
+         tmp_shrink = get_tmp_tensor(device)
+         tmp_expand = get_tmp_tensor_for_size(nsegments, device)
+         return tmp_shrink, tmp_expand
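
One last sketch, for `orient_for_rank`: the custom shrink kernel expects the LoRA A matrix transposed, while the cutlass fallback takes it as-is. Assuming the built package is importable:

import torch
from punica_sgmv import orient_for_rank

wa = torch.randn(16, 4096)            # [R, H1] with R = 16
# Rank 16 takes the custom-kernel path, so the matrix is transposed:
print(orient_for_rank(wa, 16).shape)  # torch.Size([4096, 16])

wa_low = torch.randn(8, 4096)         # rank 8 falls back to cutlass
print(orient_for_rank(wa_low, 8).shape)  # torch.Size([8, 4096]) -- unchanged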
build/torch27-cxx11-cu128-aarch64-linux/punica_sgmv/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _punica_sgmv_ad0ac7e_dirty
+ ops = torch.ops._punica_sgmv_ad0ac7e_dirty
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix the op name with the kernel's private namespace.
+     """
+     return f"_punica_sgmv_ad0ac7e_dirty::{op_name}"
build/torch27-cxx11-cu128-aarch64-linux/punica_sgmv/_punica_sgmv_ad0ac7e_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1681b08e9b39010e07ed293cd48fa54910a8811c01bd06acc235742660efb766
+ size 22831040