Commit cf68df1
Parent(s): 605f22e

feat: add cuda build

This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.
- build.toml +15 -1
- build/torch26-cxx11-cu118-x86_64-linux/activation/__init__.py +30 -0
- build/{torch26-cxx11-rocm62-x86_64-linux/activation/_activation_f3b99fb_dirty.abi3.so → torch26-cxx11-cu118-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so} +2 -2
- build/torch26-cxx11-cu118-x86_64-linux/activation/_ops.py +9 -0
- build/torch26-cxx11-cu118-x86_64-linux/activation/layers.py +46 -0
- build/torch26-cxx11-cu118-x86_64-linux/activation/poly_norm.py +41 -0
- build/torch26-cxx11-cu118-x86_64-linux/activation/rms_norm.py +34 -0
- build/torch26-cxx11-cu124-x86_64-linux/activation/__init__.py +30 -0
- build/{torch27-cxx11-rocm63-x86_64-linux/activation/_activation_f3b99fb_dirty.abi3.so → torch26-cxx11-cu124-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so} +2 -2
- build/torch26-cxx11-cu124-x86_64-linux/activation/_ops.py +9 -0
- build/torch26-cxx11-cu124-x86_64-linux/activation/layers.py +46 -0
- build/torch26-cxx11-cu124-x86_64-linux/activation/poly_norm.py +41 -0
- build/torch26-cxx11-cu124-x86_64-linux/activation/rms_norm.py +34 -0
- build/torch26-cxx11-cu126-x86_64-linux/activation/__init__.py +30 -0
- build/torch26-cxx11-cu126-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so +3 -0
- build/torch26-cxx11-cu126-x86_64-linux/activation/_ops.py +9 -0
- build/torch26-cxx11-cu126-x86_64-linux/activation/layers.py +46 -0
- build/torch26-cxx11-cu126-x86_64-linux/activation/poly_norm.py +41 -0
- build/torch26-cxx11-cu126-x86_64-linux/activation/rms_norm.py +34 -0
- build/torch26-cxx11-rocm62-x86_64-linux/activation/__init__.py +0 -0
- build/torch26-cxx11-rocm62-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so +3 -0
- build/torch26-cxx11-rocm62-x86_64-linux/activation/_ops.py +3 -3
- build/torch26-cxx11-rocm62-x86_64-linux/activation/layers.py +0 -0
- build/torch26-cxx11-rocm62-x86_64-linux/activation/poly_norm.py +0 -0
- build/torch26-cxx11-rocm62-x86_64-linux/activation/rms_norm.py +0 -0
- build/torch26-cxx98-cu118-x86_64-linux/activation/__init__.py +30 -0
- build/torch26-cxx98-cu118-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so +3 -0
- build/torch26-cxx98-cu118-x86_64-linux/activation/_ops.py +9 -0
- build/torch26-cxx98-cu118-x86_64-linux/activation/layers.py +46 -0
- build/torch26-cxx98-cu118-x86_64-linux/activation/poly_norm.py +41 -0
- build/torch26-cxx98-cu118-x86_64-linux/activation/rms_norm.py +34 -0
- build/torch26-cxx98-cu124-x86_64-linux/activation/__init__.py +30 -0
- build/torch26-cxx98-cu124-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so +3 -0
- build/torch26-cxx98-cu124-x86_64-linux/activation/_ops.py +9 -0
- build/torch26-cxx98-cu124-x86_64-linux/activation/layers.py +46 -0
- build/torch26-cxx98-cu124-x86_64-linux/activation/poly_norm.py +41 -0
- build/torch26-cxx98-cu124-x86_64-linux/activation/rms_norm.py +34 -0
- build/torch26-cxx98-cu126-x86_64-linux/activation/__init__.py +30 -0
- build/torch26-cxx98-cu126-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so +3 -0
- build/torch26-cxx98-cu126-x86_64-linux/activation/_ops.py +9 -0
- build/torch26-cxx98-cu126-x86_64-linux/activation/layers.py +46 -0
- build/torch26-cxx98-cu126-x86_64-linux/activation/poly_norm.py +41 -0
- build/torch26-cxx98-cu126-x86_64-linux/activation/rms_norm.py +34 -0
- build/torch27-cxx11-cu118-x86_64-linux/activation/__init__.py +30 -0
- build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so +3 -0
- build/torch27-cxx11-cu118-x86_64-linux/activation/_ops.py +9 -0
- build/torch27-cxx11-cu118-x86_64-linux/activation/layers.py +46 -0
- build/torch27-cxx11-cu118-x86_64-linux/activation/poly_norm.py +41 -0
- build/torch27-cxx11-cu118-x86_64-linux/activation/rms_norm.py +34 -0
- build/torch27-cxx11-cu126-x86_64-linux/activation/__init__.py +30 -0
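Each build directory name above encodes the Torch release, C++ ABI, accelerator toolkit, CPU architecture, and OS (for example torch26-cxx11-cu118-x86_64-linux). As a rough illustration only — this is a hypothetical helper inferred from the directory names, not the loader actually used by this repository — a consumer could reconstruct the matching variant name from its runtime environment like this:

import platform
import torch

def guess_variant_dir() -> str:
    # Hypothetical: rebuild a name such as "torch26-cxx11-cu118-x86_64-linux"
    # from the running environment, mirroring the directory naming above.
    torch_ver = "torch" + "".join(torch.__version__.split("+")[0].split(".")[:2])
    cxx_abi = "cxx11" if torch.compiled_with_cxx11_abi() else "cxx98"
    if torch.version.cuda is not None:
        toolkit = "cu" + torch.version.cuda.replace(".", "")
    elif torch.version.hip is not None:
        toolkit = "rocm" + "".join(torch.version.hip.split(".")[:2])
    else:
        raise RuntimeError("no CUDA or ROCm runtime found")
    return f"{torch_ver}-{cxx_abi}-{toolkit}-{platform.machine()}-{platform.system().lower()}"

print(guess_variant_dir())  # e.g. "torch26-cxx11-cu118-x86_64-linux"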
build.toml CHANGED

@@ -10,7 +10,7 @@ src = [
 
 [kernel.activation]
 backend = "rocm"
-rocm-archs = [ "gfx90a" ]
+rocm-archs = [ "gfx90a", "gfx942" ]
 src = [
   "activation/poly_norm.cu",
   "activation/rms_norm.cu",
@@ -21,3 +21,17 @@ src = [
   "activation/atomic_utils.h",
 ]
 depends = [ "torch" ]
+
+[kernel.activation_cuda]
+backend = "cuda"
+src = [
+  "activation/poly_norm.cu",
+  "activation/rms_norm.cu",
+  "activation/cuda_compat.h",
+  "activation/block_reduce.h",
+  "activation/dispatch_utils.h",
+  "activation/assert_utils.h",
+  "activation/atomic_utils.h",
+]
+depends = ["torch"]
build/torch26-cxx11-cu118-x86_64-linux/activation/__init__.py ADDED

@@ -0,0 +1,30 @@
+import torch
+
+from . import layers
+from ._ops import ops
+from .poly_norm import PolyNormFunction
+from .rms_norm import RMSNormFunction
+
+
+def poly_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float = 1e-6,
+) -> None:
+    return PolyNormFunction.apply(x, weight, bias, eps)
+
+
+def rms_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+) -> None:
+    return RMSNormFunction.apply(x, weight, eps)
+
+
+__all__ = [
+    "poly_norm",
+    "layers",
+    "ops",
+]
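A minimal sketch of exercising these module-level helpers once one of the prebuilt variants is importable as activation; the import name and the availability of a CUDA device are assumptions. Note that, despite the -> None annotations above, both helpers return the tensor produced by the underlying autograd Function.

import torch
import activation  # assumed import name for one of the prebuilt variants

x = torch.randn(8, 1024, device="cuda", dtype=torch.float32)
w3 = torch.full((3,), 1 / 3, device="cuda")  # PolyNorm weights
b = torch.zeros(1, device="cuda")            # PolyNorm bias
w = torch.ones(1024, device="cuda")          # RMSNorm weight

y_poly = activation.poly_norm(x, w3, b, eps=1e-6)
y_rms = activation.rms_norm(x, w, eps=1e-6)
print(y_poly.shape, y_rms.shape)  # both match x.shape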
build/{torch26-cxx11-rocm62-x86_64-linux/activation/_activation_f3b99fb_dirty.abi3.so → torch26-cxx11-cu118-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so} RENAMED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:b2fdb7378a1c907c3ff3ad0a5134a0a8ce4a464196404436470d7b4eb77ec305
+size 2957296
build/torch26-cxx11-cu118-x86_64-linux/activation/_ops.py ADDED

@@ -0,0 +1,9 @@
+import torch
+from . import _activation_605f22e_dirty
+ops = torch.ops._activation_605f22e_dirty
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_activation_605f22e_dirty::{op_name}"
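The ops handle exposes the compiled kernels through the torch.ops namespace, and add_op_namespace_prefix builds the fully qualified operator name for use with torch.library-style registration. A small hedged sketch — the import path is assumed, and the rms_norm calling convention (output written into a preallocated tensor) is inferred from the callers later in this diff rather than stated here:

import torch
from activation._ops import ops, add_op_namespace_prefix  # assumed import path

# Fully qualified operator name, e.g. "_activation_605f22e_dirty::rms_norm".
print(add_op_namespace_prefix("rms_norm"))

# Judging by rms_norm.py further down, the kernel fills a preallocated output.
x = torch.randn(4, 256, device="cuda")
weight = torch.ones(256, device="cuda")
out = torch.empty_like(x)
ops.rms_norm(out, x, weight, 1e-6)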
build/torch26-cxx11-cu118-x86_64-linux/activation/layers.py ADDED

@@ -0,0 +1,46 @@
+import torch
+import torch.nn as nn
+from torch.nn import init
+
+from .poly_norm import PolyNormFunction
+from .rms_norm import RMSNormFunction
+
+
+class PolyNorm(nn.Module):
+    def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
+        self.bias = torch.nn.Parameter(torch.zeros(1, dtype=dtype))
+        self.eps = eps
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ):
+        return PolyNormFunction.apply(x, self.weight, self.bias, self.eps)
+
+    def reset_parameters(self) -> None:
+        """
+        Resets parameters based on their initialization used in __init__.
+        """
+        init.ones_(self.weight)
+        init.zeros_(self.bias)
+
+
+class RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))
+        self.eps = eps
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ):
+        return RMSNormFunction.apply(x, self.weight, self.eps)
+
+    def reset_parameters(self) -> None:
+        """
+        Resets parameters based on their initialization used in __init__.
+        """
+        init.ones_(self.weight)
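A short sketch of using the nn.Module wrappers above; it assumes a CUDA device and a prebuilt variant importable as activation:

import torch
from activation.layers import PolyNorm, RMSNorm  # assumed import path

hidden = 1024
poly = PolyNorm(eps=1e-6).cuda()
rms = RMSNorm(dim=hidden, eps=1e-6).cuda()

x = torch.randn(2, 16, hidden, device="cuda", requires_grad=True)
y = rms(poly(x))      # both wrappers preserve the input shape
y.mean().backward()   # gradients flow through the custom autograd Functions
print(x.grad.shape)   # torch.Size([2, 16, 1024])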
build/torch26-cxx11-cu118-x86_64-linux/activation/poly_norm.py ADDED

@@ -0,0 +1,41 @@
+import torch
+
+from ._ops import ops
+
+
+# Inherit from Function
+class PolyNormFunction(torch.autograd.Function):
+    # Note that forward, setup_context, and backward are @staticmethods
+    @staticmethod
+    def forward(input, weight, bias, eps):
+        output = torch.empty_like(input)
+        ops.poly_norm(output, input, weight, bias, eps)
+        return output
+
+    @staticmethod
+    # inputs is a Tuple of all of the inputs passed to forward.
+    # output is the output of the forward().
+    def setup_context(ctx, inputs, output):
+        input, weight, bias, eps = inputs
+        ctx.save_for_backward(input, weight)
+        ctx.eps = eps
+
+    # This function has only a single output, so it gets only one gradient
+    @staticmethod
+    def backward(ctx, output_grad):
+        input, weight = ctx.saved_tensors
+        eps = ctx.eps
+
+        input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
+        weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
+        bias_grad = (
+            torch.empty(1, dtype=weight.dtype, device=weight.device)
+            if ctx.needs_input_grad[2]
+            else None
+        )
+
+        ops.poly_norm_backward(
+            input_grad, weight_grad, bias_grad, output_grad, input, weight, eps
+        )
+
+        return input_grad, weight_grad, bias_grad, None
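The Function uses the newer split forward/setup_context style, so it differentiates through the usual apply call. A minimal sketch of driving forward and backward directly — the import path and CUDA device are assumptions, and the dtypes the kernel supports are not stated in this diff:

import torch
from activation.poly_norm import PolyNormFunction  # assumed import path

x = torch.randn(4, 512, device="cuda", requires_grad=True)
weight = torch.full((3,), 1 / 3, device="cuda", requires_grad=True)
bias = torch.zeros(1, device="cuda", requires_grad=True)

out = PolyNormFunction.apply(x, weight, bias, 1e-6)
out.sum().backward()  # dispatches to ops.poly_norm_backward under the hood
print(x.grad.shape, weight.grad.shape, bias.grad.shape)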
build/torch26-cxx11-cu118-x86_64-linux/activation/rms_norm.py ADDED

@@ -0,0 +1,34 @@
+import torch
+
+from ._ops import ops
+
+
+# Inherit from Function
+class RMSNormFunction(torch.autograd.Function):
+    # Note that forward, setup_context, and backward are @staticmethods
+    @staticmethod
+    def forward(input, weight, eps):
+        output = torch.empty_like(input)
+        ops.rms_norm(output, input, weight, eps)
+        return output
+
+    @staticmethod
+    # inputs is a Tuple of all of the inputs passed to forward.
+    # output is the output of the forward().
+    def setup_context(ctx, inputs, output):
+        input, weight, eps = inputs
+        ctx.save_for_backward(input, weight)
+        ctx.eps = eps
+
+    # This function has only a single output, so it gets only one gradient
+    @staticmethod
+    def backward(ctx, output_grad):
+        input, weight = ctx.saved_tensors
+        eps = ctx.eps
+
+        input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
+        weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
+
+        ops.rms_norm_backward(input_grad, weight_grad, output_grad, input, weight, eps)
+
+        return input_grad, weight_grad, None
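RMS normalization has a standard closed form, so a plain-PyTorch reference makes a convenient parity check against the kernel. The reference below assumes the conventional definition (normalize by the root-mean-square over the last dimension, then scale by weight); whether the kernel matches it exactly, including dtype handling, would need to be verified against the CUDA sources, which are not part of this diff.

import torch
from activation import rms_norm  # assumed import name for a prebuilt variant

def rms_norm_ref(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Conventional RMSNorm: x / sqrt(mean(x^2) + eps) * weight over the last dim.
    rms = x.pow(2).mean(dim=-1, keepdim=True).add(eps).rsqrt()
    return x * rms * weight

x = torch.randn(8, 1024, device="cuda")
w = torch.randn(1024, device="cuda")
torch.testing.assert_close(rms_norm(x, w, 1e-6), rms_norm_ref(x, w, 1e-6), rtol=1e-4, atol=1e-4)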
build/torch26-cxx11-cu124-x86_64-linux/activation/__init__.py ADDED

@@ -0,0 +1,30 @@
(the 30 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/__init__.py above)

build/{torch27-cxx11-rocm63-x86_64-linux/activation/_activation_f3b99fb_dirty.abi3.so → torch26-cxx11-cu124-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so} RENAMED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:5baac6228e04fbb209cbc90a24702c14f4eb52d2698cea12a766d77412622096
+size 2981880

build/torch26-cxx11-cu124-x86_64-linux/activation/_ops.py ADDED

@@ -0,0 +1,9 @@
(the 9 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/_ops.py above)

build/torch26-cxx11-cu124-x86_64-linux/activation/layers.py ADDED

@@ -0,0 +1,46 @@
(the 46 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/layers.py above)

build/torch26-cxx11-cu124-x86_64-linux/activation/poly_norm.py ADDED

@@ -0,0 +1,41 @@
(the 41 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/poly_norm.py above)

build/torch26-cxx11-cu124-x86_64-linux/activation/rms_norm.py ADDED

@@ -0,0 +1,34 @@
(the 34 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/rms_norm.py above)
build/torch26-cxx11-cu126-x86_64-linux/activation/__init__.py ADDED

@@ -0,0 +1,30 @@
(the 30 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/__init__.py above)

build/torch26-cxx11-cu126-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d5c0095b931923008435d361c1871e97ff2ef04100e93205f09e65316f307f3
+size 2994704

build/torch26-cxx11-cu126-x86_64-linux/activation/_ops.py ADDED

@@ -0,0 +1,9 @@
(the 9 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/_ops.py above)

build/torch26-cxx11-cu126-x86_64-linux/activation/layers.py ADDED

@@ -0,0 +1,46 @@
(the 46 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/layers.py above)

build/torch26-cxx11-cu126-x86_64-linux/activation/poly_norm.py ADDED

@@ -0,0 +1,41 @@
(the 41 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/poly_norm.py above)

build/torch26-cxx11-cu126-x86_64-linux/activation/rms_norm.py ADDED

@@ -0,0 +1,34 @@
(the 34 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/rms_norm.py above)
build/torch26-cxx11-rocm62-x86_64-linux/activation/__init__.py CHANGED
File without changes

build/torch26-cxx11-rocm62-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c29fccf3f62ac3e3b7ff59e898d31ae38f3484bfe762f6767b8bc8cedf1af01
+size 2660632

build/torch26-cxx11-rocm62-x86_64-linux/activation/_ops.py CHANGED

@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _activation_605f22e_dirty
+ops = torch.ops._activation_605f22e_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_activation_605f22e_dirty::{op_name}"

build/torch26-cxx11-rocm62-x86_64-linux/activation/layers.py CHANGED
File without changes

build/torch26-cxx11-rocm62-x86_64-linux/activation/poly_norm.py CHANGED
File without changes

build/torch26-cxx11-rocm62-x86_64-linux/activation/rms_norm.py CHANGED
File without changes
build/torch26-cxx98-cu118-x86_64-linux/activation/__init__.py ADDED

@@ -0,0 +1,30 @@
(the 30 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/__init__.py above)

build/torch26-cxx98-cu118-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:176b8610ed2b9650c68347ec2f1d9e99b653170b4fd4f6f3540731f3fd78e98b
+size 2949936

build/torch26-cxx98-cu118-x86_64-linux/activation/_ops.py ADDED

@@ -0,0 +1,9 @@
(the 9 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/_ops.py above)

build/torch26-cxx98-cu118-x86_64-linux/activation/layers.py ADDED

@@ -0,0 +1,46 @@
(the 46 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/layers.py above)

build/torch26-cxx98-cu118-x86_64-linux/activation/poly_norm.py ADDED

@@ -0,0 +1,41 @@
(the 41 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/poly_norm.py above)

build/torch26-cxx98-cu118-x86_64-linux/activation/rms_norm.py ADDED

@@ -0,0 +1,34 @@
(the 34 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/rms_norm.py above)
build/torch26-cxx98-cu124-x86_64-linux/activation/__init__.py ADDED

@@ -0,0 +1,30 @@
(the 30 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/__init__.py above)

build/torch26-cxx98-cu124-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8075bbb5b339e0305d353003eb86a2b6a4d8a468907d821cefbed29e6e439c19
+size 2974640

build/torch26-cxx98-cu124-x86_64-linux/activation/_ops.py ADDED

@@ -0,0 +1,9 @@
(the 9 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/_ops.py above)

build/torch26-cxx98-cu124-x86_64-linux/activation/layers.py ADDED

@@ -0,0 +1,46 @@
(the 46 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/layers.py above)

build/torch26-cxx98-cu124-x86_64-linux/activation/poly_norm.py ADDED

@@ -0,0 +1,41 @@
(the 41 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/poly_norm.py above)

build/torch26-cxx98-cu124-x86_64-linux/activation/rms_norm.py ADDED

@@ -0,0 +1,34 @@
(the 34 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/rms_norm.py above)
build/torch26-cxx98-cu126-x86_64-linux/activation/__init__.py ADDED

@@ -0,0 +1,30 @@
(the 30 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/__init__.py above)

build/torch26-cxx98-cu126-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:614ef2bf9867f65bf8e09d861def1c554d384676aa58dfbfd73bf96241cb7171
+size 2987456

build/torch26-cxx98-cu126-x86_64-linux/activation/_ops.py ADDED

@@ -0,0 +1,9 @@
(the 9 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/_ops.py above)

build/torch26-cxx98-cu126-x86_64-linux/activation/layers.py ADDED

@@ -0,0 +1,46 @@
(the 46 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/layers.py above)

build/torch26-cxx98-cu126-x86_64-linux/activation/poly_norm.py ADDED

@@ -0,0 +1,41 @@
(the 41 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/poly_norm.py above)

build/torch26-cxx98-cu126-x86_64-linux/activation/rms_norm.py ADDED

@@ -0,0 +1,34 @@
(the 34 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/rms_norm.py above)
build/torch27-cxx11-cu118-x86_64-linux/activation/__init__.py ADDED

@@ -0,0 +1,30 @@
(the 30 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/__init__.py above)

build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:654d16d66565185dfd1a6f16e0b24d8fff83e12558c8862c322734e6b52e5cc0
+size 2957448

build/torch27-cxx11-cu118-x86_64-linux/activation/_ops.py ADDED

@@ -0,0 +1,9 @@
(the 9 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/_ops.py above)

build/torch27-cxx11-cu118-x86_64-linux/activation/layers.py ADDED

@@ -0,0 +1,46 @@
(the 46 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/layers.py above)

build/torch27-cxx11-cu118-x86_64-linux/activation/poly_norm.py ADDED

@@ -0,0 +1,41 @@
(the 41 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/poly_norm.py above)

build/torch27-cxx11-cu118-x86_64-linux/activation/rms_norm.py ADDED

@@ -0,0 +1,34 @@
(the 34 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/rms_norm.py above)
build/torch27-cxx11-cu126-x86_64-linux/activation/__init__.py ADDED

@@ -0,0 +1,30 @@
(the 30 added lines are identical to build/torch26-cxx11-cu118-x86_64-linux/activation/__init__.py above; the remaining files in this variant are omitted here by the 50-file limit of this view)