Upload custom kernels
- build/torch-universal/triton_llama_attn/attn.py +46 -46
- torch-ext/triton_llama_attn/.pytest_cache/.gitignore +2 -0
- torch-ext/triton_llama_attn/.pytest_cache/CACHEDIR.TAG +4 -0
- torch-ext/triton_llama_attn/.pytest_cache/README.md +8 -0
- torch-ext/triton_llama_attn/.pytest_cache/v/cache/lastfailed +1 -0
- torch-ext/triton_llama_attn/.pytest_cache/v/cache/nodeids +3 -0
- torch-ext/triton_llama_attn/.pytest_cache/v/cache/stepwise +1 -0
- torch-ext/triton_llama_attn/__pycache__/__init__.cpython-310.pyc +0 -0
- torch-ext/triton_llama_attn/__pycache__/attn.cpython-310-pytest-8.3.5.pyc +0 -0
- torch-ext/triton_llama_attn/attn.py +47 -46
build/torch-universal/triton_llama_attn/attn.py (CHANGED)

@@ -144,7 +144,7 @@ def _attn_fwd_inner(acc, l_i, m_i, q, #
         if fp8_v:
             p = p.to(tl.float8e5)
         else:
-            p = p.to(tl.
+            p = p.to(tl.float32)
         acc = tl.dot(p, v, acc)
         # update m_i and l_i
         m_i = m_ij
@@ -344,7 +344,7 @@ def _attn_fwd_tma(sm_scale, M, #
                   FP8_OUTPUT: tl.constexpr, #
                   STAGE: tl.constexpr #
                   ):
-    dtype = tl.float8e5 if FP8_OUTPUT else tl.
+    dtype = tl.float8e5 if FP8_OUTPUT else tl.float32
     tl.static_assert(BLOCK_N <= HEAD_DIM)
     start_m = tl.program_id(0)
     off_hz = tl.program_id(1)
@@ -447,14 +447,14 @@ def _attn_bwd_dkdv(dk, dv, #
         do = tl.load(do_ptrs)
         # Compute dV.
         ppT = pT
-        ppT = ppT.to(tl.
+        ppT = ppT.to(tl.float32)
         dv += tl.dot(ppT, do)
         # D (= delta) is pre-divided by ds_scale.
         Di = tl.load(D + offs_m)
         # Compute dP and dS.
         dpT = tl.dot(v, tl.trans(do)).to(tl.float32)
         dsT = pT * (dpT - Di[None, :])
-        dsT = dsT.to(tl.
+        dsT = dsT.to(tl.float32)
         dk += tl.dot(dsT, tl.trans(qT))
         # Increment pointers.
         curr_m += step_m
@@ -500,7 +500,7 @@ def _attn_bwd_dq(dq, q, K, V, #
         # Compute dP and dS.
         dp = tl.dot(do, vT).to(tl.float32)
         ds = p * (dp - Di[:, None])
-        ds = ds.to(tl.
+        ds = ds.to(tl.float32)
         # Compute dQ.
         # NOTE: We need to de-scale dq in the end, because kT was pre-scaled.
         dq += tl.dot(ds, tl.trans(kT))
@@ -1106,44 +1106,44 @@ def attn_forward_kernel(
 
 # return is_close
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-#
-
-
-
-
-
-#
-
-
-
-
-
-#
-
-
-
-
-
-#
-
-
-#
-#
-
-
-
-
-
+attention = Attention.apply
+DEVICE = "cuda:0"
+
+import pytest
+@pytest.mark.parametrize("Z, H, N_CTX, HEAD_DIM", [(2, 32, 1024, 64)])
+@pytest.mark.parametrize("causal", [True])
+def test_op(Z, H, N_CTX, HEAD_DIM, causal, dtype=torch.float32):
+    torch.manual_seed(20)
+    q = (torch.empty((Z, H, N_CTX, HEAD_DIM), dtype=dtype, device=DEVICE).normal_(mean=0.0, std=0.5).requires_grad_())
+    k = (torch.empty((Z, H, N_CTX, HEAD_DIM), dtype=dtype, device=DEVICE).normal_(mean=0.0, std=0.5).requires_grad_())
+    v = (torch.empty((Z, H, N_CTX, HEAD_DIM), dtype=dtype, device=DEVICE).normal_(mean=0.0, std=0.5).requires_grad_())
+    sm_scale = 0.5
+    dout = torch.randn_like(q)
+    # reference implementation
+    M = torch.tril(torch.ones((N_CTX, N_CTX), device=DEVICE))
+    p = torch.matmul(q, k.transpose(2, 3)) * sm_scale
+    if causal:
+        p[:, :, M == 0] = float("-inf")
+    p = torch.softmax(p.float(), dim=-1)
+    # p = torch.exp(p)
+    ref_out = torch.matmul(p, v)
+    ref_out.backward(dout)
+    ref_dv, v.grad = v.grad.clone(), None
+    ref_dk, k.grad = k.grad.clone(), None
+    ref_dq, q.grad = q.grad.clone(), None
+    # triton implementation
+    tri_out = attention(q, k, v, causal, sm_scale)
+    tri_out.backward(dout)
+    tri_dv, v.grad = v.grad.clone(), None
+    tri_dk, k.grad = k.grad.clone(), None
+    tri_dq, q.grad = q.grad.clone(), None
+    # compare
+    assert torch.allclose(ref_out, tri_out, atol=1e-2, rtol=0)
+    rtol = 0.0
+    # Relative tolerance workaround for known hardware limitation of CDNA2 GPU.
+    # For details see https://pytorch.org/docs/stable/notes/numerical_accuracy.html#reduced-precision-fp16-and-bf16-gemms-and-convolutions-on-amd-instinct-mi200-devices
+    if torch.version.hip is not None and triton.runtime.driver.active.get_current_target().arch == "gfx90a":
+        rtol = 1e-2
+    assert torch.allclose(ref_dv, tri_dv, atol=1e-2, rtol=rtol)
+    assert torch.allclose(ref_dk, tri_dk, atol=1e-2, rtol=rtol)
+    assert torch.allclose(ref_dq, tri_dq, atol=1e-2, rtol=rtol)
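For context, the new test_op compares the Triton kernel against an eager PyTorch reference built from a causal mask, a scaled matmul, and a softmax. The sketch below restates that reference path as a standalone function; it is illustrative only (the function name and the masked_fill formulation are not part of the diff), but it follows the same masking and scaling convention as the test:

import torch

def reference_attention(q, k, v, sm_scale, causal=True):
    # q, k, v: (Z, H, N_CTX, HEAD_DIM) float32 tensors, as in test_op.
    n_ctx = q.shape[-2]
    # Scaled dot-product scores.
    p = torch.matmul(q, k.transpose(2, 3)) * sm_scale
    if causal:
        # Future positions are set to -inf before the softmax,
        # mirroring the torch.tril mask used in the test.
        mask = torch.tril(torch.ones(n_ctx, n_ctx, dtype=torch.bool, device=q.device))
        p = p.masked_fill(~mask, float("-inf"))
    p = torch.softmax(p.float(), dim=-1)
    return torch.matmul(p, v)

# Example with the shapes from the test parametrization:
# q = k = v = torch.randn(2, 32, 1024, 64, device="cuda:0")
# out = reference_attention(q, k, v, sm_scale=0.5)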
torch-ext/triton_llama_attn/.pytest_cache/.gitignore (ADDED)

@@ -0,0 +1,2 @@
+# Created by pytest automatically.
+*
torch-ext/triton_llama_attn/.pytest_cache/CACHEDIR.TAG (ADDED)

@@ -0,0 +1,4 @@
+Signature: 8a477f597d28d172789f06886806bc55
+# This file is a cache directory tag created by pytest.
+# For information about cache directory tags, see:
+# https://bford.info/cachedir/spec.html
torch-ext/triton_llama_attn/.pytest_cache/README.md (ADDED)

@@ -0,0 +1,8 @@
+# pytest cache directory #
+
+This directory contains data from the pytest's cache plugin,
+which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
+
+**Do not** commit this to version control.
+
+See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
torch-ext/triton_llama_attn/.pytest_cache/v/cache/lastfailed (ADDED)

@@ -0,0 +1 @@
+{}
torch-ext/triton_llama_attn/.pytest_cache/v/cache/nodeids (ADDED)

@@ -0,0 +1,3 @@
+[
+  "attn.py::test_op[True-2-32-1024-64]"
+]
torch-ext/triton_llama_attn/.pytest_cache/v/cache/stepwise (ADDED)

@@ -0,0 +1 @@
+[]
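The cache files above record a local pytest run that collected attn.py::test_op[True-2-32-1024-64]; the bundled README notes that this directory is not meant to be committed. For reference, a run like the following regenerates equivalent cache state (a sketch: it assumes the working directory is torch-ext/triton_llama_attn and that a CUDA device is available):

import pytest

# Select only the attention test; pytest recreates .pytest_cache/
# (lastfailed, nodeids, stepwise) as a side effect of the run.
pytest.main(["attn.py::test_op", "-q"])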
torch-ext/triton_llama_attn/__pycache__/__init__.cpython-310.pyc (ADDED, binary file, 258 Bytes)

torch-ext/triton_llama_attn/__pycache__/attn.cpython-310-pytest-8.3.5.pyc (ADDED, binary file, 29.6 kB)
torch-ext/triton_llama_attn/attn.py (CHANGED)

@@ -144,7 +144,7 @@ def _attn_fwd_inner(acc, l_i, m_i, q, #
         if fp8_v:
             p = p.to(tl.float8e5)
         else:
-            p = p.to(tl.
+            p = p.to(tl.float32)
         acc = tl.dot(p, v, acc)
         # update m_i and l_i
         m_i = m_ij
@@ -344,7 +344,7 @@ def _attn_fwd_tma(sm_scale, M, #
                   FP8_OUTPUT: tl.constexpr, #
                   STAGE: tl.constexpr #
                   ):
-    dtype = tl.float8e5 if FP8_OUTPUT else tl.
+    dtype = tl.float8e5 if FP8_OUTPUT else tl.float32
     tl.static_assert(BLOCK_N <= HEAD_DIM)
     start_m = tl.program_id(0)
     off_hz = tl.program_id(1)
@@ -447,14 +447,14 @@ def _attn_bwd_dkdv(dk, dv, #
         do = tl.load(do_ptrs)
         # Compute dV.
         ppT = pT
-        ppT = ppT.to(tl.
+        ppT = ppT.to(tl.float32)
         dv += tl.dot(ppT, do)
         # D (= delta) is pre-divided by ds_scale.
         Di = tl.load(D + offs_m)
         # Compute dP and dS.
         dpT = tl.dot(v, tl.trans(do)).to(tl.float32)
         dsT = pT * (dpT - Di[None, :])
-        dsT = dsT.to(tl.
+        dsT = dsT.to(tl.float32)
         dk += tl.dot(dsT, tl.trans(qT))
         # Increment pointers.
         curr_m += step_m
@@ -500,7 +500,7 @@ def _attn_bwd_dq(dq, q, K, V, #
         # Compute dP and dS.
         dp = tl.dot(do, vT).to(tl.float32)
         ds = p * (dp - Di[:, None])
-        ds = ds.to(tl.
+        ds = ds.to(tl.float32)
         # Compute dQ.
         # NOTE: We need to de-scale dq in the end, because kT was pre-scaled.
         dq += tl.dot(ds, tl.trans(kT))
@@ -967,6 +967,7 @@ def attn_forward_kernel(
     scaling: float,
     causal: bool,
 ):
+    print("######################### attn_forward_kernel", query.shape, key.shape, value.shape, scaling, causal)
     return Attention.apply(query, key, value, causal, scaling)
 
 # def test_llama_attention_output():
@@ -1105,44 +1106,44 @@ def attn_forward_kernel(
 
 # return is_close
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-#
-
-
-
-
-
-#
-
-
-
-
-
-#
-
-
-
-
-
-#
-
-
-#
-#
-
-
-
-
-
+attention = Attention.apply
+DEVICE = "cuda:0"
+
+import pytest
+@pytest.mark.parametrize("Z, H, N_CTX, HEAD_DIM", [(2, 32, 1024, 64)])
+@pytest.mark.parametrize("causal", [True])
+def test_op(Z, H, N_CTX, HEAD_DIM, causal, dtype=torch.float32):
+    torch.manual_seed(20)
+    q = (torch.empty((Z, H, N_CTX, HEAD_DIM), dtype=dtype, device=DEVICE).normal_(mean=0.0, std=0.5).requires_grad_())
+    k = (torch.empty((Z, H, N_CTX, HEAD_DIM), dtype=dtype, device=DEVICE).normal_(mean=0.0, std=0.5).requires_grad_())
+    v = (torch.empty((Z, H, N_CTX, HEAD_DIM), dtype=dtype, device=DEVICE).normal_(mean=0.0, std=0.5).requires_grad_())
+    sm_scale = 0.5
+    dout = torch.randn_like(q)
+    # reference implementation
+    M = torch.tril(torch.ones((N_CTX, N_CTX), device=DEVICE))
+    p = torch.matmul(q, k.transpose(2, 3)) * sm_scale
+    if causal:
+        p[:, :, M == 0] = float("-inf")
+    p = torch.softmax(p.float(), dim=-1)
+    # p = torch.exp(p)
+    ref_out = torch.matmul(p, v)
+    ref_out.backward(dout)
+    ref_dv, v.grad = v.grad.clone(), None
+    ref_dk, k.grad = k.grad.clone(), None
+    ref_dq, q.grad = q.grad.clone(), None
+    # triton implementation
+    tri_out = attention(q, k, v, causal, sm_scale)
+    tri_out.backward(dout)
+    tri_dv, v.grad = v.grad.clone(), None
+    tri_dk, k.grad = k.grad.clone(), None
+    tri_dq, q.grad = q.grad.clone(), None
+    # compare
+    assert torch.allclose(ref_out, tri_out, atol=1e-2, rtol=0)
+    rtol = 0.0
+    # Relative tolerance workaround for known hardware limitation of CDNA2 GPU.
+    # For details see https://pytorch.org/docs/stable/notes/numerical_accuracy.html#reduced-precision-fp16-and-bf16-gemms-and-convolutions-on-amd-instinct-mi200-devices
+    if torch.version.hip is not None and triton.runtime.driver.active.get_current_target().arch == "gfx90a":
+        rtol = 1e-2
+    assert torch.allclose(ref_dv, tri_dv, atol=1e-2, rtol=rtol)
+    assert torch.allclose(ref_dk, tri_dk, atol=1e-2, rtol=rtol)
+    assert torch.allclose(ref_dq, tri_dq, atol=1e-2, rtol=rtol)
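For reference, the attn_forward_kernel entry point touched in the hunk at 967 forwards straight to Attention.apply, so it can be driven the same way the new test drives the autograd function. A minimal call sketch follows (shapes and dtype mirror test_op; the keyword names follow the signature shown in the hunk, while the import path and helper name are assumptions, not part of the diff):

import torch
# from triton_llama_attn import attn_forward_kernel  # import path assumed

def smoke_test(attn_forward_kernel, device="cuda:0"):
    # Same shape as the test parametrization: (Z, H, N_CTX, HEAD_DIM).
    Z, H, N_CTX, HEAD_DIM = 2, 32, 1024, 64
    q, k, v = (torch.randn(Z, H, N_CTX, HEAD_DIM, dtype=torch.float32,
                           device=device, requires_grad=True)
               for _ in range(3))
    # scaling plays the role of sm_scale in the test; causal=True matches it.
    out = attn_forward_kernel(q, k, v, scaling=0.5, causal=True)
    out.backward(torch.randn_like(out))
    return out, q.grad, k.grad, v.grad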