zaydzuhri committed (verified)
Commit 4135502 · 1 parent: 0298ad2

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. fla/__pycache__/__init__.cpython-311.pyc +0 -0
  2. fla/__pycache__/utils.cpython-311.pyc +0 -0
  3. fla/layers/__init__.py +44 -0
  4. fla/layers/__pycache__/__init__.cpython-311.pyc +0 -0
  5. fla/layers/__pycache__/abc.cpython-311.pyc +0 -0
  6. fla/layers/__pycache__/attn.cpython-311.pyc +0 -0
  7. fla/layers/__pycache__/based.cpython-311.pyc +0 -0
  8. fla/layers/__pycache__/bitattn.cpython-311.pyc +0 -0
  9. fla/layers/__pycache__/delta_net.cpython-311.pyc +0 -0
  10. fla/layers/__pycache__/forgetting_attn.cpython-311.pyc +0 -0
  11. fla/layers/__pycache__/gated_deltanet.cpython-311.pyc +0 -0
  12. fla/layers/__pycache__/gated_deltaproduct.cpython-311.pyc +0 -0
  13. fla/layers/__pycache__/gla.cpython-311.pyc +0 -0
  14. fla/layers/__pycache__/gsa.cpython-311.pyc +0 -0
  15. fla/layers/__pycache__/hgrn.cpython-311.pyc +0 -0
  16. fla/layers/__pycache__/hgrn2.cpython-311.pyc +0 -0
  17. fla/layers/__pycache__/lightnet.cpython-311.pyc +0 -0
  18. fla/layers/__pycache__/linear_attn.cpython-311.pyc +0 -0
  19. fla/layers/__pycache__/multiscale_retention.cpython-311.pyc +0 -0
  20. fla/layers/__pycache__/nsa.cpython-311.pyc +0 -0
  21. fla/layers/__pycache__/rebased.cpython-311.pyc +0 -0
  22. fla/layers/__pycache__/rwkv6.cpython-311.pyc +0 -0
  23. fla/layers/__pycache__/rwkv7.cpython-311.pyc +0 -0
  24. fla/layers/abc.py +218 -0
  25. fla/layers/attn.py +222 -0
  26. fla/layers/based.py +96 -0
  27. fla/layers/bitattn.py +192 -0
  28. fla/layers/delta_net.py +291 -0
  29. fla/layers/forgetting_attn.py +109 -0
  30. fla/layers/gated_deltanet.py +293 -0
  31. fla/layers/gated_deltaproduct.py +351 -0
  32. fla/layers/gsa.py +227 -0
  33. fla/layers/hgrn.py +168 -0
  34. fla/layers/hgrn2.py +211 -0
  35. fla/layers/lightnet.py +210 -0
  36. fla/layers/linear_attn.py +166 -0
  37. fla/layers/multiscale_retention.py +298 -0
  38. fla/layers/nsa.py +138 -0
  39. fla/layers/rebased.py +133 -0
  40. fla/layers/rwkv6.py +307 -0
  41. fla/layers/simple_gla.py +261 -0
  42. fla/ops/__init__.py +46 -0
  43. fla/ops/attn/__init__.py +17 -0
  44. fla/ops/attn/__pycache__/__init__.cpython-311.pyc +0 -0
  45. fla/ops/attn/__pycache__/naive.cpython-311.pyc +0 -0
  46. fla/ops/attn/__pycache__/naive_rectified.cpython-311.pyc +0 -0
  47. fla/ops/attn/__pycache__/parallel.cpython-311.pyc +0 -0
  48. fla/ops/attn/naive.py +28 -0
  49. fla/ops/attn/naive_rectified.py +30 -0
  50. fla/ops/attn/naive_softpick.py +39 -0
fla/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (2.33 kB).

fla/__pycache__/utils.cpython-311.pyc ADDED
Binary file (13.8 kB).
 
fla/layers/__init__.py ADDED
@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

from .abc import ABCAttention
from .attn import Attention
from .based import BasedLinearAttention
from .bitattn import BitAttention
from .delta_net import DeltaNet
from .forgetting_attn import ForgettingAttention
from .gated_deltanet import GatedDeltaNet
from .gated_deltaproduct import GatedDeltaProduct
from .gla import GatedLinearAttention
from .gsa import GatedSlotAttention
from .hgrn import HGRNAttention
from .hgrn2 import HGRN2Attention
from .lightnet import LightNetAttention
from .linear_attn import LinearAttention
from .multiscale_retention import MultiScaleRetention
from .nsa import NativeSparseAttention
from .rebased import ReBasedLinearAttention
from .rwkv6 import RWKV6Attention
from .rwkv7 import RWKV7Attention

__all__ = [
    'ABCAttention',
    'Attention',
    'BasedLinearAttention',
    'BitAttention',
    'DeltaNet',
    'ForgettingAttention',
    'GatedDeltaNet',
    'GatedDeltaProduct',
    'GatedLinearAttention',
    'GatedSlotAttention',
    'HGRNAttention',
    'HGRN2Attention',
    'LightNetAttention',
    'LinearAttention',
    'MultiScaleRetention',
    'NativeSparseAttention',
    'ReBasedLinearAttention',
    'RWKV6Attention',
    'RWKV7Attention',
]
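For orientation, a minimal usage sketch of these exports (not part of the commit): it assumes the `fla` package from this repository is installed together with Triton and a CUDA device, since most layer kernels are GPU-only, and the sizes below are arbitrary.

import torch
from fla.layers import DeltaNet

# hypothetical smoke test; every layer above follows the same
# (hidden_states) -> (output, attentions, past_key_values) forward contract
layer = DeltaNet(hidden_size=1024, num_heads=4, layer_idx=0).cuda().to(torch.bfloat16)
x = torch.randn(2, 128, 1024, device='cuda', dtype=torch.bfloat16)
o, _, _ = layer(x)
print(o.shape)  # torch.Size([2, 128, 1024])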
fla/layers/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.5 kB).

fla/layers/__pycache__/abc.cpython-311.pyc ADDED
Binary file (9.78 kB).

fla/layers/__pycache__/attn.cpython-311.pyc ADDED
Binary file (11.5 kB).

fla/layers/__pycache__/based.cpython-311.pyc ADDED
Binary file (6.91 kB).

fla/layers/__pycache__/bitattn.cpython-311.pyc ADDED
Binary file (9.62 kB).

fla/layers/__pycache__/delta_net.cpython-311.pyc ADDED
Binary file (13.1 kB).

fla/layers/__pycache__/forgetting_attn.cpython-311.pyc ADDED
Binary file (5.47 kB).

fla/layers/__pycache__/gated_deltanet.cpython-311.pyc ADDED
Binary file (13.9 kB).

fla/layers/__pycache__/gated_deltaproduct.cpython-311.pyc ADDED
Binary file (16.3 kB).

fla/layers/__pycache__/gla.cpython-311.pyc ADDED
Binary file (13.7 kB).

fla/layers/__pycache__/gsa.cpython-311.pyc ADDED
Binary file (10.3 kB).

fla/layers/__pycache__/hgrn.cpython-311.pyc ADDED
Binary file (7.23 kB).

fla/layers/__pycache__/hgrn2.cpython-311.pyc ADDED
Binary file (9.09 kB).

fla/layers/__pycache__/lightnet.cpython-311.pyc ADDED
Binary file (9.33 kB).

fla/layers/__pycache__/linear_attn.cpython-311.pyc ADDED
Binary file (7.97 kB).

fla/layers/__pycache__/multiscale_retention.cpython-311.pyc ADDED
Binary file (13 kB).

fla/layers/__pycache__/nsa.cpython-311.pyc ADDED
Binary file (6.73 kB).

fla/layers/__pycache__/rebased.cpython-311.pyc ADDED
Binary file (7.18 kB).

fla/layers/__pycache__/rwkv6.cpython-311.pyc ADDED
Binary file (15.6 kB).

fla/layers/__pycache__/rwkv7.cpython-311.pyc ADDED
Binary file (11 kB).
 
fla/layers/abc.py ADDED
@@ -0,0 +1,218 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

from __future__ import annotations

import warnings
from typing import TYPE_CHECKING, Optional, Tuple

import torch
import torch.nn as nn
from einops import rearrange

from fla.modules import FusedRMSNormGated, RMSNorm, RotaryEmbedding, ShortConvolution
from fla.modules.activations import swiglu, swish
from fla.ops.abc.chunk import chunk_abc

if TYPE_CHECKING:
    from fla.models.utils import Cache


class ABCAttention(nn.Module):

    def __init__(
        self,
        hidden_size: int = 1024,
        expand_k: float = 0.5,
        expand_v: float = 1.0,
        num_heads: int = 4,
        use_short_conv: bool = False,
        conv_size: int = 4,
        conv_bias: bool = False,
        num_slots: Optional[int] = None,
        elementwise_affine: Optional[bool] = True,
        norm_eps: float = 1e-5,
        gate_low_rank_dim: int = 16,
        gate_logit_normalizer: int = 16,
        use_rope: bool = True,
        use_input_gate: bool = False,
        use_output_gate: bool = True,
        use_norm: bool = True,
        clamp_min: Optional[float] = -32,
        clamp_max: Optional[float] = 32,
        layer_idx: Optional[int] = None,
        **kwargs
    ) -> ABCAttention:
        super().__init__()

        self.hidden_size = hidden_size
        self.expand_k = expand_k
        self.expand_v = expand_v
        self.num_heads = num_heads
        self.key_dim = int(self.hidden_size * self.expand_k)
        self.value_dim = int(self.hidden_size * self.expand_v)
        self.head_k_dim = self.key_dim // self.num_heads
        self.head_v_dim = self.value_dim // self.num_heads

        self.use_short_conv = use_short_conv
        self.conv_size = conv_size
        self.conv_bias = conv_bias

        self.gate_low_rank_dim = gate_low_rank_dim
        self.gate_logit_normalizer = gate_logit_normalizer

        self.use_rope = use_rope
        self.use_input_gate = use_input_gate
        self.use_output_gate = use_output_gate
        self.use_norm = use_norm

        if num_slots is None:
            num_slots = self.head_k_dim
        self.num_slots = num_slots

        self.norm_eps = norm_eps

        self.clamp_min = clamp_min
        self.clamp_max = clamp_max
        self.layer_idx = layer_idx

        if layer_idx is None:
            warnings.warn(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.q_proj = nn.Linear(self.hidden_size, self.key_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.key_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.value_dim, bias=False)

        if use_output_gate:
            self.g_proj = nn.Linear(self.hidden_size, self.value_dim, bias=False)
        self.s_proj = nn.Linear(self.hidden_size, self.num_heads * self.num_slots, bias=False)
        self.o_proj = nn.Linear(self.value_dim, self.hidden_size, bias=False)

        if use_short_conv:
            self.conv_size = conv_size
            self.q_conv1d = ShortConvolution(self.key_dim, conv_size, activation='silu')
            self.k_conv1d = ShortConvolution(self.key_dim, conv_size, activation='silu')
            self.v_conv1d = ShortConvolution(self.value_dim, conv_size, activation='silu')

        if self.use_norm:
            if self.use_output_gate:
                self.g_norm = FusedRMSNormGated(
                    hidden_size=self.head_v_dim,
                    elementwise_affine=elementwise_affine,
                    eps=norm_eps
                )
            else:
                self.g_norm = RMSNorm(
                    hidden_size=self.head_v_dim,
                    elementwise_affine=elementwise_affine,
                    eps=norm_eps
                )

        if self.use_rope:
            self.rotary = RotaryEmbedding(self.head_k_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        **kwargs
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
        if attention_mask is not None:
            assert len(attention_mask.shape) == 2, (
                "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
                "for padding purposes (0 indicating padding). "
                "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
            )

        last_state = None
        if past_key_values is not None and len(past_key_values) > self.layer_idx:
            last_state = past_key_values[self.layer_idx]

        cu_seqlens = kwargs.get('cu_seqlens', None)
        if cu_seqlens is not None:
            raise NotImplementedError("Training with cu_seqlens is not supported yet for ABCAttention")
        if self.use_short_conv:
            conv_state_q, conv_state_k, conv_state_v = None, None, None
            if last_state is not None:
                conv_state_q, conv_state_k, conv_state_v = last_state['conv_state']
            conv_mask = attention_mask[:, -hidden_states.shape[1]:] if attention_mask is not None else None
            q, conv_state_q = self.q_conv1d(
                x=self.q_proj(hidden_states),
                mask=conv_mask,
                cache=conv_state_q,
                output_final_state=use_cache,
                cu_seqlens=cu_seqlens
            )
            k, conv_state_k = self.k_conv1d(
                x=self.k_proj(hidden_states),
                mask=conv_mask,
                cache=conv_state_k,
                output_final_state=use_cache,
                cu_seqlens=cu_seqlens
            )
            v, conv_state_v = self.v_conv1d(
                x=self.v_proj(hidden_states),
                mask=conv_mask,
                cache=conv_state_v,
                output_final_state=use_cache,
                cu_seqlens=cu_seqlens
            )
        else:
            q = self.q_proj(hidden_states)
            k = self.k_proj(hidden_states)
            v = self.v_proj(hidden_states)

        if self.use_input_gate:
            q, k, v = map(lambda x: swish(x), (q, k, v))
        # dealing with left-padding
        if attention_mask is not None:
            v = v.mul_(attention_mask[:, -v.shape[-2]:, None])

        q, k = map(lambda x: rearrange(x, '... (h d) -> ... h d', d=self.head_k_dim), (q, k))
        v = rearrange(v, '... (h d) -> ... h d', d=self.head_v_dim)
        if self.use_rope:
            seqlen_offset = 0
            if past_key_values is not None:
                seqlen_offset = past_key_values.get_seq_length(self.layer_idx)
            q, k = self.rotary(q, k, seqlen_offset=seqlen_offset)

        s = rearrange(self.s_proj(hidden_states), '... (h m) -> ... h m', m=self.num_slots)
        s = s.clamp_(self.clamp_min, self.clamp_max)

        recurrent_state = last_state['recurrent_state'] if last_state is not None else None
        o, recurrent_state = chunk_abc(
            q=q,
            k=k,
            v=v,
            s=s,
            initial_state=recurrent_state,
            output_final_state=use_cache,
            head_first=False
        )
        if past_key_values is not None:
            past_key_values.update(
                recurrent_state=recurrent_state,
                conv_state=(conv_state_q, conv_state_k, conv_state_v) if self.use_short_conv else None,
                layer_idx=self.layer_idx,
                offset=q.shape[1]
            )

        if self.use_norm and not self.use_output_gate:
            o = self.g_norm(o)
        elif self.use_output_gate:
            g = rearrange(self.g_proj(hidden_states), '... (h d) -> ... h d', d=self.head_v_dim)
            o = self.g_norm(o, g) if self.use_norm else swiglu(g, o)
        o = rearrange(o, '... h d -> ... (h d)')
        o = self.o_proj(o)

        return o, None, past_key_values

    def state_size(self, seq_len: int = 2048):
        return 2 * self.num_slots * self.hidden_size
fla/layers/attn.py ADDED
@@ -0,0 +1,222 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

from __future__ import annotations

import warnings
from typing import TYPE_CHECKING, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from einops import rearrange
from transformers.utils import logging

from fla.modules import RMSNorm, RotaryEmbedding
from fla.ops import parallel_attn, parallel_rectified_attn, parallel_softpick_attn, naive_attn, naive_rectified_attn, naive_softpick_attn

if TYPE_CHECKING:
    from fla.models.utils import Cache

try:
    from flash_attn import flash_attn_func, flash_attn_varlen_func
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
except ImportError:
    warnings.warn(
        "Flash Attention is not installed. Please install it via `pip install flash-attn --no-build-isolation`",
        category=ImportWarning
    )
    flash_attn_func = None

logger = logging.get_logger(__name__)


class Attention(nn.Module):

    def __init__(
        self,
        hidden_size: int = 2048,
        num_heads: int = 32,
        num_kv_heads: Optional[int] = None,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        window_size: Optional[int] = None,
        rope_theta: Optional[float] = 10000.,
        max_position_embeddings: Optional[int] = None,
        layer_idx: int = None,
        attn_impl: str = "flash_attn",
    ):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_heads = num_heads
        if num_kv_heads is None:
            self.num_kv_heads = self.num_heads
        else:
            self.num_kv_heads = num_kv_heads
        self.num_kv_groups = num_heads // self.num_kv_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.kv_dim = self.num_kv_heads * self.head_dim
        self.qkv_bias = qkv_bias
        self.qk_norm = qk_norm

        self.window_size = window_size
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings
        self.layer_idx = layer_idx
        self.attn_impl = attn_impl

        self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=self.qkv_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=self.qkv_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=self.qkv_bias)
        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)

        if qk_norm:
            self.q_norm = RMSNorm(self.head_dim)
            self.k_norm = RMSNorm(self.head_dim)

        self.rotary = RotaryEmbedding(dim=self.head_dim, base=self.rope_theta)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if attention_mask is not None:
            assert len(attention_mask.shape) == 2, (
                "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
                "for padding purposes (0 indicating padding). "
                "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
            )

        batch_size, q_len, _ = hidden_states.size()

        q, k, v = self.q_proj(hidden_states), self.k_proj(hidden_states), self.v_proj(hidden_states)

        q = rearrange(q, '... (h d) -> ... h d', d=self.head_dim)
        k = rearrange(k, '... (h d) -> ... h d', d=self.head_dim)
        v = rearrange(v, '... (h d) -> ... h d', d=self.head_dim)

        if self.qk_norm:
            q, k = self.q_norm(q), self.k_norm(k)

        # equivalent to cu_seqlens in `flash_attn`
        cu_seqlens = kwargs.get('cu_seqlens', None)

        seqlen_offset, max_seqlen = 0, q_len
        if past_key_values is not None:
            seqlen_offset = past_key_values.get_seq_length(self.layer_idx)
            max_seqlen = q.shape[1] + seqlen_offset

            if attention_mask is not None:
                # to delimit the offsets of padding tokens
                seqlen_offset = seqlen_offset + attention_mask.sum(-1) - attention_mask.shape[-1]
                max_seqlen = q.shape[1] + max(seqlen_offset)

        if self.max_position_embeddings is not None:
            max_seqlen = max(max_seqlen, self.max_position_embeddings)
        q, k = self.rotary(q, k, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen, cu_seqlens=cu_seqlens)

        if past_key_values is not None:
            cache_has_content = past_key_values.get_seq_length(self.layer_idx) > 0
            k_cached, v_cached = past_key_values.update(
                attn_state=(k.flatten(-2, -1), v.flatten(-2, -1)),
                layer_idx=self.layer_idx,
                offset=q_len,
                cache_kwargs=dict(window_size=self.window_size)
            )['attn_state']
            if cache_has_content:
                k, v = k_cached, v_cached
                k = rearrange(k, '... (h d) -> ... h d', d=self.head_dim)
                v = rearrange(v, '... (h d) -> ... h d', d=self.head_dim)

        if flash_attn_func is None:
            raise ImportError("Please install Flash Attention via `pip install flash-attn --no-build-isolation` first")

        # Contains at least one padding token in the sequence
        if self.attn_impl == "flash_attn":
            if attention_mask is not None:
                q, k, v, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(q, k, v, attention_mask, q_len)
                cu_seqlens_q, cu_seqlens_k = cu_seq_lens
                max_seqlen_q, max_seqlen_k = max_seq_lens
                o = flash_attn_varlen_func(
                    q, k, v,
                    cu_seqlens_q=cu_seqlens_q,
                    cu_seqlens_k=cu_seqlens_k,
                    max_seqlen_q=max_seqlen_q,
                    max_seqlen_k=max_seqlen_k,
                    causal=True,
                    window_size=(-1, -1) if self.window_size is None else (self.window_size-1, 0)
                )
                o = pad_input(o, indices_q, batch_size, q_len)
            elif cu_seqlens is not None:
                o = flash_attn_varlen_func(
                    q.squeeze(0), k.squeeze(0), v.squeeze(0),
                    cu_seqlens_q=cu_seqlens,
                    cu_seqlens_k=cu_seqlens,
                    max_seqlen_q=max_seqlen,
                    max_seqlen_k=max_seqlen,
                    causal=True,
                    window_size=(-1, -1) if self.window_size is None else (self.window_size-1, 0)
                ).unsqueeze(0)
            else:
                o = flash_attn_func(
                    q, k, v,
                    causal=True,
                    window_size=(-1, -1) if self.window_size is None else (self.window_size-1, 0)
                )
        elif self.attn_impl == "parallel_attn":
            o = parallel_attn(q, k, v, scale=self.head_dim**-0.5, cu_seqlens=cu_seqlens)
        elif self.attn_impl == "parallel_rectified_attn":
            o = parallel_rectified_attn(q, k, v, scale=self.head_dim**-0.5, cu_seqlens=cu_seqlens)
        elif self.attn_impl == "parallel_softpick_attn":
            o = parallel_softpick_attn(q, k, v, scale=self.head_dim**-0.5, cu_seqlens=cu_seqlens)
        elif self.attn_impl == "naive_attn":
            o, attentions = naive_attn(q, k, v, scale=self.head_dim**-0.5, cu_seqlens=cu_seqlens)
        elif self.attn_impl == "naive_rectified_attn":
            o, attentions = naive_rectified_attn(q, k, v, scale=self.head_dim**-0.5, cu_seqlens=cu_seqlens)
        elif self.attn_impl == "naive_softpick_attn":
            o, attentions = naive_softpick_attn(q, k, v, scale=self.head_dim**-0.5, cu_seqlens=cu_seqlens)
        else:
            raise ValueError(f"Unknown attention implementation: {self.attn_impl}")

        o = o.reshape(batch_size, q_len, -1)
        o = self.o_proj(o)

        if not output_attentions or "parallel" in self.attn_impl or "flash" in self.attn_impl:
            attentions = None

        return o, attentions, past_key_values

    def _upad_input(self, q, k, v, attention_mask, q_len):
        batch_size, seq_len, num_key_value_heads, head_dim = k.shape
        cache_mask = attention_mask[:, -seq_len:]
        seqlens = cache_mask.sum(-1, dtype=torch.int32)
        indices_k = torch.nonzero(cache_mask.flatten(), as_tuple=False).flatten()
        max_seqlen_k = seqlens.max().item()
        cu_seqlens_k = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))

        k = index_first_axis(k.reshape(batch_size * seq_len, num_key_value_heads, head_dim), indices_k)
        v = index_first_axis(v.reshape(batch_size * seq_len, num_key_value_heads, head_dim), indices_k)
        if q_len == seq_len:
            q = index_first_axis(q.reshape(batch_size * seq_len, self.num_heads, head_dim), indices_k)
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_q = max_seqlen_k
            indices_q = indices_k
        elif q_len == 1:
            max_seqlen_q = 1
            # There is a memcpy here, that is very bad.
            cu_seqlens_q = torch.arange(batch_size + 1, dtype=torch.int32, device=q.device)
            indices_q = cu_seqlens_q[:-1]
            q = q.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -q_len:]
            q, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, attention_mask)

        return q, k, v, indices_q, (cu_seqlens_q, cu_seqlens_k), (max_seqlen_q, max_seqlen_k)
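The attn_impl switch is the functional change in this file relative to the stock fla Attention layer. A hedged usage sketch of selecting one of the alternative score functions (it assumes the fla package from this repository and a CUDA device; note that the forward above still requires flash-attn to be importable even for non-flash implementations, and only the naive_* paths expose attention weights):

import torch
from fla.layers.attn import Attention

# hypothetical settings; any string handled by the dispatch above could be passed here
attn = Attention(hidden_size=2048, num_heads=32, layer_idx=0, attn_impl="naive_softpick_attn").cuda()
x = torch.randn(1, 256, 2048, device='cuda')
o, attentions, _ = attn(x, output_attentions=True)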
fla/layers/based.py ADDED
@@ -0,0 +1,96 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

"""
Linear attention in Based.
https://github.com/HazyResearch/zoology/blob/main/zoology/mixers/based.py
"""

import torch
import torch.nn as nn
from einops import rearrange

from fla.modules.feature_map import TaylorFeatureMap
from fla.ops.based import parallel_based
from fla.ops.linear_attn import chunk_linear_attn, fused_chunk_linear_attn


class BasedLinearAttention(nn.Module):

    def __init__(
        self,
        hidden_size: int,
        feature_dim: int = 16,
        num_key_value_heads: int = 12,
        num_heads: int = 12,
        feature_name: str = "taylor_exp",
        eps: float = 1e-12,
        causal: bool = True,
        mode: str = "parallel",
    ):
        super().__init__()

        self.hidden_size = hidden_size
        self.mode = mode
        self.feature_name = feature_name
        self.feature_dim = feature_dim
        self.num_key_value_heads = num_key_value_heads
        self.num_heads = num_heads
        self.head_dim = self.hidden_size // self.num_key_value_heads
        assert self.hidden_size % self.head_dim == 0
        self.causal = causal

        self.q_proj = nn.Linear(self.hidden_size, self.feature_dim * self.num_heads, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.feature_dim * self.num_heads, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
        self.dropout = nn.Identity()
        self.feature_map = TaylorFeatureMap(feature_dim)
        self.eps = eps

    def forward(self, hidden_states: torch.Tensor, **kwargs):
        mode = self.mode
        q, k, v = self.q_proj(hidden_states), self.k_proj(hidden_states), self.v_proj(hidden_states)
        q, k, v = map(lambda x: rearrange(x, "... (h d) -> ... h d", d=self.head_dim), [q, k, v])
        if mode == "fused_chunk":
            q, k = self.feature_map(q), self.feature_map(k)
            o, _ = fused_chunk_linear_attn(q, k, v, normalize=True, scale=1, head_first=False)
        elif mode == 'chunk':
            q, k = self.feature_map(q), self.feature_map(k)
            o, _ = chunk_linear_attn(q, k, v, normalize=True, scale=1, head_first=False)
        elif mode == 'parallel':
            assert q.shape[-1] <= 128
            o = parallel_based(q, k, v, scale=1, use_norm=True, head_first=False)
        o = rearrange(o, 'b t h d -> b t (h d)')
        o = self.o_proj(o)
        o = self.dropout(o)
        return o

    # https://github.com/HazyResearch/zoology/blob/main/zoology/mixers/based.py#L119

    def forward_reference(self, hidden_states: torch.Tensor, filters: torch.Tensor = None, *args, **kwargs):
        """
        x (torch.Tensor): tensor of shape (b, d, t)
        y (torch.Tensor): tensor of shape (b, d, t)
        """
        # hidden_states = hidden_states.transpose(1, 2)
        b, t, _ = hidden_states.size()
        q, k, v = self.q_proj(hidden_states), self.k_proj(hidden_states), self.v_proj(hidden_states)

        q = q.view(b, t, self.num_heads, self.feature_dim).transpose(1, 2)
        k = k.view(b, t, self.num_key_value_heads, self.feature_dim).transpose(1, 2)
        v = v.view(b, t, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        # Linear attention
        q, k = self.feature_map(q), self.feature_map(k)
        q, k, v = q.unsqueeze(-2), k.unsqueeze(-2), v.unsqueeze(-1)

        # Compute attention
        if self.causal:
            y = ((q * (k * v).cumsum(2)).sum(-1) / ((q * k.cumsum(2)).sum(-1) + self.eps))
        else:
            y = ((q * (k * v).sum(2, True)).sum(-1) / ((q * k.sum(2, True)).sum(-1) + self.eps))
        y = rearrange(y, 'b h t d -> b t (h d)')
        y = self.o_proj(y.to(hidden_states.dtype))
        y = self.dropout(y)
        return y.to(hidden_states.dtype)
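forward_reference above is the pure-PyTorch cumulative-sum form of causal linear attention, y_t = phi(q_t) . sum_{s<=t} phi(k_s) v_s / (phi(q_t) . sum_{s<=t} phi(k_s) + eps). A hedged usage sketch (it assumes the fla package is installed; sizes are arbitrary and chosen so that num_heads * head_dim equals hidden_size):

import torch
from fla.layers.based import BasedLinearAttention

# hypothetical sizes; forward_reference avoids the Triton kernels used by forward()
layer = BasedLinearAttention(hidden_size=768, feature_dim=16, num_heads=12, num_key_value_heads=12)
x = torch.randn(2, 64, 768)
y = layer.forward_reference(x)
print(y.shape)  # torch.Size([2, 64, 768])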
fla/layers/bitattn.py ADDED
@@ -0,0 +1,192 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

from __future__ import annotations

import warnings
from typing import TYPE_CHECKING, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from einops import rearrange
from transformers.utils import logging

from fla.modules import RotaryEmbedding
from fla.modules.fused_bitlinear import FusedBitLinear

if TYPE_CHECKING:
    from fla.models.utils import Cache

try:
    from flash_attn import flash_attn_func, flash_attn_varlen_func
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
except ImportError:
    warnings.warn(
        "Flash Attention is not installed. Please install it via `pip install flash-attn --no-build-isolation`",
        category=ImportWarning
    )
    flash_attn_func = None

logger = logging.get_logger(__name__)


class BitAttention(nn.Module):

    def __init__(
        self,
        hidden_size: int = 2048,
        num_heads: int = 32,
        num_kv_heads: Optional[int] = None,
        window_size: Optional[int] = None,
        rope_theta: Optional[float] = 10000.,
        max_position_embeddings: Optional[int] = None,
        norm_eps: float = 1e-5,
        layer_idx: int = None
    ):
        super().__init__()

        self.num_heads = num_heads
        if num_kv_heads is None:
            self.num_kv_heads = self.num_heads
        else:
            self.num_kv_heads = num_kv_heads
        self.num_kv_groups = num_heads // self.num_kv_heads
        self.hidden_size = hidden_size
        self.head_dim = self.hidden_size // self.num_heads
        self.kv_dim = self.num_kv_heads * self.head_dim
        self.kv_dim = self.num_kv_heads * self.head_dim
        self.window_size = window_size
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings
        self.layer_idx = layer_idx

        self.q_proj = FusedBitLinear(self.hidden_size, self.hidden_size, bias=False)
        self.k_proj = FusedBitLinear(self.hidden_size, self.kv_dim, bias=False)
        self.v_proj = FusedBitLinear(self.hidden_size, self.kv_dim, bias=False)
        self.o_proj = FusedBitLinear(self.hidden_size, self.hidden_size, bias=False)

        self.rotary = RotaryEmbedding(dim=self.head_dim, base=self.rope_theta)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if attention_mask is not None:
            assert len(attention_mask.shape) == 2, (
                "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
                "for padding purposes (0 indicating padding). "
                "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
            )

        batch_size, q_len, _ = hidden_states.size()

        q = rearrange(self.q_proj(hidden_states), '... (h d) -> ... h d', d=self.head_dim)
        k = rearrange(self.k_proj(hidden_states), '... (h d) -> ... h d', d=self.head_dim)
        v = rearrange(self.v_proj(hidden_states), '... (h d) -> ... h d', d=self.head_dim)

        # equivalent to cu_seqlens in `flash_attn`
        cu_seqlens = kwargs.get('cu_seqlens', None)

        seqlen_offset, max_seqlen = 0, q_len
        if past_key_values is not None:
            seqlen_offset = past_key_values.get_seq_length(self.layer_idx)
            max_seqlen = q.shape[1] + seqlen_offset

            if attention_mask is not None:
                # to delimit the offsets of padding tokens
                seqlen_offset = seqlen_offset + attention_mask.sum(-1) - attention_mask.shape[-1]
                max_seqlen = q.shape[1] + max(seqlen_offset)

        if self.max_position_embeddings is not None:
            max_seqlen = max(max_seqlen, self.max_position_embeddings)
        q, k = self.rotary(q, k, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen, cu_seqlens=cu_seqlens)

        if past_key_values is not None:
            cache_has_content = past_key_values.get_seq_length(self.layer_idx) > 0
            k_cached, v_cached = past_key_values.update(
                attn_state=(k.flatten(-2, -1), v.flatten(-2, -1)),
                layer_idx=self.layer_idx,
                offset=q_len,
                cache_kwargs=dict(window_size=self.window_size)
            )['attn_state']
            if cache_has_content:
                k, v = k_cached, v_cached
                k = rearrange(k, '... (h d) -> ... h d', d=self.head_dim)
                v = rearrange(v, '... (h d) -> ... h d', d=self.head_dim)

        if flash_attn_func is None:
            raise ImportError("Please install Flash Attention via `pip install flash-attn --no-build-isolation` first")

        # Contains at least one padding token in the sequence
        if attention_mask is not None:
            q, k, v, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(q, k, v, attention_mask, q_len)
            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_q, max_seqlen_k = max_seq_lens
            o = flash_attn_varlen_func(
                q, k, v,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_q,
                max_seqlen_k=max_seqlen_k,
                causal=True,
                window_size=(-1, -1) if self.window_size is None else (self.window_size-1, 0)
            )
            o = pad_input(o, indices_q, batch_size, q_len)
        elif cu_seqlens is not None:
            o = flash_attn_varlen_func(
                q.squeeze(0), k.squeeze(0), v.squeeze(0),
                cu_seqlens_q=cu_seqlens,
                cu_seqlens_k=cu_seqlens,
                max_seqlen_q=max_seqlen,
                max_seqlen_k=max_seqlen,
                causal=True,
                window_size=(-1, -1) if self.window_size is None else (self.window_size-1, 0)
            ).unsqueeze(0)
        else:
            o = flash_attn_func(
                q, k, v,
                causal=True,
                window_size=(-1, -1) if self.window_size is None else (self.window_size-1, 0)
            )
        o = o.reshape(batch_size, q_len, -1)
        o = self.o_proj(o)

        if not output_attentions:
            attentions = None

        return o, attentions, past_key_values

    def _upad_input(self, q, k, v, attention_mask, q_len):
        batch_size, seq_len, num_key_value_heads, head_dim = k.shape
        cache_mask = attention_mask[:, -seq_len:]
        seqlens = cache_mask.sum(-1, dtype=torch.int32)
        indices_k = torch.nonzero(cache_mask.flatten(), as_tuple=False).flatten()
        max_seqlen_k = seqlens.max().item()
        cu_seqlens_k = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))

        k = index_first_axis(k.reshape(batch_size * seq_len, num_key_value_heads, head_dim), indices_k)
        v = index_first_axis(v.reshape(batch_size * seq_len, num_key_value_heads, head_dim), indices_k)
        if q_len == seq_len:
            q = index_first_axis(q.reshape(batch_size * seq_len, self.num_heads, head_dim), indices_k)
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_q = max_seqlen_k
            indices_q = indices_k
        elif q_len == 1:
            max_seqlen_q = 1
            # There is a memcpy here, that is very bad.
            cu_seqlens_q = torch.arange(batch_size + 1, dtype=torch.int32, device=q.device)
            indices_q = cu_seqlens_q[:-1]
            q = q.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -q_len:]
            q, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, attention_mask)

        return q, k, v, indices_q, (cu_seqlens_q, cu_seqlens_k), (max_seqlen_q, max_seqlen_k)
fla/layers/delta_net.py ADDED
@@ -0,0 +1,291 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

from __future__ import annotations

from typing import TYPE_CHECKING, Dict, Optional, Tuple

import torch
import torch.nn as nn
from einops import rearrange
from torch.nn import functional as F

from fla.modules import FusedRMSNormGated, RMSNorm, ShortConvolution
from fla.ops.delta_rule import chunk_delta_rule, fused_recurrent_delta_rule

if TYPE_CHECKING:
    from transformers.processing_utils import Unpack

    from fla.models.utils import Cache


def elu_p1(x):
    return (F.elu(x, 1., False) + 1.).to(x)


def sum_norm(x):
    return (x / x.sum(-1, keepdim=True)).to(x)


class DeltaNet(nn.Module):
    r"""
    The layer implementation for [Parallelizing Linear Transformers with the Delta Rule over Sequence Length](https://arxiv.org/abs/2406.06484). # noqa
    DeltaNet was originally proposed in [Linear Transformers Are Secretly Fast Weight Programmers](https://arxiv.org/abs/2102.11174). # noqa

    Args:
        mode (str, Optional):
            Which DeltaNet kernel to use.
            Currently available: `chunk` and `fused_recurrent` (`fused_chunk` is deprecated).
            Default: `chunk`.
        hidden_size (int, Optional):
            The hidden size of the input. Default: 1024.
        expand_k (float, Optional):
            The expansion ratio for the key dim. Default: 1.0.
        expand_v (float, Optional):
            The expansion ratio for the value dim. Default: 1.0.
        num_heads (int, Optional):
            The number of heads. Default: 4.
        use_beta (bool, Optional):
            Whether to use beta. Default: `True`.
        use_gate (bool, Optional):
            Whether to use output gate. Default: `False`.
        use_short_conv (bool, Optional):
            Whether to use short convolutions. Default: `True`.
        conv_size (int, Optional):
            The kernel size of the short convolution, only used when `use_short_conv` is `True`. Default: 4.
        conv_bias (bool, Optional):
            Whether to use bias in the short convolution, only used when `use_short_conv` is `True`. Default: `False`.
        allow_neg_eigval (bool, Optional):
            Allow negative eigenvalues. Default: `False`. If set to `True`, the beta will be multiplied by 2.
            See reference: [Unlocking State-Tracking in Linear RNNs Through Negative Eigenvalues](https://arxiv.org/abs/2411.12537)
        layer_idx (int, Optional):
            The index of the layer. Default: None.
        norm_eps (float, Optional):
            The epsilon value for the layernorm/rmsnorm layer. Default: 1e-5.
        qk_activation (str, Optional):
            The activation function for the query and key. Default: `silu`.
        qk_norm (str, Optional):
            The normalization method for the query and key. Default: `l2`.
    """

    def __init__(
        self,
        mode: str = 'chunk',
        d_model: int = None,
        hidden_size: int = 1024,
        expand_k: float = 1.0,
        expand_v: float = 1.0,
        num_heads: int = 4,
        use_beta: bool = True,
        use_gate: bool = False,
        use_short_conv: bool = True,
        conv_size: int = 4,
        conv_bias: bool = False,
        allow_neg_eigval: bool = False,
        layer_idx: int = None,
        qk_activation: str = 'silu',
        qk_norm: str = 'l2',
        norm_eps: float = 1e-5,
        **kwargs
    ) -> DeltaNet:
        super().__init__()

        self.mode = mode
        self.qk_activation = qk_activation
        self.qk_norm = qk_norm

        assert self.qk_activation in ['silu', 'relu', 'elu', 'identity']
        assert self.qk_norm in ['l2', 'sum']

        if d_model is not None:
            hidden_size = d_model
        self.hidden_size = hidden_size
        self.expand_k = expand_k
        self.expand_v = expand_v
        self.num_heads = num_heads
        self.use_gate = use_gate
        self.use_short_conv = use_short_conv
        self.conv_size = conv_size
        self.conv_bias = conv_bias
        self.allow_neg_eigval = allow_neg_eigval

        self.key_dim = int(hidden_size * expand_k)
        self.value_dim = int(hidden_size * expand_v)
        self.head_k_dim = self.key_dim // num_heads
        self.head_v_dim = self.value_dim // num_heads
        self.layer_idx = layer_idx

        self.silu = nn.SiLU()
        if mode == 'fused_chunk':
            raise NotImplementedError("fused_chunk_delta_rule is now deprecated. Please use `chunk_delta_rule` instead.")
        assert mode in ['chunk', 'fused_recurrent'], f"Not supported mode `{mode}`."
        assert self.key_dim % num_heads == 0, f"key dim must be divisible by num_heads of {num_heads}"
        assert self.value_dim % num_heads == 0, f"value dim must be divisible by num_heads of {num_heads}"

        self.q_proj = nn.Linear(hidden_size, self.key_dim, bias=False)
        self.k_proj = nn.Linear(hidden_size, self.key_dim, bias=False)
        self.v_proj = nn.Linear(hidden_size, self.value_dim, bias=False)

        self.use_beta = use_beta
        if self.use_beta:
            self.b_proj = nn.Linear(hidden_size, self.num_heads, bias=False)
        if use_short_conv:
            self.conv_size = conv_size
            self.q_conv1d = ShortConvolution(
                hidden_size=self.key_dim,
                kernel_size=conv_size,
                activation='silu' if qk_activation == 'silu' else None
            )
            self.k_conv1d = ShortConvolution(
                hidden_size=self.key_dim,
                kernel_size=conv_size,
                activation='silu' if qk_activation == 'silu' else None
            )
            self.v_conv1d = ShortConvolution(
                hidden_size=self.value_dim,
                kernel_size=conv_size,
                activation='silu'
            )
        else:
            raise UserWarning(
                "ShortConvolution is crucial to the performance. "
                "Do not turn it off, i.e., setting `use_short_conv=False` unless you know what you are doing."
            )
        if use_gate:
            self.g_proj = nn.Linear(hidden_size, self.value_dim, bias=False)
            self.o_norm = FusedRMSNormGated(self.head_v_dim, eps=norm_eps)
        else:
            self.o_norm = RMSNorm(self.head_v_dim, eps=norm_eps)

        self.o_proj = nn.Linear(self.value_dim, hidden_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        **kwargs: Unpack[Dict]
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
        if attention_mask is not None:
            assert len(attention_mask.shape) == 2, (
                "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
                "for padding purposes (0 indicating padding). "
                "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
            )

        # change to inference mode.
        mode = 'fused_recurrent' if hidden_states.shape[1] <= 64 else self.mode

        last_state = None
        if past_key_values is not None and len(past_key_values) > self.layer_idx:
            last_state = past_key_values[self.layer_idx]

        cu_seqlens = kwargs.get('cu_seqlens', None)
        if self.use_short_conv:
            conv_state_q, conv_state_k, conv_state_v = None, None, None
            if last_state is not None:
                conv_state_q, conv_state_k, conv_state_v = last_state['conv_state']
            conv_mask = attention_mask[:, -hidden_states.shape[1]:] if attention_mask is not None else None
            q, conv_state_q = self.q_conv1d(
                x=self.q_proj(hidden_states),
                mask=conv_mask,
                cache=conv_state_q,
                output_final_state=use_cache,
                cu_seqlens=cu_seqlens
            )
            k, conv_state_k = self.k_conv1d(
                x=self.k_proj(hidden_states),
                mask=conv_mask,
                cache=conv_state_k,
                output_final_state=use_cache,
                cu_seqlens=cu_seqlens
            )
            v, conv_state_v = self.v_conv1d(
                x=self.v_proj(hidden_states),
                mask=conv_mask,
                cache=conv_state_v,
                output_final_state=use_cache,
                cu_seqlens=cu_seqlens
            )
        else:
            q = self.q_proj(hidden_states)
            k = self.k_proj(hidden_states)
            if self.qk_activation == 'silu':
                q, k = self.silu(q), self.silu(k)
            v = self.silu(self.v_proj(hidden_states))

        q, k = map(lambda x: rearrange(x, '... (h d) -> ... h d', d=self.head_k_dim), (q, k))
        v = rearrange(v, '... (h d) -> ... h d', d=self.head_v_dim)
        if self.qk_activation != 'silu':
            if self.qk_activation == 'relu':
                q, k = q.relu(), k.relu()
            elif self.qk_activation == 'elu':
                q, k = elu_p1(q), elu_p1(k)
            elif self.qk_activation == 'identity':
                pass
            else:
                raise NotImplementedError

        if self.qk_norm == 'sum':
            q = sum_norm(q).to(q)
            k = sum_norm(k).to(k)

        if self.use_beta:
            beta = self.b_proj(hidden_states).sigmoid()
        else:
            beta = q.new_ones(q.shape[0], q.shape[1], q.shape[2])

        if self.allow_neg_eigval:
            beta = beta * 2.

        # dealing with padding
        if attention_mask is not None:
            beta = beta.mul(attention_mask[:, -beta.shape[-2]:, None])

        recurrent_state = last_state['recurrent_state'] if last_state is not None else None
        if mode == 'fused_recurrent':
            o, recurrent_state = fused_recurrent_delta_rule(
                q=q,
                k=k,
                v=v,
                beta=beta,
                initial_state=recurrent_state,
                output_final_state=use_cache,
                cu_seqlens=cu_seqlens,
                head_first=False,
                use_qk_l2norm_in_kernel=True if self.qk_norm == 'l2' else False
            )
        elif mode == 'chunk':
            o, recurrent_state = chunk_delta_rule(
                q=q,
                k=k,
                v=v,
                beta=beta,
                initial_state=recurrent_state,
                output_final_state=use_cache,
                cu_seqlens=cu_seqlens,
                head_first=False,
                use_qk_l2norm_in_kernel=True if self.qk_norm == 'l2' else False
            )
        else:
            raise NotImplementedError(f"Not supported mode `{mode}`.")

        if past_key_values is not None:
            past_key_values.update(
                recurrent_state=recurrent_state,
                conv_state=(conv_state_q, conv_state_k, conv_state_v) if self.use_short_conv else None,
                layer_idx=self.layer_idx,
                offset=q.shape[1]
            )

        if self.use_gate:
            g = rearrange(self.g_proj(hidden_states), '... (h d) -> ... h d', d=self.head_v_dim)
            o = self.o_norm(o, g)
        else:
            o = self.o_norm(o)
        o = rearrange(o, 'b t h d -> b t (h d)')
        o = self.o_proj(o)

        return o, None, past_key_values
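For readers new to the delta rule, the recurrence that chunk_delta_rule and fused_recurrent_delta_rule implement can be written per head as S_t = S_{t-1} + beta_t * k_t (v_t - S_{t-1}^T k_t)^T with output o_t = S_t^T q_t. The sketch below is a minimal pure-PyTorch reference of that math only; it ignores chunking, the short convolutions, and the optional in-kernel L2 normalization of q/k, and is not the kernel used by this layer.

import torch

def naive_delta_rule(q, k, v, beta):
    # q, k: [B, T, H, Dk]; v: [B, T, H, Dv]; beta: [B, T, H]
    B, T, H, Dk = k.shape
    Dv = v.shape[-1]
    S = k.new_zeros(B, H, Dk, Dv)          # fast-weight state per head
    outs = []
    for t in range(T):
        k_t, v_t = k[:, t], v[:, t]
        b_t = beta[:, t].unsqueeze(-1)      # write strength in (0, 1)
        # predict the current value from the running state, then write the gated error
        v_pred = torch.einsum('bhkv,bhk->bhv', S, k_t)
        S = S + torch.einsum('bhk,bhv->bhkv', k_t, b_t * (v_t - v_pred))
        outs.append(torch.einsum('bhkv,bhk->bhv', S, q[:, t]))
    return torch.stack(outs, dim=1)         # [B, T, H, Dv]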
fla/layers/forgetting_attn.py ADDED
@@ -0,0 +1,109 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

from __future__ import annotations

from typing import TYPE_CHECKING, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from einops import rearrange
from transformers.utils import logging

from fla.modules import GroupNorm
from fla.ops.forgetting_attn.parallel import parallel_forgetting_attn

if TYPE_CHECKING:
    from fla.models.utils import Cache


logger = logging.get_logger(__name__)


class ForgettingAttention(nn.Module):

    def __init__(
        self,
        hidden_size: int = 2048,
        num_heads: int = 32,
        num_kv_heads: Optional[int] = None,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        window_size: Optional[int] = None,
        use_output_gate: bool = False,
        layer_idx: int = None
    ):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_heads = num_heads
        if num_kv_heads is None:
            self.num_kv_heads = self.num_heads
        else:
            self.num_kv_heads = num_kv_heads
        self.num_kv_groups = num_heads // self.num_kv_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.kv_dim = self.num_kv_heads * self.head_dim
        self.qkv_bias = qkv_bias
        self.qk_norm = qk_norm

        self.window_size = window_size
        self.use_output_gate = use_output_gate
        self.layer_idx = layer_idx

        self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=self.qkv_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=self.qkv_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=self.qkv_bias)
        self.f_proj = nn.Linear(self.hidden_size, self.num_heads, bias=True)

        if use_output_gate:
            self.g_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)

        if qk_norm:
            self.q_norm = GroupNorm(
                num_groups=self.num_heads,
                hidden_size=self.hidden_size,
                is_rms_norm=True,
            )
            self.k_norm = GroupNorm(
                num_groups=self.num_kv_heads,
                hidden_size=self.kv_dim,
                is_rms_norm=True,
            )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if attention_mask is not None:
            assert len(attention_mask.shape) == 2, (
                "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
                "for padding purposes (0 indicating padding). "
                "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
            )

        cu_seqlens = kwargs.get('cu_seqlens', None)
        q, k, v = self.q_proj(hidden_states), self.k_proj(hidden_states), self.v_proj(hidden_states)
        f = F.logsigmoid(self.f_proj(hidden_states).float())
        if self.qk_norm:
            q, k = self.q_norm(q), self.k_norm(k)

        q = rearrange(q, '... (h d) -> ... h d', d=self.head_dim)
        k = rearrange(k, '... (h d) -> ... h d', d=self.head_dim)
        v = rearrange(v, '... (h d) -> ... h d', d=self.head_dim)

        o = parallel_forgetting_attn(q, k, v, f, cu_seqlens=cu_seqlens)
        o = rearrange(o, '... h d -> ... (h d)')
        if self.use_output_gate:
            o = self.g_proj(hidden_states).sigmoid() * o
        o = self.o_proj(o)

        return o, None, past_key_values
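The forget gate f computed above enters the attention scores as a cumulative log-decay: up to masking and padding details handled inside the kernel, parallel_forgetting_attn computes a causal softmax over q_t . k_s * d^-0.5 + (F_t - F_s), where F_t is the running sum of the log-sigmoid gates. A minimal quadratic-time reference, offered only as an illustration and assuming num_kv_heads == num_heads and no padding:

import torch

def naive_forgetting_attn(q, k, v, f):
    # q, k, v: [B, T, H, D]; f: [B, T, H] log-forget-gates (log-sigmoid outputs, <= 0)
    B, T, H, D = q.shape
    logits = torch.einsum('bthd,bshd->bhts', q, k) * D ** -0.5
    f_cum = f.float().cumsum(dim=1).transpose(1, 2)       # [B, H, T], running sum F_t
    bias = f_cum.unsqueeze(-1) - f_cum.unsqueeze(-2)      # bias[t, s] = F_t - F_s
    logits = logits.float() + bias
    causal = torch.ones(T, T, dtype=torch.bool, device=q.device).tril()
    logits = logits.masked_fill(~causal, float('-inf'))
    p = logits.softmax(dim=-1)
    return torch.einsum('bhts,bshd->bthd', p, v).to(v.dtype)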
fla/layers/gated_deltanet.py ADDED
@@ -0,0 +1,293 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

from __future__ import annotations

import math
from typing import TYPE_CHECKING, Dict, Optional, Tuple

import torch
import torch.nn as nn
from einops import rearrange
from torch.nn import functional as F

from fla.modules import FusedRMSNormGated, RMSNorm, ShortConvolution
from fla.ops.gated_delta_rule import chunk_gated_delta_rule, fused_recurrent_gated_delta_rule

if TYPE_CHECKING:
    from transformers.processing_utils import Unpack

    from fla.models.utils import Cache


@torch.compile
def elu_p1(x):
    return (F.elu(x, 1., False) + 1.).to(x)


@torch.compile
def sum_norm(x):
    return (x / x.sum(-1, keepdim=True)).to(x)


class GatedDeltaNet(nn.Module):
    """
    The layer implementation for [Gated Delta Networks: Improving Mamba2 with Delta Rule](https://arxiv.org/abs/2412.06464). # noqa

    Similar to Mamba2, each layer contains around 6*hidden_size*hidden_size parameters.

    Parameter allocation when use_gate=True:
        - 0.75 * hidden_size * hidden_size for the q_proj and k_proj each
        - 1.5 * hidden_size * hidden_size for the v_proj, g_proj and o_proj each
        - Others are ignorably small.
        - In total = 0.75 * 2 + 1.5 * 3 = 6 * hidden_size * hidden_size
    NOTE: num_heads * head_dim = 0.75 * hidden_size, please make sure to set the correct num_heads and head_dim.

    Parameter allocation when use_gate=False:
        - 1 * hidden_size * hidden_size for the q_proj and k_proj each
        - 2 * hidden_size * hidden_size for the v_proj and o_proj each
        - Others are ignorably small.
        - In total = 1 * 2 + 2 * 2 = 6 * hidden_size * hidden_size

    Args:
        hidden_size (int, Optional):
            The hidden size of the input. Default: 2048.
        expand_v (float, Optional):
            The expansion ratio for the value dim. Default: 2.0.
        head_dim (int, Optional):
            The dimension of each head. Default: 256.
        num_heads (int, Optional):
            The number of heads. Default: 6.
        mode (str, Optional):
            Which Gated DeltaNet kernel to use.
            Currently available: `chunk` and `fused_recurrent`.
            Default: `chunk`.
        use_beta (bool, Optional):
            Whether to use beta. Default: `True`.
        use_gate (bool, Optional):
            Whether to use output gate. Default: `True`.
        use_short_conv (bool, Optional):
            Whether to use short convolutions. Default: `True`.
        conv_size (int, Optional):
            The kernel size of the short convolution, only used when `use_short_conv` is `True`. Default: 4.
        conv_bias (bool, Optional):
            Whether to use bias in the short convolution, only used when `use_short_conv` is `True`. Default: `False`.
        layer_idx (int, Optional):
            The index of the layer. Default: None.
        norm_eps (float, Optional):
            The epsilon value for the normalization layer. Default: 1e-5.
    """

    def __init__(
        self,
        hidden_size: int = 2048,
        expand_v: float = 2,
        head_dim: int = 256,
        num_heads: int = 6,
        mode: str = 'chunk',
        use_gate: bool = True,
        use_short_conv: bool = True,
        conv_size: int = 4,
        conv_bias: bool = False,
        layer_idx: int = None,
        norm_eps: float = 1e-5,
        **kwargs
    ) -> GatedDeltaNet:
        super().__init__()

        self.mode = mode

        self.hidden_size = hidden_size
        self.expand_v = expand_v

        self.use_gate = use_gate
        self.use_short_conv = use_short_conv
        self.conv_size = conv_size
        self.conv_bias = conv_bias

        self.head_dim = head_dim
        self.num_heads = num_heads

        self.key_dim = int(self.num_heads * self.head_dim)
        self.value_dim = int(self.key_dim * self.expand_v)
        self.head_k_dim = head_dim
        self.head_v_dim = int(head_dim * self.expand_v)
        self.layer_idx = layer_idx

        # Consistency check: Ensure expand_v produces integer values
        if not math.isclose(self.key_dim * expand_v, self.value_dim, rel_tol=1e-5):
            raise ValueError(
                f"expand_v={expand_v} does not produce an integer value when multiplied by key_dim={self.key_dim}. "
                f"Resulting value_dim would be {self.key_dim * expand_v}, which is invalid for nn.Linear."
            )
        if not math.isclose(head_dim * expand_v, self.head_v_dim, rel_tol=1e-5):
            raise ValueError(
                f"expand_v={expand_v} does not produce an integer value when multiplied by head_dim={head_dim}. "
                f"Resulting head_v_dim would be {head_dim * expand_v}, which is invalid for FusedRMSNormGated."
            )
        assert mode in ['chunk', 'fused_recurrent'], f"Not supported mode `{mode}`."

        self.q_proj = nn.Linear(hidden_size, self.key_dim, bias=False)
        self.k_proj = nn.Linear(hidden_size, self.key_dim, bias=False)
        self.v_proj = nn.Linear(hidden_size, self.value_dim, bias=False)
        self.a_proj = nn.Linear(hidden_size, self.num_heads, bias=False)
        self.b_proj = nn.Linear(hidden_size, self.num_heads, bias=False)

        A = torch.empty(self.num_heads, dtype=torch.float32).uniform_(0, 16)
        self.A_log = nn.Parameter(torch.log(A))
        self.A_log._no_weight_decay = True
        # hard coded for now
        dt_min = 0.001
        dt_max = 0.1
        dt_init_floor = 1e-4
        dt = torch.exp(
            torch.rand(self.num_heads) * (math.log(dt_max) - math.log(dt_min))
            + math.log(dt_min)
        )
        dt = torch.clamp(dt, min=dt_init_floor)
        # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
        inv_dt = dt + torch.log(-torch.expm1(-dt))
        self.dt_bias = nn.Parameter(inv_dt)
        # Just to be explicit. Without this we already don't put wd on dt_bias because of the check
        # name.endswith("bias") in param_grouping.py
        self.dt_bias._no_weight_decay = True

        if use_short_conv:
            self.conv_size = conv_size
            self.q_conv1d = ShortConvolution(
                hidden_size=self.key_dim,
                kernel_size=conv_size,
                activation='silu'
            )
            self.k_conv1d = ShortConvolution(
                hidden_size=self.key_dim,
                kernel_size=conv_size,
                activation='silu'
            )
            self.v_conv1d = ShortConvolution(
                hidden_size=self.value_dim,
                kernel_size=conv_size,
                activation='silu'
            )
        else:
            raise UserWarning(
                "ShortConvolution is crucial to the performance. "
                "Do not turn it off, i.e., setting `use_short_conv=False` unless you know what you are doing."
            )
        if use_gate:
            self.g_proj = nn.Linear(hidden_size, self.value_dim, bias=False)
            self.o_norm = FusedRMSNormGated(self.head_v_dim, eps=norm_eps)
        else:
            self.o_norm = RMSNorm(self.head_v_dim, eps=norm_eps)
        self.o_proj = nn.Linear(self.value_dim, hidden_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        **kwargs: Unpack[Dict]
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
        if attention_mask is not None:
            assert len(attention_mask.shape) == 2, (
                "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
                "for padding purposes (0 indicating padding). "
                "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
            )

        mode = 'fused_recurrent' if hidden_states.shape[1] <= 64 else self.mode
        if self.training:
            assert mode == 'chunk', "Only chunk mode is supported in training."

        last_state = None
        if past_key_values is not None and len(past_key_values) > self.layer_idx:
            last_state = past_key_values[self.layer_idx]

        cu_seqlens = kwargs.get('cu_seqlens', None)
        if self.use_short_conv:
            conv_state_q, conv_state_k, conv_state_v = None, None, None
            if last_state is not None:
                conv_state_q, conv_state_k, conv_state_v = last_state['conv_state']
            conv_mask = attention_mask[:, -hidden_states.shape[1]:] if attention_mask is not None else None
            q, conv_state_q = self.q_conv1d(
                x=self.q_proj(hidden_states),
                mask=conv_mask,
                cache=conv_state_q,
                output_final_state=use_cache,
                cu_seqlens=cu_seqlens
            )
            k, conv_state_k = self.k_conv1d(
                x=self.k_proj(hidden_states),
                mask=conv_mask,
                cache=conv_state_k,
                output_final_state=use_cache,
                cu_seqlens=cu_seqlens
            )
            v, conv_state_v = self.v_conv1d(
                x=self.v_proj(hidden_states),
                mask=conv_mask,
                cache=conv_state_v,
                output_final_state=use_cache,
233
+ cu_seqlens=cu_seqlens
234
+ )
235
+ else:
236
+ q = F.silu(self.q_proj(hidden_states))
237
+ k = F.silu(self.k_proj(hidden_states))
238
+ v = F.silu(self.v_proj(hidden_states))
239
+
240
+ q, k = map(lambda x: rearrange(x, 'b t (h d) -> b t h d', d=self.head_k_dim), (q, k))
241
+ v = rearrange(v, 'b t (h d) -> b t h d', d=self.head_v_dim)
242
+ beta = self.b_proj(hidden_states).sigmoid()
243
+ g = -self.A_log.float().exp() * F.softplus(self.a_proj(hidden_states).float() + self.dt_bias)
244
+
245
+ # dealing with padding
246
+ if attention_mask is not None:
247
+ beta = beta.mul(attention_mask[:, -beta.shape[-2]:, None])
248
+ g = g.mul(attention_mask[:, -g.shape[-2]:, None])
249
+
250
+ recurrent_state = last_state['recurrent_state'] if last_state is not None else None
251
+ if mode == 'chunk':
252
+ o, recurrent_state = chunk_gated_delta_rule(
253
+ q=q,
254
+ k=k,
255
+ v=v,
256
+ g=g,
257
+ beta=beta,
258
+ initial_state=recurrent_state,
259
+ output_final_state=use_cache,
260
+ cu_seqlens=cu_seqlens,
261
+ head_first=False,
262
+ use_qk_l2norm_in_kernel=True
263
+ )
264
+ elif mode == 'fused_recurrent':
265
+ o, recurrent_state = fused_recurrent_gated_delta_rule(
266
+ q=q,
267
+ k=k,
268
+ v=v,
269
+ g=g,
270
+ beta=beta,
271
+ initial_state=recurrent_state,
272
+ output_final_state=use_cache,
273
+ cu_seqlens=cu_seqlens,
274
+ head_first=False,
275
+ use_qk_l2norm_in_kernel=True
276
+ )
277
+ if past_key_values is not None:
278
+ past_key_values.update(
279
+ recurrent_state=recurrent_state,
280
+ conv_state=(conv_state_q, conv_state_k, conv_state_v) if self.use_short_conv else None,
281
+ layer_idx=self.layer_idx,
282
+ offset=q.shape[1]
283
+ )
284
+
285
+ if self.use_gate:
286
+ g = rearrange(self.g_proj(hidden_states), '... (h d) -> ... h d', d=self.head_v_dim)
287
+ o = self.o_norm(o, g)
288
+ else:
289
+ o = self.o_norm(o)
290
+ o = rearrange(o, 'b t h d -> b t (h d)')
291
+ o = self.o_proj(o)
292
+
293
+ return o, None, past_key_values
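A minimal usage sketch of the GatedDeltaNet layer above, for readers skimming the diff. It assumes the `fla` package from this repo is importable and that a CUDA device with Triton is available (the chunk/fused kernels are Triton-based); sizes follow the defaults in `__init__`:

```python
import torch
from fla.layers.gated_deltanet import GatedDeltaNet

# Sketch only: hidden_size=2048, head_dim=256, num_heads=6 match the defaults above.
layer = GatedDeltaNet(hidden_size=2048, head_dim=256, num_heads=6, layer_idx=0)
layer = layer.to(device='cuda', dtype=torch.bfloat16)

x = torch.randn(2, 128, 2048, device='cuda', dtype=torch.bfloat16)
o, _, _ = layer(x)      # forward returns (output, None, past_key_values)
print(o.shape)          # torch.Size([2, 128, 2048])
```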
fla/layers/gated_deltaproduct.py ADDED
@@ -0,0 +1,351 @@
1
+ from __future__ import annotations
2
+
3
+ import math
4
+ from typing import TYPE_CHECKING, Dict, Optional, Tuple
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from einops import rearrange
10
+
11
+ from fla.modules import FusedRMSNormSwishGate, RMSNorm, ShortConvolution
12
+ from fla.ops.delta_rule import chunk_delta_rule
13
+ from fla.ops.gated_delta_rule import chunk_gated_delta_rule
14
+
15
+ if TYPE_CHECKING:
16
+ from transformers.processing_utils import Unpack
17
+
18
+ from fla.models.utils import Cache
19
+
20
+
21
+ def elu_p1(x):
22
+ return (F.elu(x, 1.0, False) + 1.0).to(x)
23
+
24
+
25
+ def sum_norm(x):
26
+ return (x / x.sum(-1, keepdim=True)).to(x)
27
+
28
+
29
+ def interleave_multiple_sequences(*sequences):
30
+ """
31
+ Interleave multiple sequences together.
32
+ For example, with sequences [A1, A2], [B1, B2], [C1, C2],
33
+ returns [A1, B1, C1, A2, B2, C2]
34
+ """
35
+ if isinstance(sequences[0], (list, tuple)):
36
+ sequences = sequences[0]
37
+
38
+ if len(sequences) == 1:
39
+ return sequences[0]
40
+
41
+ # All sequences should have the same shape
42
+ assert all(s.shape == sequences[0].shape for s in sequences)
43
+
44
+ # Get the original shape
45
+ batch_size, seq_len, *rest = sequences[0].shape
46
+
47
+ # Stack sequences along a new dimension
48
+ stacked = torch.stack(sequences, dim=2)
49
+
50
+ # Reshape to interleave
51
+ reshaped = stacked.view(batch_size, seq_len * len(sequences), *rest)
52
+
53
+ return reshaped
54
+
55
+
56
+ class GatedDeltaProduct(nn.Module):
57
+ """
58
+ Generalized version of GatedDoubleDeltaNet that supports an arbitrary number of Householder transformations.
59
+ """
60
+
61
+ def __init__(
62
+ self,
63
+ hidden_size: int = 2048,
64
+ expand_v: float = 2,
65
+ head_dim: int = 256,
66
+ num_heads: int = 6,
67
+ num_householder: int = 2, # New parameter for number of householder transformations
68
+ mode: str = "chunk",
69
+ use_gate: bool = True,
70
+ use_forget_gate: bool = True, # when true Gated DeltaProduct, when false DeltaProduct
71
+ use_short_conv: bool = True,
72
+ conv_size: int = 4,
73
+ conv_bias: bool = False,
74
+ layer_idx: int | None = None,
75
+ norm_eps: float = 1e-5,
76
+ allow_neg_eigval: bool = False, # when true (Gated) DeltaProduct [-1, 1], when false (Gated) DeltaProduct [0, 1]
77
+ **kwargs,
78
+ ) -> None:
79
+ super().__init__()
80
+
81
+ self.mode = mode
82
+ self.hidden_size = hidden_size
83
+ self.expand_v = expand_v
84
+ self.use_gate = use_gate
85
+ self.use_short_conv = use_short_conv
86
+ self.conv_size = conv_size
87
+ self.conv_bias = conv_bias
88
+ self.head_dim = head_dim
89
+ self.num_heads = num_heads
90
+ self.num_householder = num_householder
91
+ self.allow_neg_eigval = allow_neg_eigval
92
+ self.use_forget_gate = use_forget_gate
93
+ self.key_dim = self.num_heads * self.head_dim
94
+ self.value_dim = int(self.key_dim * self.expand_v)
95
+ self.head_qk_dim = head_dim
96
+ self.head_v_dim = int(head_dim * self.expand_v)
97
+ self.layer_idx = layer_idx
98
+ self.silu = nn.SiLU()
99
+ assert mode in ["chunk", "fused_recurrent"], f"Not supported mode `{mode}`."
100
+ # Create multiple projection layers for each householder transformation
101
+ self.q_proj = nn.Linear(hidden_size, self.key_dim, bias=False)
102
+
103
+ self.k_projs = nn.ModuleList(
104
+ [
105
+ nn.Linear(hidden_size, self.key_dim, bias=False)
106
+ for _ in range(num_householder)
107
+ ]
108
+ )
109
+ self.v_projs = nn.ModuleList(
110
+ [
111
+ nn.Linear(hidden_size, self.value_dim, bias=False)
112
+ for _ in range(num_householder)
113
+ ]
114
+ )
115
+ self.b_projs = nn.ModuleList(
116
+ [
117
+ nn.Linear(hidden_size, self.num_heads, bias=False)
118
+ for _ in range(num_householder)
119
+ ]
120
+ )
121
+ if use_short_conv:
122
+ self.q_conv1ds = nn.ModuleList(
123
+ [
124
+ ShortConvolution(
125
+ hidden_size=self.key_dim,
126
+ kernel_size=conv_size,
127
+ activation="silu",
128
+ )
129
+ for _ in range(num_householder)
130
+ ]
131
+ )
132
+ self.k_conv1ds = nn.ModuleList(
133
+ [
134
+ ShortConvolution(
135
+ hidden_size=self.key_dim,
136
+ kernel_size=conv_size,
137
+ activation="silu",
138
+ )
139
+ for _ in range(num_householder)
140
+ ]
141
+ )
142
+ self.v_conv1ds = nn.ModuleList(
143
+ [
144
+ ShortConvolution(
145
+ hidden_size=self.value_dim,
146
+ kernel_size=conv_size,
147
+ activation="silu",
148
+ )
149
+ for _ in range(num_householder)
150
+ ]
151
+ )
152
+
153
+ if self.use_forget_gate:
154
+ self.a_proj = nn.Linear(hidden_size, self.num_heads, bias=False)
155
+ A = torch.empty(self.num_heads, dtype=torch.float32).uniform_(0, 16)
156
+ A_log = torch.log(A)
157
+ self.A_log = nn.Parameter(A_log)
158
+ self.A_log._no_weight_decay = True
159
+
160
+ # Initialize dt parameters
161
+ dt_min = 0.001
162
+ dt_max = 0.1
163
+ dt_init_floor = 1e-4
164
+ dt = torch.exp(
165
+ torch.rand(self.num_heads) * (math.log(dt_max) - math.log(dt_min))
166
+ + math.log(dt_min)
167
+ )
168
+ dt = torch.clamp(dt, min=dt_init_floor)
169
+ inv_dt = dt + torch.log(-torch.expm1(-dt))
170
+ self.dt_bias = nn.Parameter(inv_dt)
171
+ self.dt_bias._no_weight_decay = True
172
+
173
+ if use_gate:
174
+ self.g_proj = nn.Linear(hidden_size, self.value_dim, bias=False)
175
+ self.o_norm = FusedRMSNormSwishGate(self.head_v_dim, eps=norm_eps)
176
+ else:
177
+ self.o_norm = RMSNorm(self.head_v_dim, eps=norm_eps)
178
+ self.o_proj = nn.Linear(self.value_dim, hidden_size, bias=False)
179
+ self.k_id = torch.nn.Identity()
180
+ self.apply(self._initialize_weights)
181
+
182
+ def _initialize_weights(self, module: nn.Module):
183
+ if getattr(module, "_is_hf_initialized", False):
184
+ return
185
+ if isinstance(module, nn.Linear):
186
+ nn.init.xavier_uniform_(module.weight, gain=2 ** -2.5)
187
+ if module.bias is not None:
188
+ nn.init.zeros_(module.bias)
189
+ module._is_hf_initialized = True
190
+
191
+ def forward(
192
+ self,
193
+ hidden_states: torch.Tensor,
194
+ attention_mask: Optional[torch.Tensor] = None,
195
+ past_key_values: Optional[Cache] = None,
196
+ use_cache: Optional[bool] = False,
197
+ output_attentions: Optional[bool] = False,
198
+ **kwargs: Unpack[Dict],
199
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
200
+ if attention_mask is not None:
201
+ assert len(attention_mask.shape) == 2, (
202
+ "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
203
+ "for padding purposes (0 indicating padding)."
204
+ )
205
+
206
+ mode = (
207
+ "chunk" # 'fused_recurrent' if hidden_states.shape[1] <= 64 else self.mode
208
+ )
209
+ if self.training:
210
+ assert mode == "chunk", "Only chunk mode is supported in training."
211
+
212
+ last_state = None
213
+ if past_key_values is not None and len(past_key_values) > self.layer_idx:
214
+ last_state = past_key_values[self.layer_idx]
215
+
216
+ # Process each householder transformation
217
+ ks, vs, betas = [], [], []
218
+ conv_states = []
219
+
220
+ for i in range(self.num_householder):
221
+ if self.use_short_conv:
222
+ conv_state_q, conv_state_k, conv_state_v = None, None, None
223
+ if last_state is not None:
224
+ conv_state_q, conv_state_k, conv_state_v = last_state["conv_state"][
225
+ i
226
+ ]
227
+ conv_mask = (
228
+ attention_mask[:, -hidden_states.shape[1]:]
229
+ if attention_mask is not None
230
+ else None
231
+ )
232
+
233
+ k, conv_state_k = self.k_conv1ds[i](
234
+ x=self.k_projs[i](hidden_states),
235
+ mask=conv_mask,
236
+ cache=conv_state_k,
237
+ output_final_state=use_cache,
238
+ )
239
+ v, conv_state_v = self.v_conv1ds[i](
240
+ x=self.v_projs[i](hidden_states),
241
+ mask=conv_mask,
242
+ cache=conv_state_v,
243
+ output_final_state=use_cache,
244
+ )
245
+ conv_states.append((conv_state_q, conv_state_k, conv_state_v))
246
+ else:
247
+ k = self.silu(self.k_projs[i](hidden_states))
248
+ v = self.silu(self.v_projs[i](hidden_states))
249
+
250
+ ks.append(k)
251
+ vs.append(v)
252
+
253
+ beta = self.b_projs[i](
254
+ hidden_states
255
+ ).sigmoid() # bs, sequence_length, num_heads
256
+ if attention_mask is not None:
257
+ beta = beta.mul(attention_mask[:, -hidden_states.shape[1]:, None])
258
+ if self.allow_neg_eigval:
259
+ beta = beta * 2
260
+ betas.append(beta)
261
+
262
+ if self.use_short_conv:
263
+ q, conv_state_q = self.q_conv1ds[0](
264
+ x=self.q_proj(hidden_states),
265
+ mask=conv_mask,
266
+ cache=conv_state_q,
267
+ output_final_state=use_cache,
268
+ )
269
+ else:
270
+ q = self.silu(self.q_proj(hidden_states))
271
+ q = interleave_multiple_sequences(
272
+ [torch.zeros_like(q)] * (self.num_householder - 1) + [q]
273
+ )
274
+ # Interleave all sequences
275
+ k = interleave_multiple_sequences(ks)
276
+ v = interleave_multiple_sequences(vs)
277
+ beta = interleave_multiple_sequences(betas)
278
+
279
+ q, k, v = (
280
+ rearrange(x, "b t (h d) -> b t h d", h=self.num_heads) for x in (q, k, v)
281
+ )
282
+
283
+ recurrent_state = (
284
+ last_state["recurrent_state"] if last_state is not None else None
285
+ )
286
+ offsets = kwargs.get("offsets")
287
+
288
+ if mode == "chunk":
289
+ if self.use_forget_gate:
290
+ g = -self.A_log.float().exp() * F.softplus(
291
+ self.a_proj(hidden_states).float() + self.dt_bias
292
+ )
293
+ if attention_mask is not None:
294
+ g = g.mul(attention_mask[:, -g.shape[-2]:, None])
295
+
296
+ # Interleave g with zeros for non-first transformations
297
+ g = interleave_multiple_sequences(
298
+ [g] + [torch.zeros_like(g)] * (self.num_householder - 1)
299
+ )
300
+
301
+ o, recurrent_state = chunk_gated_delta_rule(
302
+ q=q,
303
+ k=k,
304
+ v=v,
305
+ g=g,
306
+ beta=beta,
307
+ initial_state=recurrent_state,
308
+ output_final_state=use_cache,
309
+ cu_seqlens=offsets,
310
+ head_first=False,
311
+ use_qk_l2norm_in_kernel=True
312
+ )
313
+ else:
314
+ o, recurrent_state = chunk_delta_rule(
315
+ q=q,
316
+ k=k,
317
+ v=v,
318
+ beta=beta,
319
+ initial_state=recurrent_state,
320
+ output_final_state=use_cache,
321
+ cu_seqlens=offsets,
322
+ head_first=False,
323
+ use_qk_l2norm_in_kernel=True
324
+ )
325
+ else:
326
+ raise NotImplementedError(f"Not supported mode `{mode}`.")
327
+
328
+ # Take every nth element for n householder transformations
329
+ o = o[:, self.num_householder - 1:: self.num_householder, :]
330
+
331
+ if past_key_values is not None:
332
+ past_key_values.update(
333
+ recurrent_state=recurrent_state,
334
+ conv_state=conv_states if self.use_short_conv else None,
335
+ layer_idx=self.layer_idx,
336
+ offset=q.shape[2],
337
+ )
338
+
339
+ if self.use_gate:
340
+ g = rearrange(
341
+ self.g_proj(hidden_states),
342
+ "... (h d) -> ... h d",
343
+ h=self.num_heads,
344
+ )
345
+ o = self.o_norm(o, g)
346
+ else:
347
+ o = self.o_norm(o)
348
+ o = rearrange(o, "b t h d -> b t (h d)")
349
+ o = self.o_proj(o)
350
+
351
+ return o, None, past_key_values
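To make the interleaving scheme above concrete, here is a small sketch of `interleave_multiple_sequences` on toy tensors (the values are arbitrary and chosen only for readability):

```python
import torch
from fla.layers.gated_deltaproduct import interleave_multiple_sequences

# Two "sequences" of shape [batch=1, seq_len=2, dim=1]: (A1, A2) and (B1, B2).
a = torch.tensor([[[1.], [2.]]])
b = torch.tensor([[[10.], [20.]]])

out = interleave_multiple_sequences(a, b)
print(out.squeeze(-1))  # tensor([[ 1., 10.,  2., 20.]]) -> A1, B1, A2, B2
```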
fla/layers/gsa.py ADDED
@@ -0,0 +1,227 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from __future__ import annotations
5
+
6
+ import warnings
7
+ from typing import TYPE_CHECKING, Dict, Optional, Tuple
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ from einops import rearrange
13
+
14
+ from fla.modules import RMSNorm, ShortConvolution
15
+ from fla.modules.feature_map import ReLUFeatureMap, SwishFeatureMap, T2RFeatureMap
16
+ from fla.modules.layernorm import rms_norm_linear
17
+ from fla.ops.gsa import chunk_gsa, fused_recurrent_gsa
18
+
19
+ if TYPE_CHECKING:
20
+ from transformers.processing_utils import Unpack
21
+
22
+ from fla.models.utils import Cache
23
+
24
+
25
+ class GatedSlotAttention(nn.Module):
26
+
27
+ def __init__(
28
+ self,
29
+ mode: str = 'chunk',
30
+ hidden_size: int = 1024,
31
+ expand_k: float = 1.,
32
+ expand_v: float = 1.,
33
+ num_heads: int = 4,
34
+ num_kv_heads: Optional[int] = None,
35
+ use_short_conv: bool = False,
36
+ conv_size: int = 4,
37
+ conv_bias: bool = False,
38
+ num_slots: Optional[int] = None,
39
+ elementwise_affine: Optional[bool] = True,
40
+ norm_eps: float = 1e-5,
41
+ gate_logit_normalizer: int = 8,
42
+ feature_map: str = 'swish',
43
+ use_output_gate: bool = False,
44
+ use_norm: bool = True,
45
+ layer_idx: Optional[int] = None,
46
+ scale: Optional[float] = 1.,
47
+ **kwargs
48
+ ) -> GatedSlotAttention:
49
+ super().__init__()
50
+
51
+ self.mode = mode
52
+ self.hidden_size = hidden_size
53
+ self.expand_k = expand_k
54
+ self.expand_v = expand_v
55
+ self.num_heads = num_heads
56
+ self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
57
+ self.num_kv_groups = self.num_heads // self.num_kv_heads
58
+ self.key_dim = int(hidden_size * expand_k)
59
+ self.value_dim = int(hidden_size * expand_v)
60
+ self.key_dim_per_group = self.key_dim // self.num_kv_groups
61
+ self.value_dim_per_group = self.value_dim // self.num_kv_groups
62
+ self.head_k_dim = self.key_dim // self.num_heads
63
+ self.head_v_dim = self.value_dim // self.num_heads
64
+
65
+ self.use_short_conv = use_short_conv
66
+ self.conv_size = conv_size
67
+ self.conv_bias = conv_bias
68
+
69
+ self.gate_logit_normalizer = gate_logit_normalizer
70
+
71
+ self.use_output_gate = use_output_gate
72
+ self.use_norm = use_norm
73
+ self.scale = scale
74
+
75
+ if num_slots is None:
76
+ num_slots = self.head_k_dim
77
+ self.num_slots = num_slots
78
+
79
+ self.layer_idx = layer_idx
80
+
81
+ if layer_idx is None:
82
+ warnings.warn(
83
+ f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
84
+ "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
85
+ "when creating this class."
86
+ )
87
+
88
+ self.register_module('feature_map', None)
89
+ if feature_map == 'swish':
90
+ self.feature_map = SwishFeatureMap()
91
+ elif feature_map == 'relu':
92
+ self.feature_map = ReLUFeatureMap()
93
+ elif feature_map == 't2r':
94
+ self.feature_map = T2RFeatureMap(self.head_k_dim, self.head_k_dim)
95
+ else:
96
+ raise NotImplementedError(f"Feature map `{feature_map}` is not supported now.")
97
+
98
+ self.q_proj = nn.Linear(self.hidden_size, self.key_dim, bias=False)
99
+ self.k_proj = nn.Linear(self.hidden_size, self.key_dim_per_group, bias=False)
100
+ self.v_proj = nn.Linear(self.hidden_size, self.value_dim_per_group, bias=False)
101
+ self.f_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.num_slots, bias=False)
102
+
103
+ if use_short_conv:
104
+ self.conv_size = conv_size
105
+ self.q_conv1d = ShortConvolution(self.key_dim, conv_size, activation='silu')
106
+ self.k_conv1d = ShortConvolution(self.key_dim_per_group, conv_size, activation='silu')
107
+ self.v_conv1d = ShortConvolution(self.value_dim_per_group, conv_size, activation='silu')
108
+
109
+ self.g_norm = RMSNorm(self.hidden_size, elementwise_affine, eps=norm_eps)
110
+ self.o_proj = nn.Linear(self.value_dim, self.hidden_size, bias=False)
111
+
112
+ def forward(
113
+ self,
114
+ hidden_states: torch.Tensor,
115
+ attention_mask: Optional[torch.Tensor] = None,
116
+ past_key_values: Optional[Cache] = None,
117
+ use_cache: Optional[bool] = False,
118
+ output_attentions: Optional[bool] = False,
119
+ **kwargs: Unpack[Dict]
120
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
121
+ if attention_mask is not None:
122
+ assert len(attention_mask.shape) == 2, (
123
+ "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
124
+ "for padding purposes (0 indicating padding). "
125
+ "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
126
+ )
127
+
128
+ # launching the triton kernel for just one token will actually be slower
129
+ mode = 'fused_recurrent' if hidden_states.shape[1] <= 64 else self.mode
130
+
131
+ last_state = None
132
+ if past_key_values is not None and len(past_key_values) > self.layer_idx:
133
+ last_state = past_key_values[self.layer_idx]
134
+
135
+ cu_seqlens = kwargs.get('cu_seqlens', None)
136
+ if self.use_short_conv:
137
+ conv_state_q, conv_state_k, conv_state_v = None, None, None
138
+ if last_state is not None:
139
+ conv_state_q, conv_state_k, conv_state_v = last_state['conv_state']
140
+ conv_mask = attention_mask[:, -hidden_states.shape[1]:] if attention_mask is not None else None
141
+ q, conv_state_q = self.q_conv1d(
142
+ x=self.q_proj(hidden_states),
143
+ mask=conv_mask,
144
+ cache=conv_state_q,
145
+ output_final_state=use_cache,
146
+ cu_seqlens=cu_seqlens
147
+ )
148
+ k, conv_state_k = self.k_conv1d(
149
+ x=self.k_proj(hidden_states),
150
+ mask=conv_mask,
151
+ cache=conv_state_k,
152
+ output_final_state=use_cache,
153
+ cu_seqlens=cu_seqlens
154
+ )
155
+ v, conv_state_v = self.v_conv1d(
156
+ x=self.v_proj(hidden_states),
157
+ mask=conv_mask,
158
+ cache=conv_state_v,
159
+ output_final_state=use_cache,
160
+ cu_seqlens=cu_seqlens
161
+ )
162
+ else:
163
+ q = self.q_proj(hidden_states)
164
+ k = self.k_proj(hidden_states)
165
+ v = self.v_proj(hidden_states)
166
+ f = self.f_proj(hidden_states)
167
+
168
+ q = rearrange(q, 'b t (h d) -> b t h d', d=self.head_k_dim)
169
+ k = rearrange(k, 'b t (h d) -> b t h d', d=self.head_k_dim)
170
+ v = rearrange(v, 'b t (h d) -> b t h d', d=self.head_v_dim)
171
+ f = rearrange(f, 'b t (h m) -> b t h m', m=self.num_slots)
172
+
173
+ if self.feature_map is not None:
174
+ q, k = map(lambda x: self.feature_map(x), (q, k))
175
+ v = F.silu(v)
176
+
177
+ f = F.logsigmoid(f) / self.gate_logit_normalizer
178
+ s = (1 - f.exp()).to(f.dtype)
179
+ # dealing with left-padding
180
+ if attention_mask is not None:
181
+ s = s.mul_(attention_mask[:, -s.shape[1]:, None, None])
182
+ v = v.mul_(attention_mask[:, -v.shape[1]:, None, None])
183
+
184
+ recurrent_state = last_state['recurrent_state'] if last_state is not None else None
185
+ if mode == 'fused_recurrent':
186
+ o, recurrent_state = fused_recurrent_gsa(
187
+ q=q,
188
+ k=k,
189
+ v=v,
190
+ s=s,
191
+ g=f,
192
+ initial_state=recurrent_state,
193
+ output_final_state=use_cache,
194
+ scale=self.scale,
195
+ cu_seqlens=cu_seqlens,
196
+ head_first=False
197
+ )
198
+ elif mode == 'chunk':
199
+ o, recurrent_state = chunk_gsa(
200
+ q=q,
201
+ k=k,
202
+ v=v,
203
+ s=s,
204
+ g=f,
205
+ initial_state=recurrent_state,
206
+ output_final_state=use_cache,
207
+ scale=self.scale,
208
+ cu_seqlens=cu_seqlens,
209
+ head_first=False
210
+ )
211
+ else:
212
+ raise NotImplementedError(f"Not supported mode `{mode}`.")
213
+
214
+ if past_key_values is not None:
215
+ past_key_values.update(
216
+ recurrent_state=recurrent_state,
217
+ conv_state=(conv_state_q, conv_state_k, conv_state_v) if self.use_short_conv else None,
218
+ layer_idx=self.layer_idx,
219
+ offset=q.shape[1]
220
+ )
221
+
222
+ o = rearrange(o, 'b t h d -> b t (h d)')
223
+ o = rms_norm_linear(F.silu(o), self.g_norm.weight, self.g_norm.bias, self.o_proj.weight, self.o_proj.bias)
224
+ return o, None, past_key_values
225
+
226
+ def state_size(self, *args, **kwargs) -> int:
227
+ return 2 * self.num_slots * self.hidden_size
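The slot gate in the forward pass above is computed as `f = logsigmoid(f) / gate_logit_normalizer` followed by `s = 1 - f.exp()`, i.e. s = 1 - sigmoid(x)^(1/tau) with tau the normalizer. A tiny numerical check (a sketch with made-up logits, not part of the layer):

```python
import torch
import torch.nn.functional as F

x = torch.tensor([-2.0, 0.0, 2.0])   # raw gate logits
tau = 8                              # gate_logit_normalizer

f = F.logsigmoid(x) / tau
s = 1 - f.exp()
print(torch.allclose(s, 1 - torch.sigmoid(x) ** (1 / tau)))  # True
```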
fla/layers/hgrn.py ADDED
@@ -0,0 +1,168 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ # "Hierarchically Gated Recurrent Neural Network for Sequence Modeling" [https://arxiv.org/abs/2311.04823]
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import TYPE_CHECKING, Dict, Optional, Tuple
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+
14
+ from fla.modules import FusedRMSNormGated, ShortConvolution
15
+ from fla.modules.activations import swiglu
16
+ from fla.ops.hgrn import chunk_hgrn, fused_recurrent_hgrn
17
+
18
+ if TYPE_CHECKING:
19
+ from transformers.processing_utils import Unpack
20
+
21
+ from fla.models.utils import Cache
22
+
23
+
24
+ class HGRNAttention(nn.Module):
25
+
26
+ def __init__(
27
+ self,
28
+ mode: str = 'chunk',
29
+ hidden_size: int = 1024,
30
+ expand_ratio: Optional[int] = 1,
31
+ use_short_conv: bool = False,
32
+ conv_size: int = 4,
33
+ conv_bias: bool = False,
34
+ elementwise_affine: Optional[bool] = True,
35
+ norm_eps: float = 1e-5,
36
+ layer_idx: int = None
37
+ ) -> HGRNAttention:
38
+ super().__init__()
39
+
40
+ self.mode = mode
41
+ self.hidden_size = hidden_size
42
+ self.expand_ratio = expand_ratio
43
+ self.input_dim = int(hidden_size * expand_ratio)
44
+
45
+ self.use_short_conv = use_short_conv
46
+ self.conv_size = conv_size
47
+ self.conv_bias = conv_bias
48
+
49
+ self.layer_idx = layer_idx
50
+
51
+ assert mode in ['chunk', 'fused_recurrent'], f"Not supported mode `{mode}`."
52
+
53
+ self.i_proj = nn.Linear(hidden_size, self.input_dim, bias=False)
54
+ self.f_proj = nn.Linear(hidden_size, self.input_dim, bias=False)
55
+ self.g_proj = nn.Linear(hidden_size, self.input_dim, bias=False)
56
+
57
+ if use_short_conv:
58
+ self.conv_size = conv_size
59
+ self.q_conv1d = ShortConvolution(self.input_dim, conv_size, activation=None)
60
+ self.f_conv1d = ShortConvolution(self.input_dim, conv_size, activation=None)
61
+ self.i_conv1d = ShortConvolution(self.input_dim, conv_size, activation=None)
62
+
63
+ self.g_norm = FusedRMSNormGated(
64
+ hidden_size=self.input_dim,
65
+ elementwise_affine=elementwise_affine,
66
+ eps=norm_eps
67
+ )
68
+ self.o_proj = nn.Linear(self.input_dim, hidden_size, bias=False)
69
+
70
+ def forward(
71
+ self,
72
+ hidden_states: torch.Tensor,
73
+ attention_mask: Optional[torch.Tensor] = None,
74
+ past_key_values: Optional[Cache] = None,
75
+ use_cache: Optional[bool] = False,
76
+ output_attentions: Optional[bool] = False,
77
+ lower_bound: Optional[torch.Tensor] = None,
78
+ **kwargs: Unpack[Dict]
79
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
80
+ if attention_mask is not None:
81
+ assert len(attention_mask.shape) == 2, (
82
+ "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
83
+ "for padding purposes (0 indicating padding). "
84
+ "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
85
+ )
86
+
87
+ # launching the triton kernel for just one token will actually be slower
88
+ mode = 'fused_recurrent' if not self.training and hidden_states.shape[1] <= 64 else self.mode
89
+
90
+ last_state = None
91
+ if past_key_values is not None and len(past_key_values) > self.layer_idx:
92
+ last_state = past_key_values[self.layer_idx]
93
+
94
+ cu_seqlens = kwargs.get('cu_seqlens', None)
95
+ if self.use_short_conv:
96
+ conv_state_i, conv_state_f = None, None
97
+ if last_state is not None:
98
+ conv_state_i, conv_state_f = last_state['conv_state']
99
+ conv_mask = attention_mask[:, -hidden_states.shape[1]:] if attention_mask is not None else None
100
+ i, conv_state_i = self.i_conv1d(
101
+ x=self.i_proj(hidden_states),
102
+ mask=conv_mask,
103
+ cache=conv_state_i,
104
+ output_final_state=use_cache,
105
+ cu_seqlens=cu_seqlens
106
+ )
107
+ f, conv_state_f = self.f_conv1d(
108
+ x=self.f_proj(hidden_states),
109
+ mask=conv_mask,
110
+ cache=conv_state_f,
111
+ output_final_state=use_cache,
112
+ cu_seqlens=cu_seqlens
113
+ )
114
+ else:
115
+ i = self.i_proj(hidden_states)
116
+ f = self.f_proj(hidden_states)
117
+
118
+ # the lower bound for the first layer is zero
119
+ if lower_bound is None or self.layer_idx == 0:
120
+ i, f = swiglu(i, 1 - f.sigmoid()), F.logsigmoid(f)
121
+ else:
122
+ g = lower_bound + (1 - lower_bound) * f.sigmoid()
123
+ i, f = swiglu(i, 1 - g), g.log()
124
+
125
+ # dealing with left-padding
126
+ if attention_mask is not None:
127
+ i = i.mul_(attention_mask[:, -i.shape[-2]:, None])
128
+
129
+ recurrent_state = last_state['recurrent_state'] if last_state is not None else None
130
+ if mode == 'chunk':
131
+ if cu_seqlens is not None:
132
+ raise NotImplementedError("Chunk mode does not support variable-length sequences.")
133
+ o, recurrent_state = chunk_hgrn(
134
+ x=i,
135
+ g=f,
136
+ initial_state=recurrent_state,
137
+ output_final_state=use_cache,
138
+ )
139
+ elif mode == 'fused_recurrent':
140
+ o, recurrent_state = fused_recurrent_hgrn(
141
+ x=i,
142
+ g=f,
143
+ initial_state=recurrent_state,
144
+ output_final_state=use_cache,
145
+ cu_seqlens=cu_seqlens
146
+ )
147
+ else:
148
+ raise NotImplementedError(f"Not supported mode `{mode}`.")
149
+
150
+ if past_key_values is not None:
151
+ past_key_values.update(
152
+ recurrent_state=recurrent_state,
153
+ conv_state=(conv_state_i, conv_state_f) if self.use_short_conv else None,
154
+ layer_idx=self.layer_idx,
155
+ offset=i.shape[2]
156
+ )
157
+
158
+ o = self.g_norm(o, self.g_proj(hidden_states))
159
+ o = self.o_proj(o)
160
+
161
+ return o, None, past_key_values
162
+
163
+ def state_size(self, **kwargs) -> int:
164
+ state_size = self.hidden_size
165
+ for module in self.children():
166
+ if isinstance(module, ShortConvolution):
167
+ state_size += module.state_size
168
+ return state_size
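The `lower_bound` branch in the forward pass above rescales the forget gate into `[lower_bound, 1)` via `g = lower_bound + (1 - lower_bound) * f.sigmoid()`. A quick numerical sketch; the bound value here is hypothetical (in the model it is supplied per layer):

```python
import torch

lower_bound = torch.tensor(0.3)       # hypothetical per-layer lower bound
f = torch.tensor([-5.0, 0.0, 5.0])    # raw forget-gate logits

g = lower_bound + (1 - lower_bound) * f.sigmoid()
print(g)  # ~tensor([0.3047, 0.6500, 0.9953]); always within [0.3, 1)
```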
fla/layers/hgrn2.py ADDED
@@ -0,0 +1,211 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ # "HGRN2: Gated Linear RNNs with State Expansion"[https://arxiv.org/abs/2404.07904]
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import TYPE_CHECKING, Dict, Optional, Tuple
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ from einops import rearrange
14
+
15
+ from fla.modules import RMSNorm, ShortConvolution
16
+ from fla.modules.activations import swish
17
+ from fla.modules.layernorm import rms_norm_linear
18
+ from fla.ops.gla import chunk_gla, fused_chunk_gla, fused_recurrent_gla
19
+
20
+ if TYPE_CHECKING:
21
+ from transformers.processing_utils import Unpack
22
+
23
+ from fla.models.utils import Cache
24
+
25
+
26
+ class HGRN2Attention(nn.Module):
27
+
28
+ def __init__(
29
+ self,
30
+ mode: str = 'chunk',
31
+ hidden_size: int = 1024,
32
+ num_heads: Optional[int] = None,
33
+ expand_ratio: Optional[int] = 128,
34
+ use_short_conv: bool = False,
35
+ conv_size: int = 4,
36
+ conv_bias: bool = False,
37
+ elementwise_affine: Optional[bool] = True,
38
+ norm_eps: float = 1e-5,
39
+ layer_idx: int = None
40
+ ) -> HGRN2Attention:
41
+ super().__init__()
42
+
43
+ self.mode = mode
44
+ self.hidden_size = hidden_size
45
+
46
+ if expand_ratio is None and num_heads is not None:
47
+ expand_ratio = hidden_size // num_heads
48
+ elif expand_ratio is not None and num_heads is None:
49
+ num_heads = hidden_size // expand_ratio
50
+ elif expand_ratio is None and num_heads is None:
51
+ raise RuntimeError("One of `expand_ratio` or `num_heads` should be provided.")
52
+ self.num_heads = num_heads
53
+ self.expand_ratio = expand_ratio
54
+
55
+ self.use_short_conv = use_short_conv
56
+ self.conv_size = conv_size
57
+ self.conv_bias = conv_bias
58
+
59
+ self.forget_dim = int(self.num_heads * self.expand_ratio)
60
+ self.input_dim = hidden_size
61
+ self.layer_idx = layer_idx
62
+
63
+ assert mode in ['chunk', 'fused_recurrent', 'fused_chunk'], f"Not supported mode `{mode}`."
64
+ assert self.forget_dim % num_heads == 0, f"forget dim must be divisible by num_heads of {num_heads}"
65
+ assert self.input_dim % num_heads == 0, f"input dim must be divisible by num_heads of {num_heads}"
66
+
67
+ self.head_f_dim = self.expand_ratio
68
+ self.head_i_dim = self.hidden_size // num_heads
69
+
70
+ self.q_proj = nn.Linear(hidden_size, self.forget_dim, bias=False)
71
+ self.f_proj = nn.Linear(hidden_size, self.forget_dim, bias=False)
72
+ self.i_proj = nn.Linear(hidden_size, self.input_dim, bias=False)
73
+
74
+ if use_short_conv:
75
+ self.conv_size = conv_size
76
+ self.q_conv1d = ShortConvolution(self.forget_dim, conv_size, activation=None)
77
+ self.f_conv1d = ShortConvolution(self.forget_dim, conv_size, activation=None)
78
+ self.i_conv1d = ShortConvolution(self.input_dim, conv_size, activation=None)
79
+
80
+ self.g_norm = RMSNorm(hidden_size=self.hidden_size, elementwise_affine=elementwise_affine, eps=norm_eps)
81
+ self.o_proj = nn.Linear(self.input_dim, hidden_size, bias=False)
82
+
83
+ def forward(
84
+ self,
85
+ hidden_states: torch.Tensor,
86
+ attention_mask: Optional[torch.Tensor] = None,
87
+ past_key_values: Optional[Cache] = None,
88
+ use_cache: Optional[bool] = False,
89
+ output_attentions: Optional[bool] = False,
90
+ lower_bound: Optional[torch.Tensor] = None,
91
+ **kwargs: Unpack[Dict]
92
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
93
+ if attention_mask is not None:
94
+ assert len(attention_mask.shape) == 2, (
95
+ "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
96
+ "for padding purposes (0 indicating padding). "
97
+ "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
98
+ )
99
+
100
+ # launching the triton kernel for just one token will actually be slower
101
+ mode = 'fused_recurrent' if hidden_states.shape[1] <= 64 else self.mode
102
+
103
+ last_state = None
104
+ if past_key_values is not None and len(past_key_values) > self.layer_idx:
105
+ last_state = past_key_values[self.layer_idx]
106
+
107
+ cu_seqlens = kwargs.get('cu_seqlens', None)
108
+ if self.use_short_conv:
109
+ conv_state_q, conv_state_f, conv_state_i = None, None, None
110
+ if last_state is not None:
111
+ conv_state_q, conv_state_f, conv_state_i = last_state['conv_state']
112
+ conv_mask = attention_mask[:, -hidden_states.shape[1]:] if attention_mask is not None else None
113
+ q, conv_state_q = self.q_conv1d(
114
+ x=self.q_proj(hidden_states),
115
+ mask=conv_mask,
116
+ cache=conv_state_q,
117
+ output_final_state=use_cache,
118
+ cu_seqlens=cu_seqlens
119
+ )
120
+ f, conv_state_f = self.f_conv1d(
121
+ x=self.f_proj(hidden_states),
122
+ mask=conv_mask,
123
+ cache=conv_state_f,
124
+ output_final_state=use_cache,
125
+ cu_seqlens=cu_seqlens
126
+ )
127
+ i, conv_state_i = self.i_conv1d(
128
+ x=self.i_proj(hidden_states),
129
+ mask=conv_mask,
130
+ cache=conv_state_i,
131
+ output_final_state=use_cache,
132
+ cu_seqlens=cu_seqlens
133
+ )
134
+ else:
135
+ q = self.q_proj(hidden_states)
136
+ f = self.f_proj(hidden_states)
137
+ i = self.i_proj(hidden_states)
138
+
139
+ # dealing with left-padding
140
+ if attention_mask is not None:
141
+ i = i.mul_(attention_mask[:, -i.shape[-2]:, None])
142
+
143
+ q = swish(q)
144
+
145
+ # improve precision
146
+ f = f.float()
147
+
148
+ # the lower bound for the first layer is zero
149
+ if lower_bound is None or self.layer_idx == 0:
150
+ k, g = 1 - f.sigmoid(), F.logsigmoid(f)
151
+ else:
152
+ g = lower_bound + (1 - lower_bound) * f.sigmoid()
153
+ k, g = 1 - g, g.log()
154
+
155
+ q, k, g = map(lambda x: rearrange(x, '... (h d) -> ... h d', d=self.head_f_dim), (q, k.to(i), g))
156
+ i = rearrange(i, '... (h d) -> ... h d', d=self.head_i_dim)
157
+
158
+ recurrent_state = last_state['recurrent_state'] if last_state is not None else None
159
+ if mode == 'fused_recurrent':
160
+ o, recurrent_state = fused_recurrent_gla(
161
+ q=q,
162
+ k=k,
163
+ v=i,
164
+ gk=g,
165
+ initial_state=recurrent_state,
166
+ output_final_state=use_cache,
167
+ cu_seqlens=cu_seqlens,
168
+ head_first=False
169
+ )
170
+ elif mode == 'fused_chunk':
171
+ o, recurrent_state = fused_chunk_gla(
172
+ q=q,
173
+ k=k,
174
+ v=i,
175
+ g=g,
176
+ initial_state=recurrent_state,
177
+ output_final_state=use_cache,
178
+ head_first=False
179
+ )
180
+ elif mode == 'chunk':
181
+ o, recurrent_state = chunk_gla(
182
+ q=q,
183
+ k=k,
184
+ v=i,
185
+ g=g,
186
+ initial_state=recurrent_state,
187
+ output_final_state=use_cache,
188
+ cu_seqlens=cu_seqlens,
189
+ head_first=False
190
+ )
191
+ else:
192
+ raise NotImplementedError(f"Not supported mode `{mode}`.")
193
+
194
+ if past_key_values is not None:
195
+ past_key_values.update(
196
+ recurrent_state=recurrent_state,
197
+ conv_state=(conv_state_q, conv_state_f, conv_state_i) if self.use_short_conv else None,
198
+ layer_idx=self.layer_idx,
199
+ offset=q.shape[1]
200
+ )
201
+
202
+ o = rearrange(o, '... h d -> ... (h d)')
203
+ o = rms_norm_linear(o, self.g_norm.weight, self.g_norm.bias, self.o_proj.weight, self.o_proj.bias)
204
+ return o, None, past_key_values
205
+
206
+ def state_size(self, **kwargs) -> int:
207
+ state_size = self.forget_dim * self.head_i_dim
208
+ for module in self.children():
209
+ if isinstance(module, ShortConvolution):
210
+ state_size += module.state_size
211
+ return state_size
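In the gating above, HGRN2 ties the input gate to the forget gate: with `g = logsigmoid(f)`, the key becomes `k = 1 - sigmoid(f) = 1 - exp(g)`. A one-line check on toy logits (sketch only):

```python
import torch
import torch.nn.functional as F

f = torch.tensor([-1.0, 0.5, 3.0])
k, g = 1 - f.sigmoid(), F.logsigmoid(f)
print(torch.allclose(k, 1 - g.exp()))  # True: input and forget gates are tied
```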
fla/layers/lightnet.py ADDED
@@ -0,0 +1,210 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ # ["You Only Scan Once: Efficient Multi-dimension Sequential Modeling with LightNet"](https://arxiv.org/abs/2405.21022)
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import TYPE_CHECKING, Dict, Optional, Tuple
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ from einops import rearrange
14
+
15
+ from fla.modules import FusedRMSNormGated, ShortConvolution
16
+ from fla.modules.fused_norm_gate import rms_norm_swish_gate_linear
17
+ from fla.ops.gla import chunk_gla, fused_recurrent_gla
18
+
19
+ if TYPE_CHECKING:
20
+ from transformers.processing_utils import Unpack
21
+
22
+ from fla.models.utils import Cache
23
+
24
+
25
+ class LightNetAttention(nn.Module):
26
+
27
+ def __init__(
28
+ self,
29
+ mode: str = 'chunk',
30
+ hidden_size: int = 1024,
31
+ num_heads: Optional[int] = None,
32
+ expand_ratio: Optional[int] = 128,
33
+ use_short_conv: bool = False,
34
+ conv_size: int = 4,
35
+ conv_bias: bool = False,
36
+ gate_low_rank_dim: int = 128,
37
+ elementwise_affine: Optional[bool] = True,
38
+ norm_eps: float = 1e-5,
39
+ layer_idx: int = None
40
+ ) -> LightNetAttention:
41
+ super().__init__()
42
+
43
+ self.mode = mode
44
+ self.hidden_size = hidden_size
45
+
46
+ if expand_ratio is None and num_heads is not None:
47
+ expand_ratio = hidden_size // num_heads
48
+ elif expand_ratio is not None and num_heads is None:
49
+ num_heads = hidden_size // expand_ratio
50
+ elif expand_ratio is None and num_heads is None:
51
+ raise RuntimeError("One of `expand_ratio` or `num_heads` should be provided.")
52
+ self.num_heads = num_heads
53
+ self.expand_ratio = expand_ratio
54
+
55
+ self.use_short_conv = use_short_conv
56
+ self.conv_size = conv_size
57
+ self.conv_bias = conv_bias
58
+
59
+ self.key_dim = int(self.num_heads * self.expand_ratio)
60
+ self.value_dim = hidden_size
61
+ self.gate_low_rank_dim = gate_low_rank_dim
62
+ self.layer_idx = layer_idx
63
+
64
+ assert mode in ['chunk', 'fused_chunk'], f"Not supported mode `{mode}`."
65
+ assert self.key_dim % num_heads == 0, f"key dim must be divisible by num_heads of {num_heads}"
66
+ assert self.value_dim % num_heads == 0, f"value dim must be divisible by num_heads of {num_heads}"
67
+
68
+ self.head_f_dim = self.expand_ratio
69
+ self.head_i_dim = self.hidden_size // num_heads
70
+
71
+ self.q_proj = nn.Linear(hidden_size, self.key_dim, bias=False)
72
+ self.k_proj = nn.Linear(hidden_size, self.key_dim, bias=False)
73
+ self.v_proj = nn.Linear(hidden_size, self.value_dim, bias=False)
74
+
75
+ if use_short_conv:
76
+ self.conv_size = conv_size
77
+ self.q_conv1d = ShortConvolution(self.key_dim, conv_size, activation=None)
78
+ self.k_conv1d = ShortConvolution(self.key_dim, conv_size, activation=None)
79
+ self.v_conv1d = ShortConvolution(self.value_dim, conv_size, activation=None)
80
+
81
+ self.g_proj = nn.Sequential(
82
+ nn.Linear(hidden_size, gate_low_rank_dim, bias=False),
83
+ nn.Linear(gate_low_rank_dim, hidden_size, bias=False)
84
+ )
85
+ self.g_norm = FusedRMSNormGated(
86
+ hidden_size=hidden_size,
87
+ elementwise_affine=elementwise_affine,
88
+ eps=norm_eps
89
+ )
90
+ self.o_proj = nn.Linear(self.value_dim, hidden_size, bias=False)
91
+
92
+ def forward(
93
+ self,
94
+ hidden_states: torch.Tensor,
95
+ attention_mask: Optional[torch.Tensor] = None,
96
+ past_key_values: Optional[Cache] = None,
97
+ use_cache: Optional[bool] = False,
98
+ output_attentions: Optional[bool] = False,
99
+ **kwargs: Unpack[Dict]
100
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
101
+ if attention_mask is not None:
102
+ assert len(attention_mask.shape) == 2, (
103
+ "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
104
+ "for padding purposes (0 indicating padding). "
105
+ "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
106
+ )
107
+
108
+ # launching the triton kernel for just one token will actually be slower
109
+ mode = 'fused_recurrent' if hidden_states.shape[1] <= 64 else self.mode
110
+
111
+ last_state = None
112
+ if past_key_values is not None and len(past_key_values) > self.layer_idx:
113
+ last_state = past_key_values[self.layer_idx]
114
+
115
+ cu_seqlens = kwargs.get('cu_seqlens', None)
116
+ if self.use_short_conv:
117
+ conv_state_q, conv_state_k, conv_state_v = None, None, None
118
+ if last_state is not None:
119
+ conv_state_q, conv_state_k, conv_state_v = last_state['conv_state']
120
+ conv_mask = attention_mask[:, -hidden_states.shape[1]:] if attention_mask is not None else None
121
+ q, conv_state_q = self.q_conv1d(
122
+ x=self.q_proj(hidden_states),
123
+ mask=conv_mask,
124
+ cache=conv_state_q,
125
+ output_final_state=use_cache,
126
+ cu_seqlens=cu_seqlens
127
+ )
128
+ k, conv_state_k = self.k_conv1d(
129
+ x=self.k_proj(hidden_states),
130
+ mask=conv_mask,
131
+ cache=conv_state_k,
132
+ output_final_state=use_cache,
133
+ cu_seqlens=cu_seqlens
134
+ )
135
+ v, conv_state_v = self.v_conv1d(
136
+ x=self.v_proj(hidden_states),
137
+ mask=conv_mask,
138
+ cache=conv_state_v,
139
+ output_final_state=use_cache,
140
+ cu_seqlens=cu_seqlens
141
+ )
142
+ else:
143
+ q = self.q_proj(hidden_states)
144
+ k = self.k_proj(hidden_states)
145
+ v = self.v_proj(hidden_states)
146
+
147
+ # dealing with left-padding
148
+ if attention_mask is not None:
149
+ v = v.mul_(attention_mask[:, -v.shape[-2]:, None])
150
+
151
+ q = F.silu(q)
152
+ q, k = map(lambda x: rearrange(x, '... (h d) -> ... h d', d=self.head_f_dim), (q, k))
153
+ v = rearrange(v, '... (h d) -> ... h d', d=self.head_i_dim)
154
+ # TODO: these two steps take a huge amount of time and should be optimized
155
+ z = k.float().logcumsumexp(1)
156
+
157
+ if cu_seqlens is not None:
158
+ raise NotImplementedError("LightNet does not support variable-length sequences for now.")
159
+ k, g = torch.exp(k - z).to(k.dtype), (torch.cat((z[:, :1], z[:, :-1]), 1) - z).to(k.dtype)
160
+
161
+ recurrent_state = last_state['recurrent_state'] if last_state is not None else None
162
+ if mode == 'fused_recurrent':
163
+ o, recurrent_state = fused_recurrent_gla(
164
+ q=q,
165
+ k=k,
166
+ v=v,
167
+ gk=g,
168
+ initial_state=recurrent_state,
169
+ output_final_state=use_cache,
170
+ cu_seqlens=cu_seqlens,
171
+ head_first=False
172
+ )
173
+ elif mode == 'chunk':
174
+ o, recurrent_state = chunk_gla(
175
+ q=q,
176
+ k=k,
177
+ v=v,
178
+ g=g,
179
+ initial_state=recurrent_state,
180
+ output_final_state=use_cache,
181
+ cu_seqlens=cu_seqlens,
182
+ head_first=False
183
+ )
184
+ else:
185
+ raise NotImplementedError(f"Not supported mode `{mode}`.")
186
+
187
+ if past_key_values is not None:
188
+ past_key_values.update(
189
+ recurrent_state=recurrent_state,
190
+ conv_state=(conv_state_q, conv_state_k, conv_state_v) if self.use_short_conv else None,
191
+ layer_idx=self.layer_idx,
192
+ offset=q.shape[1]
193
+ )
194
+
195
+ o = rms_norm_swish_gate_linear(
196
+ rearrange(o, 'b t h d -> b t (h d)'),
197
+ self.g_proj(hidden_states),
198
+ self.g_norm.weight,
199
+ self.g_norm.bias,
200
+ self.o_proj.weight,
201
+ self.o_proj.bias
202
+ )
203
+ return o, None, past_key_values
204
+
205
+ def state_size(self, **kwargs) -> int:
206
+ state_size = self.key_dim * self.head_i_dim
207
+ for module in self.children():
208
+ if isinstance(module, ShortConvolution):
209
+ state_size += module.state_size
210
+ return state_size
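The `logcumsumexp` trick in the forward pass above turns the keys into a causal softmax over the prefix: with `z = k.logcumsumexp(1)`, normalized keys `exp(k - z)` and log decay `g_t = z_(t-1) - z_t`, the decayed running sum of the normalized keys stays exactly 1 at every step. A 1-D numerical sketch of that invariant (a stand-in for one batch/head/dim slice, not the layer itself):

```python
import torch

k = torch.randn(6)                   # per-step key logits for one slice
z = k.logcumsumexp(0)                # running log-normalizer
k_hat = torch.exp(k - z)             # normalized keys
g = torch.cat([z[:1], z[:-1]]) - z   # log decay; g[0] == 0

acc = torch.zeros(())
for t in range(6):
    acc = acc * g[t].exp() + k_hat[t]
    print(float(acc))                # ~1.0 at every step
```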
fla/layers/linear_attn.py ADDED
@@ -0,0 +1,166 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from einops import rearrange, repeat
10
+
11
+ from fla.modules import RMSNorm
12
+ from fla.modules.feature_map import DPFPFeatureMap, HadamardFeatureMap, HedgehogFeatureMap, T2RFeatureMap
13
+ from fla.ops.linear_attn import chunk_linear_attn, fused_chunk_linear_attn, fused_recurrent_linear_attn
14
+
15
+
16
+ class LinearAttention(nn.Module):
17
+
18
+ def __init__(
19
+ self,
20
+ mode: str = 'chunk',
21
+ hidden_size: int = 1024,
22
+ expand_k: float = 1.0,
23
+ expand_v: float = 1.0,
24
+ num_heads: int = 8,
25
+ num_kv_heads: Optional[int] = None,
26
+ feature_map: str = 'elementwise_product',
27
+ tie_feature_map_qk: bool = False,
28
+ output_norm: str = 'rmsnorm',
29
+ norm_q: bool = False,
30
+ norm_k: bool = False,
31
+ do_feature_map_norm: bool = False,
32
+ elementwise_affine: bool = True,
33
+ norm_eps: float = 1e-5,
34
+ **kwargs
35
+ ):
36
+ super().__init__()
37
+
38
+ self.hidden_size = hidden_size
39
+ self.mode = mode
40
+ self.num_heads = num_heads
41
+ self.num_kv_heads = num_kv_heads if num_kv_heads is not None else num_heads
42
+ self.num_kv_groups = self.num_heads // self.num_kv_heads
43
+ self.key_dim = int(hidden_size * expand_k)
44
+ self.value_dim = int(hidden_size * expand_v)
45
+ self.key_dim_per_group = self.key_dim // self.num_kv_groups
46
+ self.value_dim_per_group = self.value_dim // self.num_kv_groups
47
+
48
+ assert mode in ['chunk', 'fused_chunk', 'fused_recurrent'], f"Not supported mode `{mode}`."
49
+ assert self.key_dim % num_heads == 0, f"key dim must be divisible by num_heads of {num_heads}"
50
+ assert self.value_dim % num_heads == 0, f"value dim must be divisible by num_heads of {num_heads}"
51
+
52
+ self.head_k_dim = self.key_dim // num_heads
53
+ self.head_v_dim = self.value_dim // num_heads
54
+ self.do_feature_map_norm = do_feature_map_norm
55
+
56
+ if feature_map == 'hedgehog':
57
+ if tie_feature_map_qk:
58
+ self.feature_map_q = self.feature_map_k = HedgehogFeatureMap(head_dim=self.head_k_dim)
59
+ else:
60
+ self.feature_map_q = HedgehogFeatureMap(head_dim=self.head_k_dim)
61
+ self.feature_map_k = HedgehogFeatureMap(head_dim=self.head_k_dim)
62
+
63
+ elif feature_map == 't2r':
64
+ if tie_feature_map_qk:
65
+ self.feature_map_q = self.feature_map_k = T2RFeatureMap(head_dim=self.head_k_dim)
66
+ else:
67
+ self.feature_map_q = T2RFeatureMap(head_dim=self.head_k_dim)
68
+ self.feature_map_k = T2RFeatureMap(head_dim=self.head_k_dim)
69
+
70
+ elif feature_map == 'elementwise_product':
71
+ if tie_feature_map_qk:
72
+ self.feature_map_q = self.feature_map_k = HadamardFeatureMap(head_dim=self.head_k_dim)
73
+ else:
74
+ self.feature_map_q = HadamardFeatureMap(head_dim=self.head_k_dim)
75
+ self.feature_map_k = HadamardFeatureMap(head_dim=self.head_k_dim)
76
+
77
+ elif feature_map == 'dpfp':
78
+ self.feature_map_q = DPFPFeatureMap(head_dim=self.head_k_dim)
79
+ self.feature_map_k = DPFPFeatureMap(head_dim=self.head_k_dim)
80
+
81
+ elif feature_map == 'elu':
82
+ def elu(x):
83
+ return F.elu(x) + 1
84
+ self.feature_map_q = elu
85
+ self.feature_map_k = elu
86
+
87
+ elif feature_map == 'relu':
88
+ self.feature_map_q = nn.ReLU()
89
+ self.feature_map_k = nn.ReLU()
90
+
91
+ elif feature_map == 'identity':
92
+ self.feature_map_q = nn.Identity()
93
+ self.feature_map_k = nn.Identity()
94
+ else:
95
+ raise NotImplementedError(f"Not supported feature map `{feature_map}`.")
96
+
97
+ self.q_proj = nn.Linear(hidden_size, self.key_dim, bias=False)
98
+ self.k_proj = nn.Linear(hidden_size, self.key_dim_per_group, bias=False)
99
+ self.v_proj = nn.Linear(hidden_size, self.value_dim_per_group, bias=False)
100
+
101
+ if output_norm == 'rmsnorm':
102
+ self.norm = RMSNorm(hidden_size=self.head_v_dim, elementwise_affine=elementwise_affine, eps=norm_eps)
103
+ elif output_norm == 'identity':
104
+ self.norm = nn.Identity()
105
+ else:
106
+ raise NotImplementedError(f"Not supported output norm `{output_norm}`.")
107
+
108
+ self.o_proj = nn.Linear(self.value_dim, hidden_size, bias=False)
109
+
110
+ self.norm_q = norm_q
111
+ self.norm_k = norm_k
112
+
113
+ def forward(
114
+ self,
115
+ hidden_states: torch.Tensor,
116
+ **kwargs
117
+ ) -> torch.Tensor:
118
+ mode = self.mode
119
+ q = self.q_proj(hidden_states)
120
+ k = self.k_proj(hidden_states)
121
+ v = self.v_proj(hidden_states)
122
+
123
+ q = rearrange(q, '... (h d) -> ... h d', d=self.head_k_dim)
124
+ if self.num_kv_groups > 1:
125
+ k = repeat(k, '... (h d) -> ... (h g) d', d=self.head_k_dim, g=self.num_kv_groups)
126
+ v = repeat(v, '... (h d) -> ... (h g) d', d=self.head_v_dim, g=self.num_kv_groups)
127
+ else:
128
+ k = rearrange(k, '... (h d) -> ... h d', d=self.head_k_dim)
129
+ v = rearrange(v, '... (h d) -> ... h d', d=self.head_v_dim)
130
+
131
+ q = self.feature_map_q(q)
132
+ k = self.feature_map_k(k)
133
+
134
+ if self.norm_q:
135
+ q = q / (q.sum(-1, True) + 1e-4)
136
+ if self.norm_k:
137
+ k = k / (k.sum(-1, True) + 1e-4)
138
+
139
+ if mode == 'chunk':
140
+ o, final_state = chunk_linear_attn(
141
+ q=q,
142
+ k=k,
143
+ v=v,
144
+ normalize=self.do_feature_map_norm,
145
+ head_first=False
146
+ )
147
+ elif mode == 'fused_chunk':
148
+ o, final_state = fused_chunk_linear_attn(
149
+ q=q,
150
+ k=k,
151
+ v=v,
152
+ normalize=self.do_feature_map_norm,
153
+ )
154
+ elif mode == 'fused_recurrent':
155
+ o, final_state = fused_recurrent_linear_attn(
156
+ q=q,
157
+ k=k,
158
+ v=v,
159
+ normalize=self.do_feature_map_norm,
160
+ )
161
+ else:
162
+ raise NotImplementedError
163
+ o = self.norm(o)
164
+ o = rearrange(o, '... h d -> ... (h d)')
165
+ o = self.o_proj(o)
166
+ return o
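The inline `elu` feature map above, phi(x) = elu(x) + 1, keeps queries and keys strictly positive, which is what keeps the linear-attention normalizer well behaved. A quick sketch on toy values (the helper name here is illustrative, mirroring the inline definition):

```python
import torch
import torch.nn.functional as F

def elu_feature_map(x):
    return F.elu(x) + 1              # mirrors the inline `elu` defined above

x = torch.tensor([-3.0, 0.0, 3.0])
print(elu_feature_map(x))            # tensor([0.0498, 1.0000, 4.0000]); all > 0
```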
fla/layers/multiscale_retention.py ADDED
@@ -0,0 +1,298 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from __future__ import annotations
5
+
6
+ from typing import TYPE_CHECKING, Optional, Tuple
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ from einops import rearrange, repeat
11
+ from transformers.activations import ACT2FN
12
+
13
+ from fla.modules import FusedRMSNormGated, RMSNorm, ShortConvolution
14
+ from fla.modules.rotary import RotaryEmbedding
15
+ from fla.ops.retention import chunk_retention, fused_chunk_retention, fused_recurrent_retention, parallel_retention
16
+
17
+ if TYPE_CHECKING:
18
+ from fla.models.utils import Cache
19
+
20
+
21
+ class MultiScaleRetention(nn.Module):
22
+ r"""
23
+ The layer implementation for [Retentive Network: A Successor to Transformer for Large Language Models](https://arxiv.org/pdf/2307.08621.pdf). # noqa
24
+
25
+ Args:
26
+ mode (str, Optional):
27
+ Which Retention kernel to use.
28
+ Currently available: `chunk`, `fused_recurrent`, `parallel`, and `fused_chunk`.
29
+ Default: `chunk`.
30
+ hidden_size (int, Optional):
31
+ The hidden size of the input. Default: 1024.
32
+ expand_k (float, Optional):
33
+ The expansion ratio for the key dim. Default: 1.0.
34
+ expand_v (float, Optional):
35
+ The expansion ratio for the value dim. Default: 2.0.
36
+ num_heads (int, Optional):
37
+ The number of heads. Default: 8.
38
+ num_kv_heads (int, Optional):
39
+ The number of key/value heads, used for MQA. Default: None.
40
+ feature_map (str, Optional):
41
+ Feature map function applied to queries/keys. Default: None.
42
+ use_short_conv (bool, Optional):
43
+ Whether to use short convolutions. Default: `False`.
44
+ conv_size (int, Optional):
45
+ The kernel size of the short convolution, only used when `use_short_conv` is `True`. Default: 4.
46
+ conv_bias (bool, Optional):
47
+ Whether to use bias in the short convolution, only used when `use_short_conv` is `True`. Default: `False`.
48
+ use_output_gate (bool, Optional):
49
+ Whether to use output gate. Default: `True`.
50
+ gate_fn (str, Optional):
51
+ The activation function for the output gate. Default: `swish`.
52
+ elementwise_affine (bool, Optional):
53
+ If `True`, applies elementwise affine to LayerNorm with learnable parameters. Default: `True`.
54
+ norm_eps (float, Optional):
55
+ The epsilon value for the layernorm/rmsnorm layer. Default: 1e-5.
56
+ fuse_norm (bool, Optional):
57
+ Whether to fuse the norm and the output gate for better memory footprint. Default: `True`.
58
+ layer_idx (int, Optional):
59
+ The index of the layer. Default: None.
60
+ """
61
+
62
+ def __init__(
63
+ self,
64
+ mode: str = 'chunk',
65
+ hidden_size: int = 1024,
66
+ expand_k: float = 1.0,
67
+ expand_v: float = 2.0,
68
+ num_heads: int = 8,
69
+ num_kv_heads: Optional[int] = None,
70
+ feature_map: Optional[str] = None,
71
+ use_short_conv: bool = False,
72
+ conv_size: int = 4,
73
+ conv_bias: bool = False,
74
+ use_output_gate: bool = True,
75
+ gate_fn: str = 'swish',
76
+ elementwise_affine: Optional[bool] = True,
77
+ norm_eps: float = 1e-5,
78
+ fuse_norm: bool = True,
79
+ layer_idx: int = None,
80
+ **kwargs
81
+ ) -> MultiScaleRetention:
82
+ super().__init__()
83
+
84
+ self.mode = mode
85
+ self.hidden_size = hidden_size
86
+ self.expand_k = expand_k
87
+ self.expand_v = expand_v
88
+ self.num_heads = num_heads
89
+ self.num_kv_heads = num_kv_heads if num_kv_heads is not None else num_heads
90
+ self.num_kv_groups = self.num_heads // self.num_kv_heads
91
+ self.feature_map_fn = ACT2FN[feature_map] if feature_map is not None else None
92
+
93
+ self.use_short_conv = use_short_conv
94
+ self.conv_size = conv_size
95
+ self.conv_bias = conv_bias
96
+ self.use_output_gate = use_output_gate
97
+
98
+ self.key_dim = int(hidden_size * expand_k)
99
+ self.value_dim = int(hidden_size * expand_v)
100
+ self.key_dim_per_group = self.key_dim // self.num_kv_groups
101
+ self.value_dim_per_group = self.value_dim // self.num_kv_groups
102
+ self.layer_idx = layer_idx
103
+
104
+ assert mode in ['chunk', 'fused_chunk', 'parallel', 'fused_recurrent'], f"Not supported mode `{mode}`."
105
+ assert self.key_dim % num_heads == 0, f"key dim must be divisible by num_heads of {num_heads}"
106
+ assert self.value_dim % num_heads == 0, f"value dim must be divisible by num_heads of {num_heads}"
107
+
108
+ self.head_k_dim = self.key_dim // num_heads
109
+ self.head_v_dim = self.value_dim // num_heads
110
+
111
+ self.q_proj = nn.Linear(hidden_size, self.key_dim, bias=False)
112
+ self.k_proj = nn.Linear(hidden_size, self.key_dim_per_group, bias=False)
113
+ self.v_proj = nn.Linear(hidden_size, self.value_dim_per_group, bias=False)
114
+ if self.use_output_gate:
115
+ self.g_proj = nn.Linear(hidden_size, self.value_dim, bias=False)
116
+
117
+ if use_short_conv:
118
+ self.conv_size = conv_size
119
+ self.q_conv1d = ShortConvolution(self.key_dim, conv_size, activation='silu')
120
+ self.k_conv1d = ShortConvolution(self.key_dim_per_group, conv_size, activation='silu')
121
+ self.v_conv1d = ShortConvolution(self.value_dim_per_group, conv_size, activation='silu')
122
+
123
+ self.o_proj = nn.Linear(self.value_dim, hidden_size, bias=False)
124
+
125
+ if gate_fn == 'swish' and fuse_norm and use_output_gate:
126
+ self.g_norm_swish_gate = FusedRMSNormGated(
127
+ hidden_size=self.head_v_dim,
128
+ elementwise_affine=elementwise_affine,
129
+ eps=norm_eps
130
+ )
131
+ self.fuse_norm_and_gate = True
132
+ else:
133
+ self.fuse_norm_and_gate = False
134
+ self.g_norm = RMSNorm(
135
+ hidden_size=self.head_v_dim,
136
+ elementwise_affine=elementwise_affine,
137
+ eps=norm_eps
138
+ )
139
+ self.gate_fn = ACT2FN[gate_fn]
140
+
141
+ # TODO: fix this issue
142
+ # https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/triton/rotary.py#L180
143
+ # Ideally, we would want to support arbitrary d_head_qk
144
+ assert self.head_k_dim <= 256, "head_k_dim must be less than or equal to 256"
145
+ self.rotary = RotaryEmbedding(dim=self.head_k_dim)
146
+
147
+ def forward(
148
+ self,
149
+ hidden_states: torch.Tensor,
150
+ attention_mask: Optional[torch.Tensor] = None,
151
+ past_key_values: Optional[Cache] = None,
152
+ use_cache: Optional[bool] = False,
153
+ output_attentions: Optional[bool] = False,
154
+ **kwargs
155
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
156
+ if attention_mask is not None:
157
+ assert len(attention_mask.shape) == 2, (
158
+ "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
159
+ "for padding purposes (0 indicating padding). "
160
+ "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
161
+ )
162
+
163
+ # launching the triton kernel for just one token will actually be slower
164
+ mode = 'fused_recurrent' if hidden_states.shape[1] <= 64 else self.mode
165
+
166
+ last_state = None
167
+ if past_key_values is not None and len(past_key_values) > self.layer_idx:
168
+ last_state = past_key_values[self.layer_idx]
169
+
170
+ cu_seqlens = kwargs.get('cu_seqlens', None)
171
+ if self.use_short_conv:
172
+ conv_state_q, conv_state_k, conv_state_v = None, None, None
173
+ if last_state is not None:
174
+ conv_state_q, conv_state_k, conv_state_v = last_state['conv_state']
175
+ conv_mask = attention_mask[:, -hidden_states.shape[1]:] if attention_mask is not None else None
176
+ q, conv_state_q = self.q_conv1d(
177
+ x=self.q_proj(hidden_states),
178
+ mask=conv_mask,
179
+ cache=conv_state_q,
180
+ output_final_state=use_cache,
181
+ cu_seqlens=cu_seqlens
182
+ )
183
+ k, conv_state_k = self.k_conv1d(
184
+ x=self.k_proj(hidden_states),
185
+ mask=conv_mask,
186
+ cache=conv_state_k,
187
+ output_final_state=use_cache,
188
+ cu_seqlens=cu_seqlens
189
+ )
190
+ v, conv_state_v = self.v_conv1d(
191
+ x=self.v_proj(hidden_states),
192
+ mask=conv_mask,
193
+ cache=conv_state_v,
194
+ output_final_state=use_cache,
195
+ cu_seqlens=cu_seqlens
196
+ )
197
+ else:
198
+ q = self.q_proj(hidden_states)
199
+ k = self.k_proj(hidden_states)
200
+ v = self.v_proj(hidden_states)
201
+
202
+ # dealing with left-padding
203
+ if attention_mask is not None:
204
+ v = v.mul_(attention_mask[:, -v.shape[-2]:, None])
205
+ q = rearrange(q, '... (h d) -> ... h d', d=self.head_k_dim)
206
+ k = rearrange(k, '... (h d) -> ... h d', d=self.head_k_dim)
207
+ if self.feature_map_fn is not None:
208
+ q, k = map(self.feature_map_fn, (q, k))
209
+
210
+ seqlen_offset, max_seqlen = 0, q.shape[1]
211
+ if past_key_values is not None:
212
+ seqlen_offset = past_key_values.get_seq_length(self.layer_idx)
213
+ max_seqlen = q.shape[1] + seqlen_offset
214
+
215
+ if attention_mask is not None:
216
+ # to account for the offsets of padding tokens
217
+ seqlen_offset = seqlen_offset + attention_mask.sum(-1) - attention_mask.shape[-1]
218
+ max_seqlen = q.shape[1] + max(seqlen_offset)
219
+
220
+ q, k = self.rotary(q, k, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen, cu_seqlens=cu_seqlens)
221
+
222
+ if self.num_kv_groups > 1:
223
+ k = repeat(k, 'b t h d -> b t (h g) d', g=self.num_kv_groups)
224
+ v = repeat(v, 'b t (h d) -> b t (h g) d', d=self.head_v_dim, g=self.num_kv_groups)
225
+ else:
226
+ v = rearrange(v, 'b t (h d) -> b t h d', d=self.head_v_dim)
227
+
228
+ recurrent_state = last_state['recurrent_state'] if last_state is not None else None
229
+ if mode == 'chunk':
230
+ o, recurrent_state = chunk_retention(
231
+ q=q,
232
+ k=k,
233
+ v=v,
234
+ initial_state=recurrent_state,
235
+ output_final_state=use_cache,
236
+ cu_seqlens=cu_seqlens,
237
+ head_first=False
238
+ )
239
+ elif mode == 'fused_chunk':
240
+ o, recurrent_state = fused_chunk_retention(
241
+ q=q,
242
+ k=k,
243
+ v=v,
244
+ initial_state=recurrent_state,
245
+ output_final_state=use_cache,
246
+ cu_seqlens=cu_seqlens,
247
+ head_first=False
248
+ )
249
+ elif mode == 'parallel':
250
+ o, recurrent_state = parallel_retention(
251
+ q=q,
252
+ k=k,
253
+ v=v,
254
+ cu_seqlens=cu_seqlens,
255
+ head_first=False
256
+ )
257
+ elif mode == 'fused_recurrent':
258
+ o, recurrent_state = fused_recurrent_retention(
259
+ q=q,
260
+ k=k,
261
+ v=v,
262
+ initial_state=recurrent_state,
263
+ output_final_state=use_cache,
264
+ cu_seqlens=cu_seqlens,
265
+ head_first=False
266
+ )
267
+ else:
268
+ raise NotImplementedError(f"Not supported mode `{mode}`.")
269
+
270
+ if past_key_values is not None:
271
+ past_key_values.update(
272
+ recurrent_state=recurrent_state,
273
+ conv_state=(conv_state_q, conv_state_k, conv_state_v) if self.use_short_conv else None,
274
+ layer_idx=self.layer_idx,
275
+ offset=q.shape[1]
276
+ )
277
+
278
+ if self.use_output_gate:
279
+ g = self.g_proj(hidden_states)
280
+ if self.fuse_norm_and_gate:
281
+ g = rearrange(g, 'b t (h d) -> b t h d', d=self.head_v_dim)
282
+ o = self.g_norm_swish_gate(o, g)
283
+ o = rearrange(o, 'b t h d -> b t (h d)')
284
+ else:
285
+ o = rearrange(self.g_norm(o), 'b t h d -> b t (h d)')
286
+ o = o * self.gate_fn(g)
287
+ else:
288
+ o = rearrange(self.g_norm(o), 'b t h d -> b t (h d)')
289
+ o = self.o_proj(o)
290
+
291
+ return o, None, past_key_values
292
+
293
+ def state_size(self, **kwargs) -> int:
294
+ state_size = self.key_dim * self.head_v_dim
295
+ for module in self.children():
296
+ if isinstance(module, ShortConvolution):
297
+ state_size += module.state_size
298
+ return state_size
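A minimal forward-pass sketch for the layer above, assuming the module path fla.layers.multiscale_retention from this upload, default constructor arguments, and a CUDA device (the retention kernels are Triton-based); shapes are illustrative only:

import torch
from fla.layers.multiscale_retention import MultiScaleRetention

layer = MultiScaleRetention(hidden_size=1024, num_heads=8, layer_idx=0).to('cuda', torch.bfloat16)
x = torch.randn(2, 128, 1024, device='cuda', dtype=torch.bfloat16)
o, _, _ = layer(x)      # no attention mask, no cache
print(o.shape)          # torch.Size([2, 128, 1024])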
fla/layers/nsa.py ADDED
@@ -0,0 +1,138 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from __future__ import annotations
5
+
6
+ from typing import TYPE_CHECKING, Optional, Tuple, Union
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ from einops import rearrange
11
+ from transformers.utils import logging
12
+
13
+ from fla.modules import RotaryEmbedding
14
+ from fla.ops.nsa.parallel import parallel_nsa
15
+
16
+ if TYPE_CHECKING:
17
+ from fla.models.utils import Cache
18
+
19
+ logger = logging.get_logger(__name__)
20
+
21
+
22
+ class NativeSparseAttention(nn.Module):
23
+
24
+ def __init__(
25
+ self,
26
+ hidden_size: int = 2048,
27
+ num_heads: int = 64,
28
+ num_kv_heads: Optional[int] = 4,
29
+ head_dim: int = 64,
30
+ qkv_bias: bool = False,
31
+ block_size: Optional[int] = 64,
32
+ block_counts: Optional[Union[torch.LongTensor, int]] = 16,
33
+ window_size: Optional[int] = 512,
34
+ rope_theta: Optional[float] = 10000.,
35
+ max_position_embeddings: Optional[int] = None,
36
+ layer_idx: int = None
37
+ ):
38
+ super().__init__()
39
+
40
+ self.hidden_size = hidden_size
41
+ self.num_heads = num_heads
42
+ if num_kv_heads is None:
43
+ self.num_kv_heads = self.num_heads
44
+ else:
45
+ self.num_kv_heads = num_kv_heads
46
+ self.num_kv_groups = num_heads // self.num_kv_heads
47
+ self.head_dim = head_dim
48
+ self.kv_dim = self.num_kv_heads * self.head_dim
49
+ self.qkv_bias = qkv_bias
50
+
51
+ self.block_size = block_size
52
+ self.block_counts = block_counts
53
+ self.window_size = window_size
54
+ self.rope_theta = rope_theta
55
+ self.max_position_embeddings = max_position_embeddings
56
+ self.layer_idx = layer_idx
57
+
58
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=self.qkv_bias)
59
+ self.k_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=self.qkv_bias)
60
+ self.v_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=self.qkv_bias)
61
+ self.g_proj = nn.Linear(self.hidden_size, self.num_heads * 3, bias=False)
62
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
63
+
64
+ self.rotary = RotaryEmbedding(dim=self.head_dim, base=self.rope_theta)
65
+
66
+ def forward(
67
+ self,
68
+ hidden_states: torch.Tensor,
69
+ attention_mask: Optional[torch.LongTensor] = None,
70
+ past_key_values: Optional[Cache] = None,
71
+ output_attentions: bool = False,
72
+ use_cache: bool = False,
73
+ **kwargs,
74
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
75
+ if attention_mask is not None:
76
+ assert len(attention_mask.shape) == 2, (
77
+ "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
78
+ "for padding purposes (0 indicating padding). "
79
+ "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
80
+ )
81
+
82
+ batch_size, seq_len, _ = hidden_states.size()
83
+
84
+ q = rearrange(self.q_proj(hidden_states), '... (h d) -> ... h d', d=self.head_dim)
85
+ k = rearrange(self.k_proj(hidden_states), '... (h d) -> ... h d', d=self.head_dim)
86
+ v = rearrange(self.v_proj(hidden_states), '... (h d) -> ... h d', d=self.head_dim)
87
+ g = rearrange(self.g_proj(hidden_states), '... (h d) -> ... h d', d=3)
88
+ g_cmp, g_slc, g_swa = g.sigmoid().unbind(-1)
89
+
90
+ cu_seqlens = kwargs.get('cu_seqlens', None)
91
+
92
+ seqlen_offset, max_seqlen = 0, seq_len
93
+ if past_key_values is not None:
94
+ seqlen_offset = past_key_values.get_seq_length(self.layer_idx)
95
+ max_seqlen = q.shape[1] + seqlen_offset
96
+
97
+ if attention_mask is not None:
98
+ # to deliminate the offsets of padding tokens
99
+ seqlen_offset = seqlen_offset + attention_mask.sum(-1) - attention_mask.shape[-1]
100
+ max_seqlen = q.shape[1] + max(seqlen_offset)
101
+
102
+ if self.max_position_embeddings is not None:
103
+ max_seqlen = max(max_seqlen, self.max_position_embeddings)
104
+ q, k = self.rotary(q, k, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen, cu_seqlens=cu_seqlens)
105
+
106
+ if past_key_values is not None:
107
+ cache_has_content = past_key_values.get_seq_length(self.layer_idx) > 0
108
+ k_cached, v_cached = past_key_values.update(
109
+ attn_state=(k.flatten(-2, -1), v.flatten(-2, -1)),
110
+ layer_idx=self.layer_idx,
111
+ offset=seq_len,
112
+ cache_kwargs=dict(window_size=self.window_size)
113
+ )['attn_state']
114
+ if cache_has_content:
115
+ k, v = k_cached, v_cached
116
+ k = rearrange(k, '... (h d) -> ... h d', d=self.head_dim)
117
+ v = rearrange(v, '... (h d) -> ... h d', d=self.head_dim)
118
+
119
+ o = parallel_nsa(
120
+ q=q,
121
+ k=k,
122
+ v=v,
123
+ g_cmp=g_cmp,
124
+ g_slc=g_slc,
125
+ g_swa=g_swa,
126
+ block_size=self.block_size,
127
+ block_counts=self.block_counts,
128
+ window_size=self.window_size,
129
+ cu_seqlens=cu_seqlens,
130
+ head_first=False
131
+ )
132
+ o = o.reshape(batch_size, seq_len, -1)
133
+ o = self.o_proj(o)
134
+
135
+ # attention weights are never materialized by this layer
136
+ attentions = None
137
+
138
+ return o, attentions, past_key_values
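For intuition on the gating above: g_proj emits three logits per head per token, which are squashed with a sigmoid and used as mixing weights for NSA's compressed, selected, and sliding-window branches. A toy, self-contained sketch of that projection and split (sizes are made up and the Triton kernel is not called):

import torch
import torch.nn as nn
from einops import rearrange

num_heads, hidden_size = 4, 64                      # toy sizes
g_proj = nn.Linear(hidden_size, num_heads * 3, bias=False)

x = torch.randn(2, 8, hidden_size)                  # (batch, seq, hidden)
g = rearrange(g_proj(x), '... (h d) -> ... h d', d=3)
g_cmp, g_slc, g_swa = g.sigmoid().unbind(-1)        # one gate per branch, per head and token
print(g_cmp.shape, g_slc.shape, g_swa.shape)        # each torch.Size([2, 8, 4])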
fla/layers/rebased.py ADDED
@@ -0,0 +1,133 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ """
5
+ https://github.com/corl-team/rebased/blob/main/flash_linear_attention/fla/layers/rebased_fast.py
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Optional
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ from einops import rearrange
15
+
16
+ from fla.modules.feature_map import RebasedFeatureMap
17
+ from fla.ops.linear_attn import chunk_linear_attn, fused_chunk_linear_attn
18
+ from fla.ops.rebased import parallel_rebased
19
+
20
+
21
+ class ReBasedLinearAttention(nn.Module):
22
+
23
+ def __init__(
24
+ self,
25
+ hidden_size: int,
26
+ l_max: int = 2048,
27
+ feature_dim: int = 16,
28
+ num_key_value_heads: int = 16,
29
+ num_heads: int = 16,
30
+ use_gamma: Optional[bool] = True,
31
+ use_beta: Optional[bool] = True,
32
+ normalize: Optional[bool] = True,
33
+ causal: bool = True,
34
+ eps: float = 1e-5,
35
+ mode: str = "parallel",
36
+ layer_idx: Optional[int] = None,
37
+ **kwargs
38
+ ) -> ReBasedLinearAttention:
39
+ super().__init__()
40
+ self.hidden_size = hidden_size
41
+ self.l_max = l_max
42
+ self.mode = mode
43
+ assert self.mode in ["fused_chunk", "parallel", 'chunk']
44
+
45
+ self.feature_dim = feature_dim
46
+ self.num_key_value_heads = num_key_value_heads
47
+ self.num_heads = num_heads
48
+ self.head_dim = self.hidden_size // self.num_key_value_heads
49
+ self.use_gamma = use_gamma
50
+ self.use_beta = use_beta
51
+ self.normalize = normalize
52
+ self.causal = causal
53
+ self.eps = eps
54
+ self.mode = mode
55
+ self.layer_idx = layer_idx
56
+
57
+ self.feature_map = RebasedFeatureMap(self.feature_dim, use_gamma, use_beta, normalize)
58
+ self.q_proj = nn.Linear(self.hidden_size, self.feature_dim * self.num_heads, bias=False)
59
+ self.k_proj = nn.Linear(self.hidden_size, self.feature_dim * self.num_heads, bias=False)
60
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
61
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
62
+ self.dropout = nn.Identity()
63
+
64
+ def forward(self, hidden_states: torch.Tensor, **kwargs):
65
+ mode = self.mode
66
+ q, k, v = self.q_proj(hidden_states), self.k_proj(hidden_states), self.v_proj(hidden_states)
67
+ q, k, v = map(lambda x: rearrange(x, "... (h d) -> ... h d", d=self.head_dim), [q, k, v])
68
+ q, k = self.feature_map(q, flatten=(mode != 'parallel')), self.feature_map(k, flatten=(mode != 'parallel'))
69
+ if mode == "fused_chunk":
70
+ o = fused_chunk_linear_attn(
71
+ q=q,
72
+ k=k,
73
+ v=v,
74
+ normalize=True,
75
+ scale=1,
76
+ head_first=False
77
+ )
78
+ elif mode == 'chunk':
79
+ o = chunk_linear_attn(
80
+ q=q,
81
+ k=k,
82
+ v=v,
83
+ normalize=True,
84
+ scale=1,
85
+ head_first=False
86
+ )
87
+ elif mode == 'parallel':
88
+ assert q.shape[-1] <= 128
89
+ o = parallel_rebased(
90
+ q=q,
91
+ k=k,
92
+ v=v,
93
+ eps=self.eps,
94
+ use_scale=True,
95
+ use_normalize=True,
96
+ head_first=False
97
+ )
98
+ o = self.o_proj(o)
99
+ o = self.dropout(o)
100
+ return o
101
+
102
+ # https://github.com/HazyResearch/zoology/blob/main/zoology/mixers/based.py#L119
103
+ def forward_reference(
104
+ self,
105
+ hidden_states: torch.Tensor,
106
+ filters: torch.Tensor = None,
107
+ *args,
108
+ **kwargs
109
+ ):
110
+ """
111
+ x (torch.Tensor): tensor of shape (b, d, t)
112
+ y (torch.Tensor): tensor of shape (b, d, t)
113
+ """
114
+ b, t, _ = hidden_states.size()
115
+ q, k, v = self.q_proj(hidden_states), self.k_proj(hidden_states), self.v_proj(hidden_states)
116
+
117
+ q = q.view(b, t, -1, self.feature_dim).transpose(1, 2)
118
+ k = k.view(b, t, -1, self.feature_dim).transpose(1, 2)
119
+ v = v.view(b, t, -1, self.head_dim).transpose(1, 2)
120
+
121
+ # Linear attention
122
+ q, k = self.feature_map(q), self.feature_map(k)
123
+ q, k, v = q.unsqueeze(-2), k.unsqueeze(-2), v.unsqueeze(-1)
124
+
125
+ # Compute attention
126
+ if self.causal:
127
+ y = ((q * (k * v).cumsum(2)).sum(-1) / ((q * k.cumsum(2)).sum(-1) + self.eps))
128
+ else:
129
+ y = ((q * (k * v).sum(2, True)).sum(-1) / ((q * k.sum(2, True)).sum(-1) + self.eps))
130
+ y = rearrange(y, 'b h t d -> b t (h d)')
131
+ y = self.o_proj(y.to(hidden_states.dtype))
132
+ y = self.dropout(y)
133
+ return y.to(hidden_states.dtype)
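The forward_reference path above computes causal linear attention with cumulative sums: at step t the output is phi(q_t) applied to the running sum of phi(k_s) v_s^T, divided by phi(q_t) applied to the running sum of phi(k_s), plus eps. A tiny standalone sketch of that numerator/denominator computation with an identity feature map and made-up shapes:

import torch

b, h, t, d_k, d_v = 1, 2, 5, 3, 4
q, k = torch.rand(b, h, t, d_k), torch.rand(b, h, t, d_k)
v = torch.rand(b, h, t, d_v)

kv = torch.einsum('bhtk,bhtv->bhtkv', k, v).cumsum(2)     # running sum of k v^T outer products
z = k.cumsum(2)                                           # running sum of keys
num = torch.einsum('bhtk,bhtkv->bhtv', q, kv)
den = torch.einsum('bhtk,bhtk->bht', q, z).unsqueeze(-1) + 1e-5
print((num / den).shape)                                  # torch.Size([1, 2, 5, 4])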
fla/layers/rwkv6.py ADDED
@@ -0,0 +1,307 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ # "Eagle and Finch: RWKV with Matrix-Valued States and Dynamic Recurrence"[https://arxiv.org/abs/2404.05892]
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import TYPE_CHECKING, Optional, Tuple
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ from einops import rearrange
13
+
14
+ from fla.modules import GroupNorm
15
+ from fla.modules.activations import ACT2FN
16
+ from fla.ops.rwkv6 import chunk_rwkv6, fused_recurrent_rwkv6
17
+
18
+ if TYPE_CHECKING:
19
+ from fla.models.utils import Cache
20
+
21
+
22
+ class RWKV6Attention(nn.Module):
23
+
24
+ def __init__(
25
+ self,
26
+ mode: str = 'chunk',
27
+ hidden_size: int = 1024,
28
+ expand_k: float = 0.5,
29
+ expand_v: float = 1.0,
30
+ num_heads: int = 4,
31
+ gate_fn: str = 'swish',
32
+ proj_low_rank_dim: int = 32,
33
+ gate_low_rank_dim: int = 64,
34
+ fuse_norm: bool = True,
35
+ elementwise_affine: Optional[bool] = True,
36
+ norm_eps: float = 1e-5,
37
+ layer_idx: int = None,
38
+ **kwargs
39
+ ) -> RWKV6Attention:
40
+ super().__init__()
41
+
42
+ self.mode = mode
43
+ self.hidden_size = hidden_size
44
+ self.expand_k = expand_k
45
+ self.expand_v = expand_v
46
+ self.num_heads = num_heads
47
+ self.proj_low_rank_dim = proj_low_rank_dim
48
+ self.gate_low_rank_dim = gate_low_rank_dim
49
+
50
+ self.key_dim = int(hidden_size * expand_k)
51
+ self.value_dim = int(hidden_size * expand_v)
52
+ self.layer_idx = layer_idx
53
+
54
+ assert mode in ['chunk', 'fused_recurrent'], f"Not suppoerted mode `{mode}`."
55
+ assert self.key_dim % num_heads == 0, f"key dim must be divisible by num_heads of {num_heads}"
56
+ assert self.value_dim % num_heads == 0, f"value dim must be divisible by num_heads of {num_heads}"
57
+
58
+ self.head_k_dim = self.key_dim // num_heads
59
+ self.head_v_dim = self.value_dim // num_heads
60
+
61
+ self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
62
+ self.x_proj = nn.Sequential(
63
+ LerpLinear(hidden_size, proj_low_rank_dim * 5),
64
+ nn.Tanh(),
65
+ nn.Linear(proj_low_rank_dim * 5, hidden_size, bias=False)
66
+ )
67
+ self.x_bias = nn.Parameter(torch.zeros(5, hidden_size))
68
+
69
+ self.r_proj = DDLerpLinear(hidden_size, self.key_dim)
70
+ self.w_proj = DDLerpLinear(hidden_size, self.key_dim, low_rank_dim=gate_low_rank_dim)
71
+ self.k_proj = DDLerpLinear(hidden_size, self.key_dim)
72
+ self.v_proj = DDLerpLinear(hidden_size, self.value_dim)
73
+ self.g_proj = DDLerpLinear(hidden_size, self.value_dim)
74
+ self.bonus = nn.Parameter(torch.zeros(num_heads, self.head_k_dim))
75
+
76
+ # TODO: fuse GroupNorm and output gate
77
+ self.g_norm = GroupNorm(self.num_heads, self.value_dim, elementwise_affine=elementwise_affine, bias=True, eps=norm_eps)
78
+ self.o_proj = nn.Linear(self.value_dim, hidden_size, bias=False)
79
+ self.gate_fn = ACT2FN[gate_fn]
80
+
81
+ self.apply(self._initialize_weights)
82
+
83
+ def _initialize_weights(self, module: nn.Module):
84
+ if getattr(module, "_is_hf_initialized", False):
85
+ return
86
+ if isinstance(module, nn.Linear):
87
+ nn.init.xavier_uniform_(module.weight, gain=2 ** -2.5)
88
+ if module.bias is not None:
89
+ nn.init.zeros_(module.bias)
90
+ if isinstance(module, nn.Parameter):
91
+ nn.init.xavier_uniform_(module, gain=2 ** -2.5)
92
+ module._is_hf_initialized = True
93
+
94
+ def forward(
95
+ self,
96
+ hidden_states: torch.Tensor,
97
+ attention_mask: Optional[torch.Tensor] = None,
98
+ past_key_values: Optional[Cache] = None,
99
+ use_cache: Optional[bool] = False,
100
+ output_attentions: Optional[bool] = False,
101
+ **kwargs
102
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
103
+ if attention_mask is not None:
104
+ assert len(attention_mask.shape) == 2, (
105
+ "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
106
+ "for padding purposes (0 indicating padding). "
107
+ "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
108
+ )
109
+
110
+ batch_size, seq_len, hidden_size = hidden_states.shape
111
+ # launching the triton kernel for just one token will actually be slower
112
+ mode = 'fused_recurrent' if hidden_states.shape[1] <= 64 else self.mode
113
+
114
+ last_state = None
115
+ if past_key_values is not None and len(past_key_values) > self.layer_idx:
116
+ last_state = past_key_values[self.layer_idx]
117
+
118
+ if attention_mask is not None:
119
+ hidden_states = hidden_states.mul_(attention_mask[:, -hidden_states.shape[-2]:, None])
120
+ if hidden_states.shape[1] == 1 and last_state is not None:
121
+ shifted = last_state['conv_state'].unsqueeze(1)
122
+ else:
123
+ shifted = self.time_shift(hidden_states)
124
+ if last_state is not None:
125
+ shifted[:, 0] = last_state['conv_state']
126
+
127
+ delta = shifted - hidden_states
128
+ x = self.x_proj[0](hidden_states, delta).view(batch_size, seq_len, -1, self.proj_low_rank_dim)
129
+ x = torch.einsum('b t n r, h n r-> b t n h', self.x_proj[1](x), self.x_proj[2].weight.view(hidden_size, 5, -1))
130
+
131
+ r, w, k, v, g = x.add_(self.x_bias).unbind(-2)
132
+ r = self.r_proj(hidden_states, r, delta)
133
+ w = self.w_proj(hidden_states, w, delta)
134
+ k = self.k_proj(hidden_states, k, delta)
135
+ v = self.v_proj(hidden_states, v, delta)
136
+ g = self.g_proj(hidden_states, g, delta)
137
+
138
+ # dealing with left-padding
139
+ if attention_mask is not None:
140
+ v = v.mul_(attention_mask[:, -v.shape[-2]:, None])
141
+ r, w, k = map(lambda x: rearrange(x, 'b t (h d) -> b t h d', d=self.head_k_dim), (r, w, k))
142
+ v = rearrange(v, 'b t (h d) -> b t h d', d=self.head_v_dim)
143
+ w = -torch.exp(w)
144
+ u = self.bonus
145
+
146
+ recurrent_state = last_state['recurrent_state'] if last_state is not None else None
147
+ cu_seqlens = kwargs.get('cu_seqlens', None)
148
+ if mode == 'fused_recurrent':
149
+ o, recurrent_state = fused_recurrent_rwkv6(
150
+ r=r,
151
+ k=k,
152
+ v=v,
153
+ w=w,
154
+ u=u,
155
+ scale=1.,
156
+ initial_state=recurrent_state,
157
+ output_final_state=use_cache,
158
+ cu_seqlens=cu_seqlens,
159
+ head_first=False
160
+ )
161
+ elif mode == 'chunk':
162
+ o, recurrent_state = chunk_rwkv6(
163
+ q=r,
164
+ k=k,
165
+ v=v,
166
+ g=w,
167
+ u=u,
168
+ scale=1.,
169
+ initial_state=recurrent_state,
170
+ output_final_state=use_cache,
171
+ cu_seqlens=cu_seqlens,
172
+ head_first=False
173
+ )
174
+ else:
175
+ raise NotImplementedError(f"Not supported mode `{mode}`.")
176
+
177
+ if past_key_values is not None:
178
+ past_key_values.update(
179
+ recurrent_state=recurrent_state,
180
+ conv_state=hidden_states[:, -1],
181
+ layer_idx=self.layer_idx,
182
+ offset=r.shape[1]
183
+ )
184
+
185
+ o = self.g_norm(rearrange(o, '... h d -> ... (h d)')) * self.gate_fn(g)
186
+ o = self.o_proj(o)
187
+
188
+ return o, None, past_key_values
189
+
190
+
191
+ class LoRA(nn.Module):
192
+
193
+ def __init__(
194
+ self,
195
+ input_dim: int,
196
+ output_dim: int,
197
+ low_rank_dim: int,
198
+ bias: Optional[bool] = True,
199
+ activation: Optional[str] = 'tanh'
200
+ ):
201
+ super().__init__()
202
+
203
+ self.input_dim = input_dim
204
+ self.output_dim = output_dim
205
+ self.low_rank_dim = low_rank_dim
206
+ self.bias = bias
207
+
208
+ if activation is None:
209
+ self.activation = nn.Identity()
210
+ elif activation == 'sigmoid':
211
+ self.activation = nn.Sigmoid()
212
+ elif activation == 'tanh':
213
+ self.activation = nn.Tanh()
214
+ elif activation == 'relu':
215
+ self.activation = nn.ReLU()
216
+ else:
217
+ raise ValueError(f"Not supported activation `{activation}`.")
218
+
219
+ self.lora = nn.Sequential(
220
+ nn.Linear(input_dim, low_rank_dim, bias=False),
221
+ self.activation,
222
+ nn.Linear(low_rank_dim, output_dim, bias=bias)
223
+ )
224
+
225
+ def __repr__(self) -> str:
226
+ s = f"{self.__class__.__name__}("
227
+ s += f"input_dim={self.input_dim}, low_rank_dim={self.low_rank_dim}, output_dim={self.output_dim}"
228
+ if not self.bias:
229
+ s += f", bias={self.bias}"
230
+ s += ")"
231
+ return s
232
+
233
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
234
+ return self.lora(x)
235
+
236
+
237
+ class LerpLinear(nn.Module):
238
+
239
+ def __init__(
240
+ self,
241
+ input_dim: int,
242
+ output_dim: int,
243
+ low_rank_dim: Optional[int] = None
244
+ ):
245
+ super().__init__()
246
+
247
+ self.input_dim = input_dim
248
+ self.output_dim = output_dim
249
+ self.low_rank_dim = low_rank_dim
250
+
251
+ self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
252
+ if low_rank_dim is None:
253
+ self.linear = nn.Linear(input_dim, output_dim, bias=False)
254
+ else:
255
+ self.linear = LoRA(input_dim, output_dim, low_rank_dim)
256
+ self.mu = nn.Parameter(torch.zeros(input_dim))
257
+
258
+ def __repr__(self) -> str:
259
+ s = f"{self.__class__.__name__}({self.input_dim}, {self.output_dim}"
260
+ if self.low_rank_dim is not None:
261
+ s += f", low_rank_dim={self.low_rank_dim}"
262
+ s += ")"
263
+ return s
264
+
265
+ def forward(self, x: torch.Tensor, delta: Optional[torch.Tensor] = None) -> torch.Tensor:
266
+ if delta is None:
267
+ shifted = self.time_shift(x)
268
+ if len(shifted.shape) == 2:
269
+ shifted = shifted.unsqueeze(1)
270
+ delta = shifted - x
271
+ return self.linear(x + delta * self.mu)
272
+
273
+
274
+ class DDLerpLinear(nn.Module):
275
+
276
+ def __init__(
277
+ self,
278
+ input_dim: int,
279
+ output_dim: int,
280
+ low_rank_dim: Optional[int] = None
281
+ ):
282
+ super().__init__()
283
+
284
+ self.input_dim = input_dim
285
+ self.output_dim = output_dim
286
+ self.low_rank_dim = low_rank_dim
287
+
288
+ self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
289
+ if low_rank_dim is None:
290
+ self.linear = nn.Linear(input_dim, output_dim, bias=False)
291
+ else:
292
+ self.linear = LoRA(input_dim, output_dim, low_rank_dim)
293
+
294
+ def __repr__(self) -> str:
295
+ s = f"{self.__class__.__name__}({self.input_dim}, {self.output_dim}"
296
+ if self.low_rank_dim is not None:
297
+ s += f", low_rank_dim={self.low_rank_dim}"
298
+ s += ")"
299
+ return s
300
+
301
+ def forward(self, x: torch.Tensor, mu: torch.Tensor, delta: Optional[torch.Tensor] = None) -> torch.Tensor:
302
+ if delta is None:
303
+ shifted = self.time_shift(x)
304
+ if len(shifted.shape) == 2:
305
+ shifted = shifted.unsqueeze(1)
306
+ delta = shifted - x
307
+ return self.linear(x + delta * mu)
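The LerpLinear/DDLerpLinear modules above both build on RWKV's token shift: the input is interpolated with the previous token's hidden state, x + (shift(x) - x) * mu, before the projection. A minimal sketch of just that shift-and-interpolate step (made-up sizes, no low-rank path):

import torch
import torch.nn as nn

hidden = 8
time_shift = nn.ZeroPad2d((0, 0, 1, -1))   # shift right by one step along the time axis, zeros at t=0
mu = torch.full((hidden,), 0.5)            # hypothetical mixing coefficients

x = torch.randn(2, 4, hidden)              # (batch, seq, hidden)
mixed = x + (time_shift(x) - x) * mu       # blend of the current and previous token
print(mixed.shape)                         # torch.Size([2, 4, 8])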
fla/layers/simple_gla.py ADDED
@@ -0,0 +1,261 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from __future__ import annotations
5
+
6
+ from typing import TYPE_CHECKING, Optional, Tuple
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from einops import rearrange, repeat
12
+
13
+ from fla.modules import FusedRMSNormGated, RMSNorm, ShortConvolution
14
+ from fla.modules.activations import ACT2FN
15
+ from fla.ops.simple_gla import chunk_simple_gla, fused_recurrent_simple_gla
16
+
17
+ if TYPE_CHECKING:
18
+ from fla.models.utils import Cache
19
+
20
+
21
+ class SimpleGatedLinearAttention(nn.Module):
22
+ r"""
23
+ The layer implementaion for [Gated Linear Attention Transformers with Hardware-Efficient Training](https://arxiv.org/abs/2312.06635). # noqa
24
+ This layer calls the simplified GLA kernel in which the gating is head-wise instead of elementwise.
25
+
26
+ Args:
27
+ mode (str, Optional):
28
+ Which GLA kernel to use.
29
+ Currently available: `chunk`.
30
+ Default: `chunk`.
31
+ hidden_size (int, Optional):
32
+ The hidden size of the input. Default: 1024.
33
+ expand_k (float, Optional):
34
+ The expansion ratio for the key dim. Default: 1.0.
35
+ expand_v (float, Optional):
36
+ The expansion ratio for the value dim. Default: 1.0.
37
+ num_heads (int, Optional):
38
+ The number of heads. Default: 4.
39
+ num_kv_heads (int, Optional):
40
+ The number of key/value heads, used for MQA. Default: None.
41
+ feature_map (str, Optional):
42
+ Feature map function applied to queries/keys. Default: None.
43
+ use_short_conv (bool, Optional):
44
+ Whether to use short convolutions. Default: `False`.
45
+ conv_size (int, Optional):
46
+ The kernel size of the short convolution, only used when `use_short_conv` is `True`. Default: 4.
47
+ conv_bias (bool, Optional):
48
+ Whether to use bias in the short convolution, only used when `use_short_conv` is `True`. Default: `False`.
49
+ gate_fn (str, Optional):
50
+ The activation function for the output gate. Default: `swish`.
51
+ elementwise_affine (bool, Optional):
52
+ If `True`, applies elementwise affine to LayerNorm with learnable parameters. Default: `True`.
53
+ norm_eps (float, Optional):
54
+ The epsilon value for the layernorm/rmsnorm layer. Default: 1e-5.
55
+ gate_logit_normalizer (int, Optional):
56
+ The normalizer for the gate logits, appied after `logsigmoid`. Default: 16.
57
+ fuse_norm (bool, Optional):
58
+ Whether to fuse the norm and the output gate for better memory footprint. Default: `True`.
59
+ layer_idx (int, Optional):
60
+ The index of the layer. Default: None.
61
+ """
62
+
63
+ def __init__(
64
+ self,
65
+ mode: str = 'chunk',
66
+ hidden_size: int = 1024,
67
+ expand_k: float = 1.,
68
+ expand_v: float = 1.,
69
+ num_heads: int = 4,
70
+ num_kv_heads: Optional[int] = None,
71
+ feature_map: Optional[str] = None,
72
+ use_short_conv: bool = True,
73
+ conv_size: int = 4,
74
+ conv_bias: bool = False,
75
+ gate_fn: str = 'swish',
76
+ elementwise_affine: Optional[bool] = True,
77
+ norm_eps: float = 1e-5,
78
+ gate_logit_normalizer: int = 16,
79
+ fuse_norm: bool = True,
80
+ layer_idx: int = None,
81
+ ) -> SimpleGatedLinearAttention:
82
+ super().__init__()
83
+
84
+ self.mode = mode
85
+ self.hidden_size = hidden_size
86
+ self.expand_k = expand_k
87
+ self.expand_v = expand_v
88
+ self.num_heads = num_heads
89
+ self.num_kv_heads = num_kv_heads if num_kv_heads is not None else num_heads
90
+ self.num_kv_groups = self.num_heads // self.num_kv_heads
91
+ self.feature_map_fn = ACT2FN[feature_map] if feature_map is not None else None
92
+
93
+ self.use_short_conv = use_short_conv
94
+ self.conv_size = conv_size
95
+ self.conv_bias = conv_bias
96
+
97
+ self.key_dim = int(hidden_size * expand_k)
98
+ self.value_dim = int(hidden_size * expand_v)
99
+ self.key_dim_per_group = self.key_dim // self.num_kv_groups
100
+ self.value_dim_per_group = self.value_dim // self.num_kv_groups
101
+ self.layer_idx = layer_idx
102
+
103
+ assert mode in ['chunk', "fused_recurrent"], f"Not suppoerted mode `{mode}`."
104
+ assert self.key_dim % num_heads == 0, f"key dim must be divisible by num_heads of {num_heads}"
105
+ assert self.value_dim % num_heads == 0, f"value dim must be divisible by num_heads of {num_heads}"
106
+
107
+ self.head_k_dim = self.key_dim // num_heads
108
+ self.head_v_dim = self.value_dim // num_heads
109
+
110
+ self.q_proj = nn.Linear(hidden_size, self.key_dim, bias=False)
111
+ self.k_proj = nn.Linear(hidden_size, self.key_dim_per_group, bias=False)
112
+ self.v_proj = nn.Linear(hidden_size, self.value_dim_per_group, bias=False)
113
+ self.g_proj = nn.Linear(hidden_size, self.value_dim, bias=False)
114
+
115
+ if use_short_conv:
116
+ self.conv_size = conv_size
117
+ self.q_conv1d = ShortConvolution(self.key_dim, conv_size, activation='silu')
118
+ self.k_conv1d = ShortConvolution(self.key_dim_per_group, conv_size, activation='silu')
119
+ self.v_conv1d = ShortConvolution(self.value_dim_per_group, conv_size, activation='silu')
120
+
121
+ self.gk_proj = nn.Linear(hidden_size, self.num_heads)
122
+
123
+ if gate_fn == 'swish' and fuse_norm:
124
+ self.g_norm_swish_gate = FusedRMSNormGated(
125
+ hidden_size=self.head_v_dim,
126
+ elementwise_affine=elementwise_affine,
127
+ eps=norm_eps
128
+ )
129
+ self.fuse_norm_and_gate = True
130
+ else:
131
+ self.fuse_norm_and_gate = False
132
+ self.g_norm = RMSNorm(
133
+ hidden_size=self.head_v_dim,
134
+ elementwise_affine=elementwise_affine,
135
+ eps=norm_eps
136
+ )
137
+ self.gate_fn = ACT2FN[gate_fn]
138
+ self.o_proj = nn.Linear(self.value_dim, hidden_size, bias=False)
139
+
140
+ self.gate_logit_normalizer = gate_logit_normalizer
141
+
142
+ def forward(
143
+ self,
144
+ hidden_states: torch.Tensor,
145
+ attention_mask: Optional[torch.Tensor] = None,
146
+ past_key_values: Optional[Cache] = None,
147
+ use_cache: Optional[bool] = False,
148
+ output_attentions: Optional[bool] = False,
149
+ **kwargs
150
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
151
+ if attention_mask is not None:
152
+ assert len(attention_mask.shape) == 2, (
153
+ "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
154
+ "for padding purposes (0 indicating padding). "
155
+ "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
156
+ )
157
+
158
+ # launching the triton kernel for just one token will actually be slower
159
+ mode = 'fused_recurrent' if hidden_states.shape[1] <= 64 else self.mode
160
+
161
+ last_state = None
162
+ if past_key_values is not None and len(past_key_values) > self.layer_idx:
163
+ last_state = past_key_values[self.layer_idx]
164
+
165
+ cu_seqlens = kwargs.get('cu_seqlens', None)
166
+ if self.use_short_conv:
167
+ conv_state_q, conv_state_k, conv_state_v = None, None, None
168
+ if last_state is not None:
169
+ conv_state_q, conv_state_k, conv_state_v = last_state['conv_state']
170
+ conv_mask = attention_mask[:, -hidden_states.shape[1]:] if attention_mask is not None else None
171
+ q, conv_state_q = self.q_conv1d(
172
+ x=self.q_proj(hidden_states),
173
+ mask=conv_mask,
174
+ cache=conv_state_q,
175
+ output_final_state=use_cache,
176
+ cu_seqlens=cu_seqlens
177
+ )
178
+ k, conv_state_k = self.k_conv1d(
179
+ x=self.k_proj(hidden_states),
180
+ mask=conv_mask,
181
+ cache=conv_state_k,
182
+ output_final_state=use_cache,
183
+ cu_seqlens=cu_seqlens
184
+ )
185
+ v, conv_state_v = self.v_conv1d(
186
+ x=self.v_proj(hidden_states),
187
+ mask=conv_mask,
188
+ cache=conv_state_v,
189
+ output_final_state=use_cache,
190
+ cu_seqlens=cu_seqlens
191
+ )
192
+ else:
193
+ q = self.q_proj(hidden_states)
194
+ k = self.k_proj(hidden_states)
195
+ v = self.v_proj(hidden_states)
196
+ gk = self.gk_proj(hidden_states)
197
+
198
+ if self.feature_map_fn is not None:
199
+ q, k = map(self.feature_map_fn, (q, k))
200
+ # dealing with left-padding
201
+ if attention_mask is not None:
202
+ v = v.mul_(attention_mask[:, -v.shape[-2]:, None])
203
+ q = rearrange(q, '... (h d) -> ... h d', h=self.num_heads)
204
+ if self.num_kv_groups > 1:
205
+ k, v = (repeat(x, '... (h d) -> ... (h g) d', h=self.num_kv_heads, g=self.num_kv_groups) for x in (k, v))
206
+ else:
207
+ k, v = (rearrange(x, '... (h d) -> ... h d', h=self.num_kv_heads) for x in (k, v))
208
+ gk = F.logsigmoid(gk) / self.gate_logit_normalizer
209
+
210
+ recurrent_state = last_state['recurrent_state'] if last_state is not None else None
211
+ if mode == 'chunk':
212
+ o, recurrent_state = chunk_simple_gla(
213
+ q=q,
214
+ k=k,
215
+ v=v,
216
+ gk=gk,
217
+ initial_state=recurrent_state,
218
+ output_final_state=use_cache,
219
+ cu_seqlens=cu_seqlens,
220
+ head_first=False
221
+ )
222
+ elif mode == 'fused_recurrent':
223
+ o, recurrent_state = fused_recurrent_simple_gla(
224
+ q=q,
225
+ k=k,
226
+ v=v,
227
+ gk=gk,
228
+ initial_state=recurrent_state,
229
+ output_final_state=use_cache,
230
+ cu_seqlens=cu_seqlens,
231
+ head_first=False
232
+ )
233
+ else:
234
+ raise NotImplementedError(f"Not supported mode `{mode}`.")
235
+
236
+ if past_key_values is not None:
237
+ past_key_values.update(
238
+ recurrent_state=recurrent_state,
239
+ conv_state=(conv_state_q, conv_state_k, conv_state_v) if self.use_short_conv else None,
240
+ layer_idx=self.layer_idx,
241
+ offset=q.shape[1]
242
+ )
243
+
244
+ g = self.g_proj(hidden_states)
245
+ if self.fuse_norm_and_gate:
246
+ g = rearrange(g, 'b t (h d) -> b t h d', h=self.num_heads)
247
+ o = self.g_norm_swish_gate(o, g)
248
+ o = rearrange(o, 'b t h d -> b t (h d)')
249
+ else:
250
+ o = rearrange(self.g_norm(o), 'b t h d -> b t (h d)')
251
+ o = o * self.gate_fn(g)
252
+ o = self.o_proj(o)
253
+
254
+ return o, None, past_key_values
255
+
256
+ def state_size(self, **kwargs) -> int:
257
+ state_size = self.key_dim * self.head_v_dim
258
+ for module in self.children():
259
+ if isinstance(module, ShortConvolution):
260
+ state_size += module.state_size
261
+ return state_size
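The detail that distinguishes this layer from full GLA is that the decay gate is head-wise: gk_proj emits a single logit per head per token, which logsigmoid and gate_logit_normalizer turn into a log-decay in (-inf, 0), i.e. one scalar multiplicative decay per head and step. A toy sketch of that gate computation (not the kernel itself; sizes are made up):

import torch
import torch.nn as nn
import torch.nn.functional as F

hidden_size, num_heads, normalizer = 32, 4, 16
gk_proj = nn.Linear(hidden_size, num_heads)

x = torch.randn(2, 6, hidden_size)
gk = F.logsigmoid(gk_proj(x)) / normalizer   # (batch, seq, heads), log-space decay
decay = gk.exp()                             # per-head decay factors, all strictly in (0, 1)
print(decay.shape, decay.min().item(), decay.max().item())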
fla/ops/__init__.py ADDED
@@ -0,0 +1,46 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from .abc import chunk_abc
4
+ from .attn import parallel_attn, parallel_rectified_attn, parallel_softpick_attn, naive_attn, naive_rectified_attn, naive_softpick_attn
5
+ from .based import fused_chunk_based, parallel_based
6
+ from .delta_rule import chunk_delta_rule, fused_chunk_delta_rule, fused_recurrent_delta_rule
7
+ from .forgetting_attn import parallel_forgetting_attn
8
+ from .gated_delta_rule import chunk_gated_delta_rule, fused_recurrent_gated_delta_rule
9
+ from .generalized_delta_rule import (
10
+ chunk_dplr_delta_rule,
11
+ chunk_iplr_delta_rule,
12
+ fused_recurrent_dplr_delta_rule,
13
+ fused_recurrent_iplr_delta_rule
14
+ )
15
+ from .gla import chunk_gla, fused_chunk_gla, fused_recurrent_gla
16
+ from .gsa import chunk_gsa, fused_recurrent_gsa
17
+ from .hgrn import fused_recurrent_hgrn
18
+ from .lightning_attn import chunk_lightning_attn, fused_recurrent_lightning_attn
19
+ from .linear_attn import chunk_linear_attn, fused_chunk_linear_attn, fused_recurrent_linear_attn
20
+ from .nsa import parallel_nsa
21
+ from .retention import chunk_retention, fused_chunk_retention, fused_recurrent_retention, parallel_retention
22
+ from .rwkv6 import chunk_rwkv6, fused_recurrent_rwkv6
23
+ from .rwkv7 import chunk_rwkv7, fused_recurrent_rwkv7
24
+ from .simple_gla import chunk_simple_gla, fused_recurrent_simple_gla, parallel_simple_gla
25
+
26
+ __all__ = [
27
+ 'chunk_abc',
28
+ 'parallel_attn', 'parallel_rectified_attn', 'parallel_softpick_attn',
29
+ 'naive_attn', 'naive_rectified_attn', 'naive_softpick_attn',
30
+ 'fused_chunk_based', 'parallel_based',
31
+ 'chunk_delta_rule', 'fused_chunk_delta_rule', 'fused_recurrent_delta_rule',
32
+ 'parallel_forgetting_attn',
33
+ 'chunk_gated_delta_rule', 'fused_recurrent_gated_delta_rule',
34
+ 'chunk_dplr_delta_rule', 'chunk_iplr_delta_rule',
35
+ 'fused_recurrent_dplr_delta_rule', 'fused_recurrent_iplr_delta_rule',
36
+ 'chunk_gla', 'fused_chunk_gla', 'fused_recurrent_gla',
37
+ 'chunk_gsa', 'fused_recurrent_gsa',
38
+ 'fused_recurrent_hgrn',
39
+ 'chunk_lightning_attn', 'fused_recurrent_lightning_attn',
40
+ 'chunk_linear_attn', 'fused_chunk_linear_attn', 'fused_recurrent_linear_attn',
41
+ 'parallel_nsa',
42
+ 'chunk_retention', 'fused_chunk_retention', 'fused_recurrent_retention', 'parallel_retention',
43
+ 'chunk_rwkv6', 'fused_recurrent_rwkv6',
44
+ 'chunk_rwkv7', 'fused_recurrent_rwkv7',
45
+ 'chunk_simple_gla', 'fused_recurrent_simple_gla', 'parallel_simple_gla',
46
+ ]
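With the exports above, kernels can be imported straight from the package root rather than from their submodules; a quick check, assuming the package is importable as fla (no kernel is launched):

from fla.ops import chunk_simple_gla, parallel_softpick_attn

print(chunk_simple_gla.__module__)        # confirms the re-export resolves
print(parallel_softpick_attn.__module__)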
fla/ops/attn/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from .parallel import parallel_attn
4
+ from .parallel_rectified import parallel_rectified_attn
5
+ from .parallel_softpick import parallel_softpick_attn
6
+ from .naive import naive_attn
7
+ from .naive_rectified import naive_rectified_attn
8
+ from .naive_softpick import naive_softpick_attn
9
+
10
+ __all__ = [
11
+ 'parallel_attn',
12
+ 'parallel_rectified_attn',
13
+ 'parallel_softpick_attn',
14
+ 'naive_attn',
15
+ 'naive_rectified_attn',
16
+ 'naive_softpick_attn',
17
+ ]
fla/ops/attn/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (646 Bytes). View file
 
fla/ops/attn/__pycache__/naive.cpython-311.pyc ADDED
Binary file (2.1 kB). View file
 
fla/ops/attn/__pycache__/naive_rectified.cpython-311.pyc ADDED
Binary file (2.34 kB). View file
 
fla/ops/attn/__pycache__/parallel.cpython-311.pyc ADDED
Binary file (34 kB). View file
 
fla/ops/attn/naive.py ADDED
@@ -0,0 +1,28 @@
1
+ import torch
2
+ from typing import Optional, Tuple
3
+ from einops import rearrange
4
+
5
+ def naive_attn(
6
+ q: torch.Tensor,
7
+ k: torch.Tensor,
8
+ v: torch.Tensor,
9
+ scale: Optional[float] = None,
10
+ cu_seqlens: Optional[torch.LongTensor] = None,
11
+ head_first: bool = False
12
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
13
+ head_dim = q.shape[-1]
14
+ if scale is None:
15
+ scale = 1.0 / (head_dim ** 0.5)
16
+ if not head_first:
17
+ q, k, v = map(lambda x: rearrange(x, 'b t h d -> b h t d'), (q, k, v))
18
+ q_len = q.shape[-2]
19
+ k_len = k.shape[-2]
20
+ mask = torch.tril(torch.ones(k_len, k_len, device=q.device))
21
+ wei = torch.matmul(q, k.transpose(2, 3)) # shape: (batch_size, num_heads, q_len, k_len)
22
+ wei = wei * scale
23
+ wei = wei.masked_fill(mask[k_len-q_len:k_len, :k_len] == 0, float('-inf'))
24
+ wei = torch.softmax(wei.float(), dim=-1).to(q.dtype)
25
+ o = torch.matmul(wei, v) # shape: (batch_size, num_heads, q_len, head_dim)
26
+ if not head_first:
27
+ o = rearrange(o, 'b h t d -> b t h d')
28
+ return o, wei
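A quick sanity check for the reference above, assuming the module path fla.ops.attn.naive from this upload: with random inputs the output keeps the query layout, and each attention row sums to one.

import torch
from fla.ops.attn.naive import naive_attn

q = torch.randn(2, 10, 4, 16)    # (batch, seq, heads, head_dim), head_first=False
k = torch.randn(2, 10, 4, 16)
v = torch.randn(2, 10, 4, 16)
o, wei = naive_attn(q, k, v)
print(o.shape)                                                     # torch.Size([2, 10, 4, 16])
print(torch.allclose(wei.sum(-1), torch.ones_like(wei.sum(-1))))   # True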
fla/ops/attn/naive_rectified.py ADDED
@@ -0,0 +1,30 @@
1
+ import torch
2
+ from typing import Optional, Tuple
3
+ from einops import rearrange
4
+
5
+ def naive_rectified_attn(
6
+ q: torch.Tensor,
7
+ k: torch.Tensor,
8
+ v: torch.Tensor,
9
+ scale: Optional[float] = None,
10
+ cu_seqlens: Optional[torch.LongTensor] = None,
11
+ head_first: bool = False
12
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
13
+ head_dim = q.shape[-1]
14
+ if scale is None:
15
+ scale = 1.0 / (head_dim ** 0.5)
16
+ if not head_first:
17
+ q, k, v = map(lambda x: rearrange(x, 'b t h d -> b h t d'), (q, k, v))
18
+ q_len = q.shape[-2]
19
+ k_len = k.shape[-2]
20
+ mask = torch.tril(torch.ones(k_len, k_len, device=q.device))
21
+ wei = torch.matmul(q, k.transpose(2, 3)) # shape: (batch_size, num_heads, q_len, k_len)
22
+ wei = wei * scale
23
+ wei = torch.where(wei >= 0, wei, float('-inf'))
24
+ wei = wei.masked_fill(mask[k_len-q_len:k_len, :k_len] == 0, float('-inf'))
25
+ wei = torch.softmax(wei.float(), dim=-1).to(q.dtype)
26
+ wei = torch.nan_to_num(wei, nan=0.0)
27
+ o = torch.matmul(wei, v) # shape: (batch_size, num_heads, q_len, head_dim)
28
+ if not head_first:
29
+ o = rearrange(o, 'b h t d -> b t h d')
30
+ return o, wei
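One subtlety in the rectified variant above: when every unmasked logit for a query is negative, the whole row becomes -inf, softmax returns NaN, and nan_to_num then zeroes the row so that token simply attends to nothing. A small standalone demonstration of that row-level behaviour:

import torch

row = torch.tensor([-0.3, -1.2, float('-inf')])          # all unmasked logits negative
rectified = torch.where(row >= 0, row, torch.full_like(row, float('-inf')))
wei = torch.softmax(rectified, dim=-1)
print(wei)                             # tensor([nan, nan, nan])
print(torch.nan_to_num(wei, nan=0.0))  # tensor([0., 0., 0.])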
fla/ops/attn/naive_softpick.py ADDED
@@ -0,0 +1,39 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from typing import Optional, Tuple
4
+ from einops import rearrange
5
+
6
+ def softpick(x, dim=-1, eps=1e-8):
7
+ # softpick function: relu(exp(x)-1) / sum(abs(exp(x)-1))
8
+ # numerically stable version
9
+ x_m = torch.max(x, dim=dim, keepdim=True).values
10
+ x_m_e_m = torch.exp(-x_m)
11
+ x_e_1 = torch.exp(x - x_m) - x_m_e_m
12
+ r_x_e_1 = F.relu(x_e_1)
13
+ a_x_e_1 = torch.where(x.isfinite(), torch.abs(x_e_1), 0)
14
+ return r_x_e_1 / (torch.sum(a_x_e_1, dim=dim, keepdim=True) + eps) # epsilon is only useful if all inputs are EXACTLY 0. we might not even need it
15
+
16
+ def naive_softpick_attn(
17
+ q: torch.Tensor,
18
+ k: torch.Tensor,
19
+ v: torch.Tensor,
20
+ scale: Optional[float] = None,
21
+ cu_seqlens: Optional[torch.LongTensor] = None,
22
+ head_first: bool = False
23
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
24
+ head_dim = q.shape[-1]
25
+ if scale is None:
26
+ scale = 1.0 / (head_dim ** 0.5)
27
+ if not head_first:
28
+ q, k, v = map(lambda x: rearrange(x, 'b t h d -> b h t d'), (q, k, v))
29
+ q_len = q.shape[-2]
30
+ k_len = k.shape[-2]
31
+ mask = torch.tril(torch.ones(k_len, k_len, device=q.device))
32
+ wei = torch.matmul(q, k.transpose(2, 3)) # shape: (batch_size, num_heads, q_len, k_len)
33
+ wei = wei * scale
34
+ wei = wei.masked_fill(mask[k_len-q_len:k_len, :k_len] == 0, float('-inf'))
35
+ wei = softpick(wei.float(), dim=-1).to(q.dtype)
36
+ o = torch.matmul(wei, v) # shape: (batch_size, num_heads, q_len, head_dim)
37
+ if not head_first:
38
+ o = rearrange(o, 'b h t d -> b t h d')
39
+ return o, wei
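Unlike softmax, the softpick weights above can be exactly zero (wherever the logit is at most zero) and need not sum to one, because the denominator also counts the mass of the negative side. A quick standalone check of both properties, reusing the function defined in this file and the module path from this upload:

import torch
from fla.ops.attn.naive_softpick import softpick

x = torch.tensor([[2.0, 0.0, -1.0, -3.0]])
print(torch.softmax(x, dim=-1).sum(-1))   # tensor([1.])
print(softpick(x, dim=-1))                # zeros at the non-positive logits
print(softpick(x, dim=-1).sum(-1))        # strictly below 1 for this input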