Update core/mamba_block.py
core/mamba_block.py  +16 -5
@@ -7,7 +7,7 @@ import math
 class MambaBlock(nn.Module):
     """
     Production-ready Mamba block for graph processing
-
+    Device-safe implementation
     """
     def __init__(self, d_model, d_state=16, d_conv=4, expand=2, dt_rank="auto", bias=False):
         super().__init__()
@@ -56,6 +56,11 @@ class MambaBlock(nn.Module):
         Returns: (batch, length, d_model)
         """
         batch, length, _ = x.shape
+        device = x.device
+
+        # Ensure all parameters are on correct device
+        self.A_log = self.A_log.to(device)
+        self.D = self.D.to(device)
 
         # Input projection and split
         xz = self.in_proj(x)  # (batch, length, 2 * d_inner)
@@ -81,6 +86,7 @@ class MambaBlock(nn.Module):
     def selective_scan(self, u):
         """Selective scan operation - core of Mamba"""
         batch, length, d_inner = u.shape
+        device = u.device
 
         # Compute ∆, B, C
         x_dbl = self.x_proj(u)  # (batch, length, dt_rank + 2*d_state)
@@ -92,15 +98,20 @@ class MambaBlock(nn.Module):
         return self._selective_scan_pytorch(u, delta, B, C)
 
     def _selective_scan_pytorch(self, u, delta, B, C):
-        """PyTorch implementation of selective scan"""
+        """PyTorch implementation of selective scan - device safe"""
         batch, length, d_inner = u.shape
+        device = u.device
+
+        # Ensure A_log and D are on correct device
+        A_log = self.A_log.to(device)
+        D = self.D.to(device)
 
         # Discretize
-        deltaA = torch.exp(delta.unsqueeze(-1) * (-torch.exp(self.A_log)))  # (batch, length, d_inner, d_state)
+        deltaA = torch.exp(delta.unsqueeze(-1) * (-torch.exp(A_log)))  # (batch, length, d_inner, d_state)
         deltaB_u = delta.unsqueeze(-1) * B.unsqueeze(2) * u.unsqueeze(-1)  # (batch, length, d_inner, d_state)
 
         # Initialize state
-        x = torch.zeros((batch, d_inner, self.d_state), device=u.device)
+        x = torch.zeros((batch, d_inner, self.d_state), device=device, dtype=u.dtype)
         ys = []
 
         for i in range(length):
@@ -111,6 +122,6 @@ class MambaBlock(nn.Module):
         y = torch.stack(ys, dim=1)  # (batch, length, d_inner)
 
         # Add skip connection
-        y = y + u * self.D
+        y = y + u * D
 
         return y
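
For context on what these hunks touch: _selective_scan_pytorch runs the zero-order-hold discretization of a selective state-space model, keeping A in log space (A = -exp(A_log)) so that for positive delta the decay factor deltaA = exp(delta * A) stays in (0, 1). The per-step loop body sits outside the hunks above, so the sketch below is the standard Mamba recurrence rather than a quote from this file; reference_scan and its shapes are illustrative assumptions, not the repo's API.

import torch

def reference_scan(u, delta, B, C, A_log, D):
    """Sequential selective scan: x_i = deltaA_i * x_{i-1} + deltaB_u_i, y_i = <C_i, x_i>.

    Assumed shapes: u, delta (batch, length, d_inner); B, C (batch, length, d_state);
    A_log (d_inner, d_state); D (d_inner,).
    """
    batch, length, d_inner = u.shape
    A = -torch.exp(A_log)                                  # negative => stable decay
    deltaA = torch.exp(delta.unsqueeze(-1) * A)            # (batch, length, d_inner, d_state)
    deltaB_u = delta.unsqueeze(-1) * B.unsqueeze(2) * u.unsqueeze(-1)
    x = torch.zeros(batch, d_inner, A_log.shape[-1], device=u.device, dtype=u.dtype)
    ys = []
    for i in range(length):
        x = deltaA[:, i] * x + deltaB_u[:, i]              # elementwise state update
        ys.append((x * C[:, i].unsqueeze(1)).sum(dim=-1))  # contract over d_state
    return torch.stack(ys, dim=1) + u * D                  # skip connection via D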
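
A minimal usage sketch of the device behavior this commit targets, assuming only the constructor signature visible in the diff (the sizes are made up):

import torch
from core.mamba_block import MambaBlock

block = MambaBlock(d_model=64)              # parameters are created on CPU by default
device = "cuda" if torch.cuda.is_available() else "cpu"
block = block.to(device)                    # moving the module up front makes the
                                            # in-forward .to(device) calls no-ops
x = torch.randn(4, 128, 64, device=device)  # (batch, length, d_model)
y = block(x)
assert y.shape == x.shape                   # forward returns (batch, length, d_model)

One caveat: if A_log and D are registered as nn.Parameters (their creation is outside these hunks), Parameter.to(device) returns a plain Tensor whenever it actually moves data, and nn.Module rejects assigning a plain tensor over a registered parameter, so the self.A_log = self.A_log.to(device) line in forward assumes they are buffers or plain tensors. Calling block.to(device) once, as above, sidesteps the question.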