Update models/pos_embed.py
models/pos_embed.py  CHANGED  (+48, -55)

@@ -100,61 +100,54 @@ def interpolate_pos_embed(model, checkpoint_model):

# RoPE2D: RoPE implementation in 2D
#----------------------------------------------------------

-    RoPE2D = cuRoPE2D
-except ImportError:
-    # critical error, we need to use the slow pytorch version
-    print("CUDA-compiled version of RoPE2D is required but could not be found. Please compile the CUDA extension before running.")
-    #raise ImportError("CUDA-compiled version of RoPE2D is required but could not be found. Please compile the CUDA extension before running.")

-        self.cache = {}
-
-    def get_cos_sin(self, D, seq_len, device, dtype):
-        if (D,seq_len,device,dtype) not in self.cache:
-            inv_freq = 1.0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D))
-            t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
-            freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype)
-            freqs = torch.cat((freqs, freqs), dim=-1)
-            cos = freqs.cos() # (Seq, Dim)
-            sin = freqs.sin()
-            self.cache[D,seq_len,device,dtype] = (cos,sin)
-        return self.cache[D,seq_len,device,dtype]
-
-    @staticmethod
-    def rotate_half(x):
-        x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
-        return torch.cat((-x2, x1), dim=-1)
-
-    def apply_rope1d(self, tokens, pos1d, cos, sin):
-        assert pos1d.ndim==2
-        cos = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :]
-        sin = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :]
-        return (tokens * cos) + (self.rotate_half(tokens) * sin)
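The 1D rotation core (get_cos_sin, rotate_half, apply_rope1d) is the same in the removed and the added code: for a token at position p, each feature pair (x_i, x_{i+D/2}) is rotated by the angle p * base^(-2i/D). A minimal standalone PyTorch sketch (not part of the diff) that checks this half-rotation form against an explicit per-pair rotation:

# Minimal check (PyTorch only) that the half-rotation form used by apply_rope1d,
#   out = tokens * cos + rotate_half(tokens) * sin,
# rotates each feature pair (x[i], x[i + D//2]) by the angle p * inv_freq[i].
import torch

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
    return torch.cat((-x2, x1), dim=-1)

D, base, p = 8, 100.0, 3            # feature dim (even), frequency base, token position
x = torch.randn(D)
inv_freq = 1.0 / (base ** (torch.arange(0, D, 2).float() / D))   # (D/2,) frequencies
angles = p * inv_freq                                            # one angle per feature pair
cos = torch.cat((angles.cos(), angles.cos()))                    # same layout as get_cos_sin
sin = torch.cat((angles.sin(), angles.sin()))

out = x * cos + rotate_half(x) * sin

# Explicit 2x2 rotation of each pair (x[i], x[i + D//2]) by the same angle.
x1, x2 = x[: D // 2], x[D // 2:]
expected = torch.cat((x1 * angles.cos() - x2 * angles.sin(),
                      x2 * angles.cos() + x1 * angles.sin()))
assert torch.allclose(out, expected)

The cat((freqs, freqs), dim=-1) layout in get_cos_sin is what pairs feature i with feature i + D/2, rather than pairing adjacent features.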
# RoPE2D: RoPE implementation in 2D
#----------------------------------------------------------

+
+class RoPE2D(torch.nn.Module):

+    def __init__(self, freq=100.0, F0=1.0):
+        super().__init__()
+        self.base = freq
+        self.F0 = F0
+        self.cache = {}
+
+    def get_cos_sin(self, D, seq_len, device, dtype):
+        if (D,seq_len,device,dtype) not in self.cache:
+            inv_freq = 1.0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D))
+            t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
+            freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype)
+            freqs = torch.cat((freqs, freqs), dim=-1)
+            cos = freqs.cos() # (Seq, Dim)
+            sin = freqs.sin()
+            self.cache[D,seq_len,device,dtype] = (cos,sin)
+        return self.cache[D,seq_len,device,dtype]

+    @staticmethod
+    def rotate_half(x):
+        x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
+        return torch.cat((-x2, x1), dim=-1)

+    def apply_rope1d(self, tokens, pos1d, cos, sin):
+        assert pos1d.ndim==2
+        cos = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :]
+        sin = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :]
+        return (tokens * cos) + (self.rotate_half(tokens) * sin)
+
+    def forward(self, tokens, positions):
+        """
+        input:
+            * tokens: batch_size x nheads x ntokens x dim
+            * positions: batch_size x ntokens x 2 (y and x position of each token)
+        output:
+            * tokens after applying RoPE2D (batch_size x nheads x ntokens x dim)
+        """
+        # tokens = tokens.to(torch.float32)
+        # #positions = positions.to(torch.float32)
+        # assert tokens.size(3)%2==0, "number of dimensions should be a multiple of two"
+        # D = tokens.size(3) // 2
+        # assert positions.ndim==3 and positions.shape[-1] == 2 # Batch, Seq, 2
+        # cos, sin = self.get_cos_sin(D, int(positions.max())+1, tokens.device, tokens.dtype)
+        # # split features into two along the feature dimension, and apply rope1d on each half
+        # y, x = tokens.chunk(2, dim=-1)
+        # y = self.apply_rope1d(y, positions[:,:,0], cos, sin)
+        # x = self.apply_rope1d(x, positions[:,:,1], cos, sin)
+        # tokens = torch.cat((y, x), dim=-1)
+        return tokens
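As committed, the body of forward is left commented out, so the module currently returns tokens unchanged. The commented lines describe the intended 2D scheme: split the head features into two halves and apply 1D RoPE to the first half with the y position and to the second half with the x position. Below is a sketch that re-enables that logic outside the class; it assumes the RoPE2D class from this diff is in scope, and tensor shapes follow the forward docstring.

# Sketch of the 2D application described by the commented-out forward body.
# Assumes the RoPE2D class from the diff above is importable in the current scope;
# since RoPE2D.forward currently passes tokens through unchanged, this helper
# performs the rotation without modifying the class.
import torch

def apply_rope2d(rope, tokens, positions):
    # tokens:    (batch, nheads, ntokens, dim), dim must be even
    # positions: (batch, ntokens, 2) holding the (y, x) position of each token
    assert tokens.size(3) % 2 == 0, "number of dimensions should be a multiple of two"
    D = tokens.size(3) // 2
    assert positions.ndim == 3 and positions.shape[-1] == 2   # Batch, Seq, 2
    cos, sin = rope.get_cos_sin(D, int(positions.max()) + 1, tokens.device, tokens.dtype)
    # first half of the features is rotated by the y position, second half by the x position
    y, x = tokens.chunk(2, dim=-1)
    y = rope.apply_rope1d(y, positions[:, :, 0], cos, sin)
    x = rope.apply_rope1d(x, positions[:, :, 1], cos, sin)
    return torch.cat((y, x), dim=-1)

# Usage with the shapes documented in the forward docstring (values are illustrative):
rope = RoPE2D(freq=100.0)
tokens = torch.randn(2, 8, 16 * 16, 64)                       # batch x nheads x ntokens x dim
ys, xs = torch.meshgrid(torch.arange(16), torch.arange(16), indexing="ij")
positions = torch.stack((ys, xs), dim=-1).reshape(1, -1, 2).expand(2, -1, -1)  # batch x ntokens x 2
out = apply_rope2d(rope, tokens, positions)                    # same shape as tokens

With this split, a head of dimension dim devotes dim/2 channels to the y coordinate and dim/2 to the x coordinate, each rotated exactly as in 1D RoPE.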