Spaces:

RishabA
/

image-captioning

Sleeping

App Files Files Community

RishabA committed on Apr 7

Commit

42296ae

verified ·

1 Parent(s): 10c688c

Update model.py

Browse files

Files changed (1) hide show

model.py +13 -19

model.py CHANGED Viewed

@@ -3,28 +3,23 @@ import torch.nn as nn
 import math
-class PatchEmbedding(nn.Module):
-    def __init__(self, in_channels: int = 3, patch_size: int = 16, d_model: int = 128):
         super().__init__()
         self.patch_size = patch_size
-        self.d_model = d_model
         self.unfold = nn.Unfold(kernel_size=patch_size, stride=patch_size)
-        self.proj = nn.Linear(in_channels * patch_size * patch_size, d_model)
     def forward(self, x):
         batch_size, c, h, w = x.shape
-        # Unfold to extract patches: shape becomes (batch_size, in_channels * patch_size * patch_size, num_patches)
-        # num_patches = (H / patch_size) * (W / patch_size)
-        patches = self.unfold(x)
-        # Transpose to (batch_size, num_patches, in_channels * patch_size * patch_size)
-        patches = patches.transpose(1, 2)
-        # Apply linear projection to each patch: (batch_size, num_patches, in_channels * patch_size * patch_size) -> (batch_size, num_patches, d_model)
-        return self.proj(patches)
 # Positional Encoding
@@ -139,7 +134,7 @@ class PositionwiseFeedForward(nn.Module):
         self.ffn = nn.Sequential(
             nn.Linear(in_features=d_model, out_features=(d_model * 4)),
-            nn.GELU(),
             nn.Linear(in_features=(d_model * 4), out_features=d_model),
             nn.Dropout(p=dropout),
         )
@@ -218,9 +213,8 @@ class Encoder(nn.Module):
         self.patch_size = patch_size
-        self.patch_emb = PatchEmbedding(
-            patch_size=patch_size, in_channels=in_channels, d_model=d_model
-        )
         seq_length = (image_size // patch_size) ** 2
@@ -245,7 +239,7 @@ class Encoder(nn.Module):
         # Extract the patches and apply a linear layer
         batch_size = src.shape[0]
-        src = self.patch_emb(src)
         # Add the learned positional embedding
         src = src + self.pos_embedding

 import math
+class ExtractPatches(nn.Module):
+    def __init__(self, patch_size: int = 16):
         super().__init__()
         self.patch_size = patch_size
         self.unfold = nn.Unfold(kernel_size=patch_size, stride=patch_size)
     def forward(self, x):
         batch_size, c, h, w = x.shape
+        # Unfold applies a sliding window to generate patches
+        # The transpose and reshape change the shape to (batch_size, num_patches, c * patch_size * patch_size), flattening each patch (c is the channel count read from x.shape)
+        return (
+            self.unfold(x)
+            .transpose(1, 2)
+            .reshape(batch_size, -1, c * self.patch_size * self.patch_size)
+        )
 # Positional Encoding
         self.ffn = nn.Sequential(
             nn.Linear(in_features=d_model, out_features=(d_model * 4)),
+            nn.ReLU(),
             nn.Linear(in_features=(d_model * 4), out_features=d_model),
             nn.Dropout(p=dropout),
         )
         self.patch_size = patch_size
+        self.extract_patches = ExtractPatches(patch_size=patch_size)
+        self.fc_in = nn.Linear(in_channels * patch_size * patch_size, d_model)
         seq_length = (image_size // patch_size) ** 2
         # Extract the patches and apply a linear layer
         batch_size = src.shape[0]
+        src = self.fc_in(self.extract_patches(src))
         # Add the learned positional embedding
         src = src + self.pos_embedding