Spaces:

RishabA
/

image-captioning

Sleeping

App Files Files Community

RishabA committed on Apr 7

Commit

10c688c

verified ·

1 Parent(s): b34a24b

Upload 4 files

Browse files

Files changed (4) hide show

app.py +87 -0
image_captioning_model.pt +3 -0
model.py +442 -0
requirements.txt +5 -0

app.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import torch
+from torchvision import transforms
+from PIL import Image
+import gradio as gr
+from transformers import AutoTokenizer
+from model import CaptioningTransformer
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+image_size = 128
+patch_size = 8
+d_model = 192
+n_layers = 6
+n_heads = 8
+tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+transform = transforms.Compose(
+    [
+        transforms.Resize(image_size),
+        transforms.CenterCrop(image_size),
+        transforms.ToTensor(),
+        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+    ]
+)
+# Instantiate your model
+model = CaptioningTransformer(
+    image_size=image_size,
+    in_channels=3,  # RGB images
+    vocab_size=tokenizer.vocab_size,
+    device=device,
+    patch_size=patch_size,
+    n_layers=n_layers,
+    d_model=d_model,
+    n_heads=n_heads,
+).to(device)
+# Load your pre-trained weights (make sure the .pt file is in your repo)
+model_path = "image_captioning_model.pt"
+model.load_state_dict(torch.load(model_path, map_location=device))
+model.eval()
+# This is your existing inference function (you can modify as needed)
+def make_prediction(model, sos_token, eos_token, image, max_len, temp, device):
+    log_tokens = [sos_token]  # Start with the start-of-sequence token
+    with torch.inference_mode():
+        # Get image embeddings from the encoder
+        image_embedding = model.encoder(image.to(device))
+        for _ in range(max_len):
+            input_tokens = torch.cat(log_tokens, dim=1)
+            data_pred = model.decoder(input_tokens.to(device), image_embedding)
+            # Get the logits for the most recent token only
+            dist = torch.distributions.Categorical(logits=data_pred[:, -1] / temp)
+            next_tokens = dist.sample().reshape(1, 1)
+            log_tokens.append(next_tokens.cpu())
+            if next_tokens.item() == 102:  # Assuming 102 is your [SEP] token
+                break
+    return torch.cat(log_tokens, dim=1)
+# Define the Gradio prediction function
+def predict(image: Image.Image):
+    # Preprocess the image
+    img_tensor = transform(image).unsqueeze(0)  # Shape: (1, 3, image_size, image_size)
+    # Create a start-of-sequence token (assuming 101 is your [CLS] token)
+    sos_token = 101 * torch.ones(1, 1).long().to(device)
+    # Generate caption tokens using your inference function
+    tokens = make_prediction(
+        model, sos_token, 102, img_tensor, max_len=50, temp=0.5, device=device
+    )
+    # Decode tokens to text (skipping special tokens)
+    caption = tokenizer.decode(tokens[0], skip_special_tokens=True)
+    return caption
+# Create a Gradio interface
+iface = gr.Interface(
+    fn=predict,
+    inputs=gr.Image(type="pil"),
+    outputs="text",
+    title="Image Captioning Model",
+    description="Upload an image and get a caption generated by the model.",
+)
+if __name__ == "__main__":
+    iface.launch()

image_captioning_model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:526b7cb3a1a70d6bb5503629b69e9d664efd0ba8f22a7cc1d035b9a42f6abc24
+size 72371272

model.py ADDED Viewed

	@@ -0,0 +1,442 @@

+import torch
+import torch.nn as nn
+import math
+class PatchEmbedding(nn.Module):
+    def __init__(self, in_channels: int = 3, patch_size: int = 16, d_model: int = 128):
+        super().__init__()
+        self.patch_size = patch_size
+        self.d_model = d_model
+        self.unfold = nn.Unfold(kernel_size=patch_size, stride=patch_size)
+        self.proj = nn.Linear(in_channels * patch_size * patch_size, d_model)
+    def forward(self, x):
+        batch_size, c, h, w = x.shape
+        # Unfold to extract patches: shape becomes (batch_size, in_channels * patch_size * patch_size, num_patches)
+        # num_patches = (H / patch_size) * (W / patch_size)
+        patches = self.unfold(x)
+        # Transpose to (batch_size, num_patches, in_channels * patch_size * patch_size)
+        patches = patches.transpose(1, 2)
+        # Apply linear projection to each patch: (batch_size, num_patches, in_channels * patch_size * patch_size) -> (batch_size, num_patches, d_model)
+        return self.proj(patches)
+# Positional Encoding
+class PositionalEncoding(nn.Module):
+    def __init__(self, d_model: int):
+        """
+        d_model: dimensions of the embeddings (number of values in each embedding vector)
+        """
+        super().__init__()
+        # Intead of precomputing fixed values, we will compute in the forward pass based off of the sinusodiual encoding formula
+        self.d_model = d_model
+    def forward(self, x):
+        device = x.device
+        half_dim = self.d_model // 2  # Use half for sin and half for cos
+        emb = math.log(10000.0) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
+        emb = x[:, None] * emb[None, :]  # (batch_size, half_dim)
+        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
+        return emb
+# Multi-Head Self-Attention
+class MultiHeadAttention(nn.Module):
+    def __init__(self, d_model: int = 512, n_heads: int = 8, dropout: float = 0.1):
+        """
+        d_model: dimensions of the embeddings (number of values in each embedding vector)
+        n_heads: number of self attention heads per sequence
+        dropout: probability of dropout
+        """
+        super().__init__()
+        assert (
+            d_model % n_heads == 0
+        )  # We want to make sure that the dimensions are split evenly among the attention heads.
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.d_key = d_model // n_heads
+        self.Wq = nn.Linear(d_model, d_model)  # Learnable weights for query
+        self.Wk = nn.Linear(d_model, d_model)  # Learnable weights for key
+        self.Wv = nn.Linear(d_model, d_model)  # Learnable weights for value
+        self.Wo = nn.Linear(d_model, d_model)  # Learnable weights for output
+        self.dropout = nn.Dropout(p=dropout)
+    def forward(self, query, key, value, mask=None):
+        """
+        query: (batch_size, q_length, d_model)
+        key: (batch_size, k_length, d_model)
+        value: (batch_size, s_length, d_model)
+        """
+        batch_size = key.size(0)
+        # Matrix multiplication for Q, K, and V tensors
+        Q = self.Wq(query)
+        K = self.Wk(key)
+        V = self.Wv(value)
+        # Split each tensor into heads
+        Q = Q.view(batch_size, -1, self.n_heads, self.d_key).permute(
+            0, 2, 1, 3
+        )  # (batch_size, n_heads, q_length, d_key)
+        K = K.view(batch_size, -1, self.n_heads, self.d_key).permute(
+            0, 2, 1, 3
+        )  # (batch_size, n_heads, k_length, d_key)
+        V = V.view(batch_size, -1, self.n_heads, self.d_key).permute(
+            0, 2, 1, 3
+        )  # (batch_size, n_heads, v_length, d_key)
+        # Scaled dot product
+        # K^T becomees (batch_size, n_heads, d_key, k_length)
+        scaled_dot_product = torch.matmul(Q, K.permute(0, 1, 3, 2)) / math.sqrt(
+            self.d_key
+        )  # (batch_size, n_heads, q_length, k_length)
+        if mask is not None:
+            scaled_dot_product = scaled_dot_product.masked_fill(
+                mask == 0, -float("inf")
+            )  # Filling it with 0 would result in 1 after the mask because e^0 = 1. Intead we fill it with an infinitley large negative number
+        # Softmax function for attention probabilities
+        attention_probs = torch.softmax(scaled_dot_product, dim=-1)
+        # Multiply by V to get attention with respect to the values
+        A = torch.matmul(self.dropout(attention_probs), V)
+        # Reshape attention back to (batch_size, q_length, d_model)
+        A = (
+            A.permute(0, 2, 1, 3)
+            .contiguous()
+            .view(batch_size, -1, self.n_heads * self.d_key)
+        )
+        # Pass through the final linear layer
+        output = self.Wo(A)
+        return (
+            output,
+            attention_probs,
+        )  # Output shape: (batch_size, q_length, d_model), Attention probs shape: (batch_size, n_heads, q_length, k_length)
+# Position-Wise Feed Forward Network (FFN)
+class PositionwiseFeedForward(nn.Module):
+    def __init__(self, d_model: int, dropout: float = 0.1):
+        """
+        d_model: dimensions of the embeddings (number of values in each embedding vector)
+        dropout: probability of dropout
+        """
+        super().__init__()
+        self.ffn = nn.Sequential(
+            nn.Linear(in_features=d_model, out_features=(d_model * 4)),
+            nn.GELU(),
+            nn.Linear(in_features=(d_model * 4), out_features=d_model),
+            nn.Dropout(p=dropout),
+        )
+    def forward(self, x):
+        return self.ffn(x)
+# Encoder Layer
+class EncoderLayer(nn.Module):
+    def __init__(self, d_model: int, n_heads: int, dropout: float = 0.1):
+        """
+        d_model: dimensions of the embeddings (number of values in each embedding vector)
+        n_heads: number of self attention heads per sequence
+        dropout: probability of dropout
+        """
+        super().__init__()
+        # Multi-Head Self-Attention sublayer
+        self.attention = MultiHeadAttention(
+            d_model=d_model, n_heads=n_heads, dropout=dropout
+        )
+        self.attention_layer_norm = nn.LayerNorm(d_model)  # Layer normalization
+        # Position-wise Feed-forward Network
+        self.position_wise_ffn = PositionwiseFeedForward(
+            d_model=d_model, dropout=dropout
+        )
+        self.ffn_layer_norm = nn.LayerNorm(d_model)  # Layer normalization
+        self.dropout = nn.Dropout(p=dropout)
+    def forward(self, src):
+        """
+        src: embedded sequences (batch_size, seq_length, d_model)
+        """
+        # Multi-Head Attention
+        _src, attention_probs = self.attention(
+            src, src, src, None
+        )  # Q, K, V, src_mask: we don't need a source mask because all images are the same dimension
+        # Residual Addition and Layer Normalization
+        src = self.attention_layer_norm(
+            src + self.dropout(_src)
+        )  # We do residual addition by adding back the src (the embeddings) to the output of Self-Attention
+        # Position-wise Feed-forward Network
+        _src = self.position_wise_ffn(src)
+        # Residual Addition and Layer Normalization
+        src = self.ffn_layer_norm(src + self.dropout(_src))
+        return src, attention_probs
+# The Encoder that takes in images and returns the encoding to be passed into the decoder
+class Encoder(nn.Module):
+    def __init__(
+        self,
+        image_size: int,
+        in_channels: int,
+        patch_size: int = 16,
+        d_model: int = 128,
+        n_layers: int = 3,
+        n_heads: int = 4,
+        dropout: float = 0.1,
+    ):
+        """
+        d_model: dimensions of the embeddings (number of values in each embedding vector)
+        n_layers: number of encoder layers in the encoder block
+        n_heads: number of self attention heads per sequence
+        dropout: probability of dropout
+        """
+        super().__init__()
+        self.patch_size = patch_size
+        self.patch_emb = PatchEmbedding(
+            patch_size=patch_size, in_channels=in_channels, d_model=d_model
+        )
+        seq_length = (image_size // patch_size) ** 2
+        # Image src is going to use a learnable positional encoding
+        self.pos_embedding = nn.Parameter(
+            torch.empty(1, seq_length, d_model).normal_(std=0.02)
+        )
+        # Create n_layers encoders
+        self.layers = nn.ModuleList(
+            [
+                EncoderLayer(d_model=d_model, n_heads=n_heads, dropout=dropout)
+                for layer in range(n_layers)
+            ]
+        )
+        self.dropout = nn.Dropout(p=dropout)
+    def forward(self, src):
+        """
+        src: embedded sequences (batch_size, seq_length, d_model)
+        """
+        # Extract the patches and apply a linear layer
+        batch_size = src.shape[0]
+        src = self.patch_emb(src)
+        # Add the learned positional embedding
+        src = src + self.pos_embedding
+        # Pass the sequences through each encoder layer
+        for layer in self.layers:
+            src, attention_probs = layer(src)
+        self.attention_probs = attention_probs
+        return src
+# Decoder Layer
+class DecoderLayer(nn.Module):
+    def __init__(self, d_model: int, n_heads: int, dropout: float = 0.1):
+        """
+        d_model: dimensions of the embeddings (number of values in each embedding vector)
+        n_heads: number of self attention heads per sequence
+        dropout: probability of dropout
+        """
+        super().__init__()
+        # Masked Multi-Head Self-Attention sublayer
+        self.masked_attention = MultiHeadAttention(
+            d_model=d_model, n_heads=n_heads, dropout=dropout
+        )
+        self.masked_attention_layer_norm = nn.LayerNorm(d_model)  # Layer normalization
+        # Multi-Head Self-Attention sublayer
+        self.attention = MultiHeadAttention(
+            d_model=d_model, n_heads=n_heads, dropout=dropout
+        )
+        self.attention_layer_norm = nn.LayerNorm(d_model)  # Layer normalization
+        # Position-wise Feed-forward Network
+        self.position_wise_ffn = PositionwiseFeedForward(
+            d_model=d_model, dropout=dropout
+        )
+        self.ffn_layer_norm = nn.LayerNorm(d_model)  # Layer normalization
+        self.dropout = nn.Dropout(p=dropout)
+    def forward(self, trg, src, trg_mask):
+        """
+        trg: embedded captions (batch_size, trg_seq_length, d_model)
+        src: embedded images (batch_size, src_seq_length, d_model)
+        trg_mask: mask for the captions preventing peeking at future tokens (batch_size, 1, trg_seq_length, trg_seq_length)
+        """
+        # Masked Multi-Head Attention
+        # The target mask is used to prevent the model from seeing future tokens. This ensures that the prediction is made solely based on past and present tokens.
+        _trg, masked_attention_probs = self.masked_attention(
+            trg, trg, trg, trg_mask
+        )  # Q, K, V, mask
+        # Residual Addition and Layer Normalization
+        trg = self.masked_attention_layer_norm(trg + self.dropout(_trg))
+        # Multi-Head Attention - This time, we also pass in the output of the encoder layers as src.
+        # This is important because this allows us to keep track of and learn relationships between the input and output tokens.
+        _trg, attention_probs = self.attention(trg, src, src, None)  # Q, K, V, mask
+        # Residual Addition and Layer Normalization
+        trg = self.attention_layer_norm(trg + self.dropout(_trg))
+        # Position-wise Feed-forward Network
+        _trg = self.position_wise_ffn(trg)
+        # Residual Addition and Layer Normalization
+        trg = self.ffn_layer_norm(trg + self.dropout(_trg))
+        return trg, attention_probs, masked_attention_probs
+# The Decoder Module that takes the encoded images from the encoder and generates captions
+class Decoder(nn.Module):
+    def __init__(
+        self,
+        vocab_size: int,
+        d_model: int = 128,
+        n_layers: int = 3,
+        n_heads: int = 4,
+        dropout: float = 0.1,
+    ):
+        """
+        vocab_size: size of the target vocabulary
+        d_model: dimensions of the embeddings (number of values in each embedding vector)
+        n_layers: number of encoder layers in the encoder block
+        n_heads: number of self attention heads per sequence
+        dropout: probability of dropout
+        """
+        super().__init__()
+        self.embedding = nn.Embedding(vocab_size, d_model)
+        self.embedding.weight.data = 0.001 * self.embedding.weight.data
+        # Initialize sinusoidal positional embeddings
+        self.pos_emb = PositionalEncoding(d_model=d_model)
+        # Create n_layers decoders
+        self.layers = nn.ModuleList(
+            [
+                DecoderLayer(d_model=d_model, n_heads=n_heads, dropout=dropout)
+                for layer in range(n_layers)
+            ]
+        )
+        self.dropout = nn.Dropout(p=dropout)
+        # Output layer
+        self.Wo = nn.Linear(in_features=d_model, out_features=vocab_size)
+    def make_trg_mask(self, trg):
+        seq_length = trg.shape[1]
+        trg_mask = torch.tril(
+            torch.ones((seq_length, seq_length), device=trg.device)
+        ).bool()
+        return trg_mask.unsqueeze(0).unsqueeze(
+            0
+        )  # (batch_size=1, n_heads=1, seq_length, seq_length)
+    def forward(self, trg, src):
+        """
+        trg: target sequences (batch_size, trg_seq_length, d_model)
+        src: embedding images (batch_size, src_seq_length, d_model)
+        """
+        # Embed the target captions
+        trg = self.embedding(trg)
+        batch_size, l, h = trg.shape
+        trg_index = torch.arange(l, device=trg.device)
+        pos_emb = self.pos_emb(trg_index).reshape(1, l, h).expand(batch_size, l, h)
+        # Add the fixed sinusodial positional embedding
+        trg += pos_emb
+        # Create a target mask for the target captions to prevent the model from peeking at future tokens
+        trg_mask = self.make_trg_mask(
+            trg
+        )  # (batch_size, 1, trg_seq_length, trg_seq_length)
+        # Pass the sequences through each decoder layer
+        for layer in self.layers:
+            trg, attention_probs, masked_attention_probs = layer(trg, src, trg_mask)
+        self.attention_probs = attention_probs
+        self.masked_attention_probs = masked_attention_probs  # (batch_size, n_heads, trg_seq_len, src_seq_len) trg_seq_len: length of the target caption \ src_seq_len: number of patches from the encoder
+        # Final linear output layer
+        return self.Wo(trg)
+class CaptioningTransformer(nn.Module):
+    def __init__(
+        self,
+        image_size: int,
+        in_channels: int,
+        vocab_size: int,
+        device,
+        patch_size: int = 16,
+        d_model: int = 128,
+        n_layers: int = 3,
+        n_heads: int = 4,
+    ):
+        super().__init__()
+        self.device = device
+        # Create an encoder and decoder with specified parameters
+        self.encoder = Encoder(
+            image_size=image_size,
+            in_channels=in_channels,
+            patch_size=patch_size,
+            d_model=d_model,
+            n_layers=n_layers,
+            n_heads=n_heads,
+        )
+        self.decoder = Decoder(
+            vocab_size=vocab_size, d_model=d_model, n_layers=n_layers, n_heads=n_heads
+        )
+    def forward(self, src, trg):
+        # Encoder layers
+        src = self.encoder(src)  # (batch_size, src_seq_length, d_model)
+        # Decoder layers
+        output = self.decoder(
+            trg, src
+        )  # Pass in both the target (for Masked Multi-Head Self-Attention) and source for (Cross-Attention)
+        return output

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+torch
+torchvision
+transformers
+gradio
+Pillow