ariG23498 (HF Staff) committed on
Commit f2c2a4e · 1 Parent(s): 1d47577
.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ .venv
2
+ __pycache__
app.py ADDED
@@ -0,0 +1,159 @@
1
+ import gradio as gr
2
+ import spaces
3
+ import torch
4
+ from PIL import Image
5
+
6
+ # Set random seeds for reproducibility
7
+ torch.manual_seed(0)
8
+ if torch.cuda.is_available():
9
+ torch.cuda.manual_seed_all(0)
10
+
11
+ from models.vision_language_model import VisionLanguageModel
12
+ from data.processors import get_tokenizer, get_image_processor
13
+
14
+
15
+ @spaces.GPU
16
+ def generate_outputs(image, query):
17
+ # Determine device
18
+ if torch.cuda.is_available():
19
+ device = torch.device("cuda")
20
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
21
+ device = torch.device("mps")
22
+ else:
23
+ device = torch.device("cpu")
24
+
25
+ # Load model
26
+ hf_model = "lusxvr/nanoVLM-222M"
27
+ try:
28
+ model = VisionLanguageModel.from_pretrained(hf_model).to(device)
29
+ model.eval()
30
+ except Exception as e:
31
+ return f"Error loading model: {str(e)}", None, None, None, None
32
+
33
+ # Load tokenizer and image processor
34
+ try:
35
+ tokenizer = get_tokenizer(model.cfg.lm_tokenizer)
36
+ image_processor = get_image_processor(model.cfg.vit_img_size)
37
+ except Exception as e:
38
+ return f"Error loading tokenizer or image processor: {str(e)}", None, None, None, None
39
+
40
+ # Prepare text input
41
+ template = f"Question: {query} Answer:"
42
+ encoded = tokenizer.batch_encode_plus([template], return_tensors="pt")
43
+ tokens = encoded["input_ids"].to(device)
44
+
45
+ # Process image
46
+ try:
47
+ img = image.convert("RGB")
48
+ img_t = image_processor(img).unsqueeze(0).to(device)
49
+ except Exception as e:
50
+ return f"Error processing image: {str(e)}", None, None, None, None
51
+
52
+ # Generate four outputs
53
+ outputs = []
54
+ max_new_tokens = 50 # Fixed value from provided script
55
+ try:
56
+ for _ in range(4):
57
+ gen = model.generate(tokens, img_t, max_new_tokens=max_new_tokens)
58
+ out = tokenizer.batch_decode(gen, skip_special_tokens=True)[0]
59
+ outputs.append(out)
60
+ except Exception as e:
61
+ return f"Error during generation: {str(e)}", None, None, None, None
62
+
63
+ return None, outputs[0], outputs[1], outputs[2], outputs[3]
64
+
65
+
66
+ def main():
67
+ # Define minimal CSS for subtle aesthetic enhancements
68
+ css = """
69
+ .gradio-container {
70
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
71
+ padding: 20px;
72
+ }
73
+ h1 {
74
+ color: #333;
75
+ text-align: center;
76
+ margin-bottom: 20px;
77
+ }
78
+ .description {
79
+ margin-bottom: 20px;
80
+ line-height: 1.6;
81
+ }
82
+ .gr-button {
83
+ padding: 10px 20px;
84
+ }
85
+ """
86
+
87
+ # Define Gradio interface
88
+ with gr.Blocks(css=css, title="nanoVLM Image-to-Text Generator") as app:
89
+ gr.Markdown(
90
+ "# nanoVLM Image-to-Text Generator"
91
+ )
92
+ gr.Markdown(
93
+ """
94
+ <div class="description">
95
+ This demo showcases <b>nanoVLM</b>, a lightweight vision-language model by Hugging Face.
96
+ Upload an image and provide a query to generate four text descriptions.
97
+ The model is based on the <a href="https://github.com/huggingface/nanoVLM/" target="_blank">nanoVLM repository</a>
98
+ and uses the pretrained model <a href="https://huggingface.co/lusxvr/nanoVLM-222M" target="_blank">lusxvr/nanoVLM-222M</a>.
99
+ nanoVLM is designed for efficient image-to-text generation, ideal for resource-constrained environments.
100
+ </div>
101
+ """
102
+ )
103
+
104
+ with gr.Row():
105
+ with gr.Column():
106
+ image_input = gr.Image(
107
+ type="pil",
108
+ label="Upload Image",
109
+ value="cat.jpg" # Set example image
110
+ )
111
+ query_input = gr.Textbox(
112
+ label="Query",
113
+ value="What is this?",
114
+ placeholder="Enter your query here",
115
+ lines=2
116
+ )
117
+ submit_button = gr.Button("Generate")
118
+
119
+ with gr.Column():
120
+ error_output = gr.Textbox(
121
+ label="Errors (if any)",
122
+ placeholder="No errors",
123
+ visible=True,
124
+ interactive=False
125
+ )
126
+ output1 = gr.Textbox(
127
+ label="Generation 1",
128
+ placeholder="Output 1 will appear here...",
129
+ lines=3
130
+ )
131
+ output2 = gr.Textbox(
132
+ label="Generation 2",
133
+ placeholder="Output 2 will appear here...",
134
+ lines=3
135
+ )
136
+ output3 = gr.Textbox(
137
+ label="Generation 3",
138
+ placeholder="Output 3 will appear here...",
139
+ lines=3
140
+ )
141
+ output4 = gr.Textbox(
142
+ label="Generation 4",
143
+ placeholder="Output 4 will appear here...",
144
+ lines=3
145
+ )
146
+
147
+ # Define action on submit
148
+ submit_button.click(
149
+ fn=generate_outputs,
150
+ inputs=[image_input, query_input],
151
+ outputs=[error_output, output1, output2, output3, output4]
152
+ )
153
+
154
+ # Launch the app
155
+ app.launch()
156
+
157
+
158
+ if __name__ == "__main__":
159
+ main()
cat.jpg ADDED

Git LFS Details

  • SHA256: dea9e7ef97386345f7cff32f9055da4982da5471c48d575146c796ab4563b04e
  • Pointer size: 131 Bytes
  • Size of remote file: 173 kB
data/__init__.py ADDED
File without changes
data/collators.py ADDED
@@ -0,0 +1,106 @@
1
+ import torch
2
+
3
+ class VQACollator(object): # Visual Question Answering Collator
4
+ def __init__(self, tokenizer, max_length):
5
+ self.tokenizer = tokenizer
6
+ self.max_length = max_length
7
+
8
+ def __call__(self, batch):
9
+ images = [item["image"] for item in batch]
10
+ texts = [item["text_data"] for item in batch]
11
+ answers = [item["answer"] for item in batch]
12
+
13
+ # Stack images
14
+ images = torch.stack(images)
15
+
16
+ # Create inputs by concatenating the question and answer
17
+ input_sequences = []
18
+ for i in range(len(texts)):
19
+ input_sequences.append(f"{texts[i]}{answers[i]}")
20
+
21
+ encoded_full_sequences = self.tokenizer.batch_encode_plus(
22
+ input_sequences,
23
+ padding="max_length",
24
+ padding_side="left",
25
+ return_tensors="pt",
26
+ truncation=True,
27
+ max_length=self.max_length,
28
+ )
29
+
30
+ # Create labels where only answer tokens are predicted
31
+ input_ids = encoded_full_sequences["input_ids"]
32
+ attention_mask = encoded_full_sequences["attention_mask"]
33
+ labels = input_ids.clone()
34
+ labels[:, :-1] = input_ids[:, 1:].clone()
35
+ labels[:, -1] = -100 #self.tokenizer.pad_token_id
36
+
37
+ # The tokenizer has different behavior for padding and truncation:
38
+ # 1. If the full text (question + answer) is shorter than the max length, it gets padded on the left
39
+ # 2. If the full text is longer than the max length, it gets truncated on the right
40
+ # Therefore, I need to handle multiple cases; these are the different scenarios:
41
+ # If the full text is longer than the max length, we need to set the labels to -100 for the whole sample (we want to ignore the whole sample)
42
+ # If the full text is shorter than the max length, we need to set the labels to -100 only for the question part, and create causal language modeling labels for the answer part, taking into account the padding
43
+
44
+ # Determine if sequences were truncated
45
+ original_lengths = [len(self.tokenizer.encode(seq)) for seq in input_sequences]
46
+
47
+ for i in range(len(batch)):
48
+ # Get the length of the question for this sample
49
+ question_length = len(self.tokenizer.encode(texts[i], add_special_tokens=False))
50
+
51
+ # Case 1: If sequence was truncated (original is longer than max_length)
52
+ if original_lengths[i] > self.max_length:
53
+ # Set all labels to -100 to ignore this sample entirely
54
+ labels[i, :] = -100
55
+ #print(f"Sample {i} was truncated. Setting all labels to -100.")
56
+ continue
57
+
58
+ # Case 2: Sequence fits within max_length
59
+ # Use attention mask to find first non-padding token
60
+ # The first 1 in the attention mask marks the first non-padding token
61
+ first_token_pos = attention_mask[i].nonzero(as_tuple=True)[0][0].item()
62
+
63
+ # Set labels for padding and question part to -100 (don't predict these), subtracting 1 to account for the left shift
64
+ question_end = first_token_pos + question_length - 1
65
+ labels[i, :question_end] = -100
66
+ # labels[i, original_lengths[i]-1:] = -100 # If you are using right padding
67
+
68
+ return {
69
+ "image": images,
70
+ "input_ids": input_ids,
71
+ "attention_mask": attention_mask,
72
+ "labels": labels
73
+ }
74
+
75
+ class MMStarCollator(object): # https://huggingface.co/datasets/Lin-Chen/MMStar
76
+ def __init__(self, tokenizer):
77
+ self.tokenizer = tokenizer
78
+
79
+ def __call__(self, batch):
80
+ images = [item["image"] for item in batch]
81
+ questions = [item["text_data"] for item in batch]
82
+ answers = [item["answer"] for item in batch]
83
+
84
+ # Stack images
85
+ images = torch.stack(images)
86
+
87
+ encoded_question_sequences = self.tokenizer.batch_encode_plus(
88
+ questions,
89
+ padding=True,
90
+ padding_side="left",
91
+ return_tensors="pt"
92
+ )
93
+
94
+ encoded_answer_sequences = self.tokenizer.batch_encode_plus(
95
+ answers,
96
+ padding=True,
97
+ padding_side="left",
98
+ return_tensors="pt"
99
+ )
100
+
101
+ return {
102
+ "images": images,
103
+ "input_ids": encoded_question_sequences['input_ids'],
104
+ "attention_mask": encoded_question_sequences['attention_mask'],
105
+ "labels": encoded_answer_sequences['input_ids'],
106
+ }
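To make the label-masking logic in `VQACollator` above concrete, here is a minimal sketch with made-up token ids (no real tokenizer): a 3-token question and a 2-token answer, left-padded to a max length of 8. It mirrors the same steps as the collator: shift the labels one position to the left, then mask the padding and the question part with -100 so that only the answer tokens contribute to the loss.

```python
import torch

# Toy values for illustration only: pad id = 0, question = [11, 12, 13], answer = [21, 22]
input_ids      = torch.tensor([[0, 0, 0, 11, 12, 13, 21, 22]])  # left-padded question + answer
attention_mask = torch.tensor([[0, 0, 0,  1,  1,  1,  1,  1]])
question_length = 3

# Shift labels one position to the left (next-token prediction)
labels = input_ids.clone()
labels[:, :-1] = input_ids[:, 1:].clone()
labels[:, -1] = -100

# Mask padding + question (minus 1 to account for the shift); only the answer labels remain
first_token_pos = attention_mask[0].nonzero(as_tuple=True)[0][0].item()  # -> 3
labels[0, :first_token_pos + question_length - 1] = -100

print(labels)  # tensor([[-100, -100, -100, -100, -100, 21, 22, -100]])
```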
data/datasets.py ADDED
@@ -0,0 +1,92 @@
1
+ import torch
2
+ from PIL import Image
3
+ from torch.utils.data import Dataset
4
+
5
+ import models.config as cfg
6
+
7
+
8
+ class VQADataset(Dataset): # Visual Question Answering Dataset
9
+ def __init__(self, dataset, tokenizer, image_processor):
10
+ self.dataset = dataset
11
+ self.tokenizer = tokenizer
12
+ self.image_processor = image_processor
13
+
14
+ def __len__(self):
15
+ return len(self.dataset)
16
+
17
+ def __getitem__(self, idx):
18
+ item = self.dataset[idx]
19
+
20
+ # Handle image (it's a list)
21
+ image_data = item['images']
22
+ if isinstance(image_data, list) and len(image_data) > 0:
23
+ image = image_data[0]
24
+ else:
25
+ image = image_data
26
+
27
+ # Now process the image
28
+ if isinstance(image, Image.Image):
29
+ if image.mode != 'RGB':
30
+ image = image.convert('RGB')
31
+ processed_image = self.image_processor(image)
32
+ else:
33
+ print(f"Error processing image at index {idx}")
34
+ # Create empty tensor with right dimensions as fallback
35
+ processed_image = torch.zeros(
36
+ 3, cfg.VLMConfig.vit_img_size, cfg.VLMConfig.vit_img_size)
37
+
38
+ # Process text (also a list)
39
+ text_data = item['texts']
40
+ if isinstance(text_data, list) and len(text_data) > 0:
41
+ text = text_data[0]
42
+ else:
43
+ text = text_data
44
+
45
+ question = text['user']
46
+ # Add EOS token to the answer to train the model to predict it, enabling correct stopping during generation
47
+ answer = text['assistant'] + self.tokenizer.eos_token
48
+
49
+ formatted_text = f"Question: {question} Answer:"
50
+
51
+ return {
52
+ "image": processed_image,
53
+ "text_data": formatted_text,
54
+ "answer": answer
55
+ }
56
+
57
+
58
+ class MMStarDataset(Dataset): # https://huggingface.co/datasets/Lin-Chen/MMStar
59
+ def __init__(self, dataset, tokenizer, image_processor):
60
+ self.dataset = dataset
61
+ self.tokenizer = tokenizer
62
+ self.image_processor = image_processor
63
+
64
+ def __len__(self):
65
+ return len(self.dataset)
66
+
67
+ def __getitem__(self, idx):
68
+ item = self.dataset[idx]
69
+
70
+ image = item['image']
71
+
72
+ # Now process the image
73
+ if isinstance(image, Image.Image):
74
+ if image.mode != 'RGB':
75
+ image = image.convert('RGB')
76
+ processed_image = self.image_processor(image)
77
+ else:
78
+ print(f"Error processing image at index {idx}")
79
+ # Create empty tensor with right dimensions as fallback
80
+ processed_image = torch.zeros(3, cfg.VLMConfig.vit_img_size, cfg.VLMConfig.vit_img_size)
81
+
82
+ question = item['question']
83
+ answer = item['answer'] + self.tokenizer.eos_token # Add EOS token to the answer to train the model to predict it, enabling correct stopping during generation
84
+
85
+ formatted_text = f"Question: {question} \nAnswer only with the letter! \nAnswer:"
86
+
87
+ return {
88
+ "image": processed_image,
89
+ "text_data": formatted_text,
90
+ "answer": answer
91
+ }
92
+
data/processors.py ADDED
@@ -0,0 +1,17 @@
1
+ from transformers import AutoTokenizer
2
+ import torchvision.transforms as transforms
3
+
4
+ TOKENIZERS_CACHE = {}
5
+
6
+ def get_tokenizer(name):
7
+ if name not in TOKENIZERS_CACHE:
8
+ tokenizer = AutoTokenizer.from_pretrained(name, use_fast=True)
9
+ tokenizer.pad_token = tokenizer.eos_token
10
+ TOKENIZERS_CACHE[name] = tokenizer
11
+ return TOKENIZERS_CACHE[name]
12
+
13
+ def get_image_processor(img_size):
14
+ return transforms.Compose([
15
+ transforms.Resize((img_size, img_size)),
16
+ transforms.ToTensor()
17
+ ])
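A quick usage sketch for the two helpers above, using the tokenizer name and image size that `models/config.py` in this commit defaults to (and the `cat.jpg` added here); it assumes the repository root is on the Python path.

```python
from PIL import Image
from data.processors import get_tokenizer, get_image_processor

tokenizer = get_tokenizer("HuggingFaceTB/cosmo2-tokenizer")  # cached across repeat calls
image_processor = get_image_processor(224)                   # Resize to 224x224 + ToTensor

tokens = tokenizer.batch_encode_plus(["Question: What is this? Answer:"], return_tensors="pt")["input_ids"]
pixels = image_processor(Image.open("cat.jpg").convert("RGB")).unsqueeze(0)  # shape (1, 3, 224, 224)
```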
models/README.md ADDED
@@ -0,0 +1,23 @@
1
+ # Models
2
+
3
+ ## Vision Backbone (ViT)
4
+
5
+ This is a very lightweight Vision Transformer in native PyTorch. I took inspiration from the following sources:
6
+ - https://github.com/karpathy/nanoGPT (General Transformer Decoder)
7
+ - https://arxiv.org/abs/2010.11929 (ViT Paper)
8
+ - https://arxiv.org/abs/2303.15343 (SigLiP Paper)
9
+ - https://github.com/huggingface/transformers/blob/main/src/transformers/models/siglip/modeling_siglip.py (HF SigLiP Implementation)
10
+
11
+ ## Language Model (Llama / SmolLM)
12
+
13
+ This is a decoder-only LM, following the Llama 2/3 architecture. It takes inspiration from the following sources:
14
+ - https://arxiv.org/pdf/2307.09288 (Original Llama Paper)
15
+ - https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py (HF Llama Implementation)
16
+
17
+ ## Modality Projection
18
+
19
+ This is a simple MLP (a single linear layer) for the Modality Projection from the Image Patch Encodings into the Language Embedding Space, combined with a simple Pixel Shuffle (https://arxiv.org/pdf/2504.05299).
20
+
21
+ ## Vision-Language-Model
22
+
23
+ This brings all the individual parts together and handles the concatenation of images and text. It is built as a simplified version of SmolVLM (https://arxiv.org/pdf/2504.05299).
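As a rough shape sanity check for the projection described above, assuming the default `VLMConfig` values in this commit (224x224 images, 16x16 patches, pixel-shuffle factor 2, ViT width 768, LM width 576):

```python
img_size, patch_size, shuffle = 224, 16, 2
vit_dim, lm_dim = 768, 576

num_patches = (img_size // patch_size) ** 2        # 196 patch embeddings of width 768
num_image_tokens = num_patches // (shuffle ** 2)   # 49 tokens after the pixel shuffle
proj_in_dim = vit_dim * shuffle ** 2               # 3072, projected down to lm_dim = 576

print(num_patches, num_image_tokens, proj_in_dim)  # 196 49 3072
```

Those 49 image tokens are also why `lm_max_length` in `models/config.py` is set to `128 - 49`.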
models/__init__.py ADDED
File without changes
models/config.py ADDED
@@ -0,0 +1,57 @@
1
+ from dataclasses import dataclass
2
+
3
+
4
+ @dataclass
5
+ class VLMConfig:
6
+ vit_hidden_dim: int = 768
7
+ vit_inter_dim: int = 4 * vit_hidden_dim
8
+ vit_patch_size: int = 16
9
+ vit_img_size: int = 224
10
+ vit_n_heads: int = 12
11
+ vit_dropout: float = 0.0
12
+ vit_n_blocks: int = 12
13
+ vit_ln_eps: float = 1e-6
14
+ vit_cls_flag: bool = False
15
+ vit_model_type: str = 'google/siglip-base-patch16-224'
16
+
17
+ lm_hidden_dim: int = 576
18
+ lm_inter_dim: int = 1536
19
+ lm_rms_eps: float = 1e-5
20
+ lm_re_base: int = 100000
21
+ lm_max_position_embeddings: int = 8192
22
+ lm_vocab_size: int = 49152
23
+ lm_n_heads: int = 9
24
+ lm_n_kv_heads: int = 3
25
+ lm_dropout: float = 0.0
26
+ lm_n_blocks: int = 30
27
+ lm_attn_scaling: float = 1.0
28
+ lm_max_length: int = 128 - 49 # Deduct the image token length to achieve a 'nice number'
29
+ lm_use_tokens: bool = False # Decide if the LM expects tokens or embeddings as input (if using as a backbone for the VLM, set to False)
30
+ lm_tie_weights: bool = True # Decide if you want to tie the LM Head weight to the token embedding weights
31
+ lm_model_type: str = 'HuggingFaceTB/SmolLM2-135M'
32
+ lm_tokenizer: str = 'HuggingFaceTB/cosmo2-tokenizer'
33
+ lm_eos_token_id: int = 0
34
+
35
+ mp_pixel_shuffle_factor: int = 2
36
+
37
+ vlm_load_backbone_weights: bool = True
38
+ vlm_checkpoint_path: str = 'checkpoints/nanoVLM-222M'
39
+
40
+
41
+ @dataclass
42
+ class TrainConfig:
43
+ lr_mp: float = 2e-3
44
+ lr_backbones: float = 1e-4
45
+ data_cutoff_idx: int = None
46
+ val_ratio: float = 0.01
47
+ batch_size: int = 256
48
+ mmstar_batch_size: int = 32
49
+ eval_in_epochs: bool = True
50
+ epochs: int = 5
51
+ compile: bool = True
52
+ resume_from_vlm_checkpoint: bool = False # Indicate whether training should resume from a checkpoint of the whole VLM or start from scratch
53
+ train_dataset_path: str = 'HuggingFaceM4/the_cauldron'
54
+ train_dataset_name: tuple[str, ...] = ("ai2d", "aokvqa", "chart2text", "chartqa", "clevr", "cocoqa", "datikz", "diagram_image_to_text", "docvqa", "dvqa", "figureqa", "finqa", "geomverse", "hateful_memes", "hitab", "iam", "iconqa", "infographic_vqa", "intergps", "localized_narratives", "mapqa", "multihiertt", "ocrvqa", "plotqa", "raven", "rendered_text", "robut_sqa", "robut_wikisql", "robut_wtq", "scienceqa", "screen2words", "st_vqa", "tabmwp", "tallyqa", "tat_qa", "textcaps", "textvqa", "tqa", "vistext", "visual7w", "visualmrc", "vqarad", "vqav2", "vsr", "websight") # "clevr_math", "okvqa", "spot_the_diff", "nlvr2", "mimic_cgd",
55
+ test_dataset_path: str = "Lin-Chen/MMStar"
56
+ wandb_entity: str = "HuggingFace" # Indicate the entity to log to in wandb
57
+ log_wandb: bool = True
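Both configs are plain dataclasses, so overriding a field for an experiment is just keyword arguments or `dataclasses.replace`; a small sketch (the overridden values here are only examples):

```python
from dataclasses import replace
from models.config import VLMConfig, TrainConfig

cfg = VLMConfig()                                   # defaults from above
small_run = replace(TrainConfig(), batch_size=32, compile=False)

print(cfg.lm_max_length, small_run.batch_size)      # 79 32
```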
models/language_model.py ADDED
@@ -0,0 +1,399 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+ # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L69
7
+ class RMSNorm(nn.Module):
8
+ def __init__(self, cfg):
9
+ super().__init__()
10
+ self.weight = nn.Parameter(torch.ones(cfg.lm_hidden_dim))
11
+ self.eps = cfg.lm_rms_eps
12
+
13
+ def forward(self, x):
14
+ irms = torch.rsqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + self.eps) # inverse of RMS
15
+ x = x * irms * self.weight
16
+
17
+ return x
18
+
19
+ # There are multiple derivatives of rotary embeddings by now; this is a basic one with linear scaling to the context length
20
+ # e.g. https://github.com/huggingface/smollm/blob/main/vision/m4/models/vllama3/modeling_vllama3.py#L190
21
+ class RotaryEmbedding(nn.Module):
22
+ def __init__(self, cfg):
23
+ super().__init__()
24
+ assert cfg.lm_hidden_dim % cfg.lm_n_heads == 0, "Hidden dimension must be divisible by number of heads"
25
+
26
+ self.dim = cfg.lm_hidden_dim // cfg.lm_n_heads # dim of each head
27
+ self.base = cfg.lm_re_base
28
+ self.max_seq_len = cfg.lm_max_position_embeddings
29
+ # Standard RoPE implementation - create frequencies for each dimension
30
+ # freq_i = 1 / (base^(2i/dim)) where i is the dimension index
31
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim))
32
+ self.register_buffer("inv_freq", inv_freq)
33
+ self.original_max_seq_len = cfg.lm_max_position_embeddings
34
+ self.attention_scaling = cfg.lm_attn_scaling
35
+
36
+ @torch.no_grad()
37
+ def forward(self, position_ids):
38
+ batch_size, seq_len = position_ids.shape
39
+ # Dynamic scaling for longer sequences
40
+ max_seq = position_ids.max() + 1
41
+ if max_seq > self.original_max_seq_len:
42
+ scale = max_seq / self.original_max_seq_len
43
+ inv_freq = self.inv_freq / scale
44
+ else:
45
+ inv_freq = self.inv_freq
46
+
47
+ # Compute theta = position * frequency
48
+ # Flatten position_ids for batch processing
49
+ flat_position_ids = position_ids.reshape(-1).float()
50
+
51
+ # Element-wise outer product: [seq_len] x [dim/2] => [seq_len, dim/2]
52
+ freqs = flat_position_ids.unsqueeze(-1) * inv_freq.unsqueeze(0)
53
+
54
+ # Reshape to include batch dimension
55
+ freqs = freqs.reshape(batch_size, seq_len, -1)
56
+
57
+ # Now create interleaved pattern
58
+ emb = torch.cat([freqs, freqs], dim=-1)
59
+
60
+ # Compute cos and sin
61
+ cos = torch.cos(emb) * self.attention_scaling
62
+ sin = torch.sin(emb) * self.attention_scaling
63
+
64
+ return cos, sin
65
+
66
+ # Rotates half the hidden dims of the input by swapping and negating dimensions.
67
+ def rotate_half(x):
68
+ x1, x2 = x.chunk(2, dim=-1)
69
+ return torch.cat((-x2, x1), dim=-1)
70
+
71
+ # Apply rotary position embeddings to queries and keys.
72
+ def apply_rotary_pos_embd(q, k, cos, sin, unsqueeze_dim=1):
73
+ # We need to make sure cos and sin can be properly broadcast
74
+ # to the shape of q and k by adding the heads dimension
75
+ cos = cos.unsqueeze(unsqueeze_dim) # [batch_size, 1, seq_len, head_dim]
76
+ sin = sin.unsqueeze(unsqueeze_dim) # [batch_size, 1, seq_len, head_dim]
77
+
78
+ # Apply complex multiplication:
79
+ # (q * cos) + (rotate_half(q) * sin)
80
+ q_embed = (q * cos) + (rotate_half(q) * sin)
81
+ k_embed = (k * cos) + (rotate_half(k) * sin)
82
+
83
+ return q_embed, k_embed
84
+
85
+ # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L214
86
+ # https://github.com/huggingface/smollm/blob/main/vision/m4/models/vllama3/modeling_vllama3.py#L382
87
+ class LanguageModelGroupedQueryAttention(nn.Module):
88
+ def __init__(self, cfg):
89
+ super().__init__()
90
+
91
+ self.n_heads = cfg.lm_n_heads
92
+ self.n_kv_heads = cfg.lm_n_kv_heads
93
+ self.embd_dim = cfg.lm_hidden_dim
94
+ self.dropout = cfg.lm_dropout
95
+
96
+ assert self.n_heads % self.n_kv_heads == 0, "n_heads must be divisible by n_kv_heads"
97
+ assert self.embd_dim % self.n_heads == 0, "embd_dim must be divisible by num_heads"
98
+
99
+ self.n_kv_groups = self.n_heads // self.n_kv_heads
100
+ self.head_dim = self.embd_dim // self.n_heads
101
+
102
+ self.q_proj = nn.Linear(self.embd_dim, self.embd_dim, bias=False)
103
+ self.k_proj = nn.Linear(self.embd_dim, self.head_dim * self.n_kv_heads, bias=False)
104
+ self.v_proj = nn.Linear(self.embd_dim, self.head_dim * self.n_kv_heads, bias=False)
105
+ self.out_proj = nn.Linear(self.embd_dim, self.embd_dim, bias=False)
106
+
107
+ self.attn_dropout = nn.Dropout(self.dropout)
108
+ self.resid_dropout = nn.Dropout(self.dropout)
109
+
110
+ # Use scaled dot product attention if available
111
+ self.sdpa = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
112
+ if not self.sdpa:
113
+ print("Warning: scaled dot product attention not available, using standard attention in LM.")
114
+
115
+ def forward(self, x, cos, sin, attention_mask=None):
116
+ B, T, C = x.size()
117
+
118
+ q = self.q_proj(x).view(B, T, self.n_heads, self.head_dim).transpose(1, 2) # (B, n_heads, T, head_dim)
119
+ k = self.k_proj(x).view(B, T, self.n_kv_heads, self.head_dim).transpose(1, 2) # (B, n_kv_heads, T, head_dim)
120
+ v = self.v_proj(x).view(B, T, self.n_kv_heads, self.head_dim).transpose(1, 2) # (B, n_kv_heads, T, head_dim)
121
+
122
+ # Use precomputed positional embeddings
123
+ q, k = apply_rotary_pos_embd(q, k, cos, sin)
124
+
125
+ k = k.repeat_interleave(self.n_kv_groups, dim=1)
126
+ v = v.repeat_interleave(self.n_kv_groups, dim=1)
127
+
128
+ # Process attention mask if provided
129
+ if attention_mask is not None:
130
+ # Create a 4D attention mask [batch_size, 1, 1, seq_length], In this format, 1 = attend, 0 = mask
131
+ attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) # [B, 1, 1, T]
132
+ padding_mask = (attention_mask == 0).transpose(-1, -2) # Use this for the manual path
133
+ # Convert to attention mask where 0 keeps values and -inf masks
134
+ attention_mask = (1.0 - attention_mask) * torch.finfo(q.dtype).min
135
+
136
+ if self.sdpa:
137
+ y = torch.nn.functional.scaled_dot_product_attention(
138
+ q, k, v,
139
+ attn_mask=attention_mask,
140
+ dropout_p=self.dropout if self.training else 0.0,
141
+ is_causal=True # LM attention is causal (masked)
142
+ )
143
+ else:
144
+ attn = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(self.head_dim)
145
+ causal_mask = torch.tril(torch.ones(T, T, device=x.device)).view(1, 1, T, T)
146
+ attn = attn.masked_fill(causal_mask == 0, float('-inf'))
147
+ if attention_mask is not None:
148
+ attn = attn + attention_mask
149
+
150
+ attn = F.softmax(attn, dim=-1)
151
+ attn = self.attn_dropout(attn)
152
+ y = attn @ v
153
+
154
+ if attention_mask is not None:
155
+ y = y.masked_fill(padding_mask, 0.0) # Zero out the padded positions in the output
156
+
157
+ y = y.transpose(1, 2).contiguous().view(B, T, C)
158
+ y = self.out_proj(y)
159
+ y = self.resid_dropout(y)
160
+
161
+ return y
162
+
163
+ # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L160
164
+ class LanguageModelMLP(nn.Module):
165
+ def __init__(self, cfg):
166
+ super().__init__()
167
+ self.embd_dim = cfg.lm_hidden_dim
168
+ self.inter_dim = cfg.lm_inter_dim
169
+
170
+ self.activation_fn = F.silu
171
+ self.gate_proj = nn.Linear(self.embd_dim, self.inter_dim, bias=False)
172
+ self.up_proj = nn.Linear(self.embd_dim, self.inter_dim, bias=False)
173
+ self.down_proj = nn.Linear(self.inter_dim, self.embd_dim, bias=False)
174
+
175
+ def forward(self, x):
176
+ gate = self.activation_fn(self.gate_proj(x))
177
+ x = self.up_proj(x)
178
+ x = self.down_proj(gate * x)
179
+
180
+ return x
181
+
182
+ # https://github.com/meta-llama/llama3/blob/main/llama/model.py#L222
183
+ class LanguageModelBlock(nn.Module):
184
+ def __init__(self, cfg):
185
+ super().__init__()
186
+ self.mlp = LanguageModelMLP(cfg)
187
+ self.attn = LanguageModelGroupedQueryAttention(cfg)
188
+ self.norm1 = RMSNorm(cfg) # Input Norm
189
+ self.norm2 = RMSNorm(cfg) # Post Attention Norm
190
+
191
+ def forward(self, x, cos, sin, attention_mask=None):
192
+ res = x
193
+ x = self.norm1(x)
194
+ x = self.attn(x, cos, sin, attention_mask)
195
+ x = res + x
196
+
197
+ res = x
198
+ x = self.norm2(x)
199
+ x = self.mlp(x)
200
+ x = res + x
201
+
202
+ return x
203
+
204
+ # https://github.com/meta-llama/llama3/blob/main/llama/model.py#L251
205
+ class LanguageModel(nn.Module):
206
+ def __init__(self, cfg):
207
+ super().__init__()
208
+ self.cfg = cfg
209
+ self.lm_use_tokens = cfg.lm_use_tokens
210
+ self.lm_tie_weights = cfg.lm_tie_weights
211
+
212
+ self.token_embedding = nn.Embedding(cfg.lm_vocab_size, cfg.lm_hidden_dim)
213
+ self.rotary_embd = RotaryEmbedding(cfg)
214
+ self.blocks = nn.ModuleList([
215
+ LanguageModelBlock(cfg) for _ in range(cfg.lm_n_blocks)
216
+ ])
217
+ self.norm = RMSNorm(cfg) # Final Norm
218
+ self.head = nn.Linear(cfg.lm_hidden_dim, cfg.lm_vocab_size, bias=False)
219
+ if self.lm_tie_weights:
220
+ self.head.weight = self.token_embedding.weight
221
+
222
+ self.apply(self._init_weights)
223
+
224
+ def _init_weights(self, module):
225
+ if isinstance(module, nn.Linear):
226
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
227
+ if module.bias is not None:
228
+ torch.nn.init.zeros_(module.bias)
229
+ elif isinstance(module, nn.Embedding):
230
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
231
+ elif isinstance(module, RMSNorm):
232
+ module.weight.data.fill_(1.0)
233
+
234
+ def forward(self, x, attention_mask=None):
235
+ if self.lm_use_tokens:
236
+ x = self.token_embedding(x) # Only embed the inputs when using tokens
237
+
238
+ B , T, _ = x.size()
239
+
240
+ # Note: You could also cache these input embeddings if you want to avoid recomputing them
241
+ position_ids = torch.arange(T, device=x.device).unsqueeze(0).expand(B, -1) # Create position ids [0, 1, 2, ..., seq_len-1]
242
+ cos, sin = self.rotary_embd(position_ids) # Get rotary position embeddings
243
+
244
+ for block in self.blocks:
245
+ x = block(x, cos, sin, attention_mask)
246
+ x = self.norm(x)
247
+
248
+ if self.lm_use_tokens:
249
+ x = self.head(x) # Compute logits if we are using tokens, otherwise stay in the embedding space
250
+
251
+ return x
252
+
253
+ @torch.no_grad()
254
+ def generate(self, inputs, max_new_tokens=20):
255
+ # Add batch dimension if needed
256
+ if inputs.dim() == 1:
257
+ inputs = inputs.unsqueeze(0)
258
+
259
+ generated = inputs.clone()
260
+
261
+ for _ in range(max_new_tokens):
262
+ # Forward pass through the model
263
+ outputs = self.forward(generated)
264
+ last_output = outputs[:, -1, :]
265
+
266
+ if self.lm_use_tokens:
267
+ # Now the model outputs logits
268
+ next_token = torch.argmax(last_output, dim=-1, keepdim=True)
269
+ generated = torch.cat((generated, next_token), dim=-1)
270
+ else:
271
+ # Now the model outputs embeddings
272
+ next_token_embedding = last_output.unsqueeze(1) # Shape: [batch_size, 1, hidden_dim]
273
+ generated = torch.cat((generated, next_token_embedding), dim=1)
274
+
275
+ # Note: You could let the generation stop earlier than max_new_tokens when it detects an EOS token, but this does not work in batched generation (output tensors need to have the same size)
276
+
277
+ return generated
278
+
279
+ # Load the model from a pretrained HuggingFace model (we don't want to have to train the Language Backbone from scratch)
280
+ @classmethod
281
+ def from_pretrained(cls, cfg):
282
+ from transformers import AutoConfig
283
+ from huggingface_hub import hf_hub_download
284
+ import safetensors
285
+ import torch.nn.init as init
286
+
287
+ # Load the HuggingFace config
288
+ hf_config = AutoConfig.from_pretrained(cfg.lm_model_type)
289
+
290
+ # Store original HF vocab size before we modify it
291
+ original_vocab_size = hf_config.vocab_size
292
+ # print(f"Original vocabulary size from pretrained model: {original_vocab_size}")
293
+
294
+ # Configure model parameters from HF config
295
+ cfg.lm_hidden_dim = hf_config.hidden_size
296
+ cfg.lm_inter_dim = hf_config.intermediate_size
297
+ cfg.lm_rms_eps = hf_config.rms_norm_eps
298
+ cfg.lm_re_base = hf_config.rope_theta
299
+ cfg.lm_max_position_embeddings = hf_config.max_position_embeddings
300
+ # We're keeping our own vocab size in cfg, but checking it's larger than original
301
+ if hasattr(cfg, 'lm_vocab_size'):
302
+ if cfg.lm_vocab_size < original_vocab_size:
303
+ raise ValueError(f"Config vocab size ({cfg.lm_vocab_size}) is smaller than pretrained model vocab size ({original_vocab_size})")
304
+ # print(f"Using vocabulary size: {cfg.lm_vocab_size}")
305
+ else:
306
+ # If not specified, use the original
307
+ cfg.lm_vocab_size = original_vocab_size
308
+ # print(f"Using original vocabulary size: {cfg.lm_vocab_size}")
309
+
310
+ cfg.lm_n_heads = hf_config.num_attention_heads
311
+ cfg.lm_n_kv_heads = hf_config.num_key_value_heads
312
+ cfg.lm_dropout = hf_config.attention_dropout
313
+ cfg.lm_n_blocks = hf_config.num_hidden_layers
314
+
315
+ # Create our model with potentially larger vocabulary
316
+ model = cls(cfg)
317
+ safetensors_file = hf_hub_download(repo_id=cfg.lm_model_type, filename="model.safetensors")
318
+
319
+ sd = model.state_dict()
320
+
321
+ mapping = {
322
+ 'model.embed_tokens.weight': 'token_embedding.weight',
323
+ 'model.norm.weight': 'norm.weight'
324
+ }
325
+
326
+ for i in range(cfg.lm_n_blocks):
327
+ layer_prefix = f'model.layers.{i}.'
328
+ block_prefix = f'blocks.{i}.'
329
+
330
+ mapping.update({
331
+ f"{layer_prefix}self_attn.q_proj.weight": f"{block_prefix}attn.q_proj.weight",
332
+ f"{layer_prefix}self_attn.k_proj.weight": f"{block_prefix}attn.k_proj.weight",
333
+ f"{layer_prefix}self_attn.v_proj.weight": f"{block_prefix}attn.v_proj.weight",
334
+ f"{layer_prefix}self_attn.o_proj.weight": f"{block_prefix}attn.out_proj.weight",
335
+ f"{layer_prefix}mlp.gate_proj.weight": f"{block_prefix}mlp.gate_proj.weight",
336
+ f"{layer_prefix}mlp.up_proj.weight": f"{block_prefix}mlp.up_proj.weight",
337
+ f"{layer_prefix}mlp.down_proj.weight": f"{block_prefix}mlp.down_proj.weight",
338
+ f"{layer_prefix}input_layernorm.weight": f"{block_prefix}norm1.weight",
339
+ f"{layer_prefix}post_attention_layernorm.weight": f"{block_prefix}norm2.weight"
340
+ })
341
+
342
+ # Special handling for token embeddings with extended vocabulary
343
+ has_extended_embeddings = False
344
+ with safetensors.safe_open(filename=safetensors_file, framework="pt", device="cpu") as f:
345
+ for hf_key, our_key in mapping.items():
346
+ if hf_key in f.keys() and our_key in sd:
347
+ tensor = f.get_tensor(hf_key)
348
+
349
+ # Special handling for token embeddings if vocab sizes differ
350
+ if hf_key == 'model.embed_tokens.weight' and tensor.shape[0] != sd[our_key].shape[0]:
351
+ has_extended_embeddings = True
352
+ print(f"Extending token embeddings from {tensor.shape} to {sd[our_key].shape}")
353
+
354
+ # Copy existing embeddings to the beginning of our larger embedding matrix
355
+ sd[our_key][:tensor.shape[0]].copy_(tensor)
356
+
357
+ # Initialize the new embeddings using the same approach as the original model
358
+ std = 0.02 # Common value, but you might want to adjust based on model
359
+ init.normal_(sd[our_key][tensor.shape[0]:], mean=0.0, std=std)
360
+
361
+ print(f"Initialized {sd[our_key].shape[0] - tensor.shape[0]} new token embeddings")
362
+ sd['head.weight'].copy_(sd[our_key]) # Update the head weights as well
363
+ elif tensor.shape == sd[our_key].shape:
364
+ sd[our_key].copy_(tensor)
365
+ else:
366
+ print(f"Shape mismatch for {hf_key} -> {our_key}: {tensor.shape} vs {sd[our_key].shape}")
367
+ else:
368
+ if hf_key not in f.keys():
369
+ print(f"Warning: Key {hf_key} not found in safetensors file")
370
+ if our_key not in sd:
371
+ print(f"Warning: Key {our_key} not found in model state dict")
372
+
373
+ # Load the state dict
374
+ model.load_state_dict(sd)
375
+
376
+ # Handle output projection / language modeling head
377
+ if has_extended_embeddings and hasattr(model, 'head') and 'head.weight' in sd:
378
+ # If we have a separate output projection layer and extended the vocab
379
+ # we should handle it similarly to the input embeddings
380
+ with safetensors.safe_open(filename=safetensors_file, framework="pt", device="cpu") as f:
381
+ if 'lm_head.weight' in f.keys():
382
+ lm_head = f.get_tensor('lm_head.weight')
383
+ if lm_head.shape[0] != sd['head.weight'].shape[0]:
384
+ print(f"Extending LM head from {lm_head.shape} to {sd['head.weight'].shape}")
385
+ # Copy existing weights
386
+ sd['head.weight'][:lm_head.shape[0]].copy_(lm_head)
387
+ # Initialize new weights
388
+ std = 0.02
389
+ init.normal_(sd['head.weight'][lm_head.shape[0]:], mean=0.0, std=std)
390
+ # Load updated weights
391
+ model.load_state_dict(sd)
392
+
393
+ # Handle weight tying (if needed)
394
+ if cfg.lm_tie_weights and hasattr(model, 'head') and hasattr(model, 'token_embedding'):
395
+ model.head.weight = model.token_embedding.weight
396
+ # print("Tied token embedding and LM head weights")
397
+
398
+ print(f"Successfully loaded {cfg.lm_model_type} weights from safetensors. Model has {sum(p.numel() for p in model.parameters()):,} parameters.")
399
+ return model
models/modality_projector.py ADDED
@@ -0,0 +1,46 @@
1
+ # Modality Projection from Vision to Language
2
+ import torch.nn as nn
3
+
4
+ class ModalityProjector(nn.Module):
5
+ def __init__(self, cfg):
6
+ super().__init__()
7
+ self.cfg = cfg
8
+ self.input_dim = cfg.vit_hidden_dim * (cfg.mp_pixel_shuffle_factor**2)
9
+ self.output_dim = cfg.lm_hidden_dim
10
+ self.scale_factor = cfg.mp_pixel_shuffle_factor
11
+
12
+ self.proj = nn.Linear(self.input_dim, self.output_dim, bias=False)
13
+
14
+ self.apply(self._init_weights)
15
+
16
+ def _init_weights(self, module):
17
+ if isinstance(module, nn.Linear):
18
+ nn.init.normal_(self.proj.weight, mean=0.0, std=0.02)
19
+ if module.bias is not None:
20
+ nn.init.zeros_(module.bias)
21
+
22
+ # https://github.com/huggingface/smollm/blob/main/vision/m4/models/vllama3/modeling_vllama3.py#L1281
23
+ def pixel_shuffle(self, x):
24
+ bsz, seq, embed_dim = x.size()
25
+ seq_root = int(seq**0.5)
26
+ assert seq_root**2 == seq # Sequence length must be a perfect square for pixel shuffle
27
+ assert seq_root % self.scale_factor == 0 # Sequence root must be divisible by scale factor
28
+
29
+ height = width = seq_root
30
+ x = x.view(bsz, height, width, embed_dim)
31
+ h_out = height // self.scale_factor
32
+ w_out = width // self.scale_factor
33
+
34
+ x = x.reshape(bsz, h_out, self.scale_factor, w_out, self.scale_factor, embed_dim)
35
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous()
36
+ x = x.reshape(bsz, h_out * w_out, embed_dim * self.scale_factor**2)
37
+
38
+ return x
39
+
40
+ def forward(self, x):
41
+ x = self.pixel_shuffle(x)
42
+ x = self.proj(x)
43
+
44
+ return x
45
+
46
+
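A minimal forward-pass sketch for the projector above with the default config: 196 SigLIP patch embeddings go in, 49 language-space tokens come out.

```python
import torch
from models.config import VLMConfig
from models.modality_projector import ModalityProjector

mp = ModalityProjector(VLMConfig())
patches = torch.randn(1, 196, 768)   # (batch, num_patches, vit_hidden_dim)
print(mp(patches).shape)             # torch.Size([1, 49, 576])
```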
models/utils.py ADDED
@@ -0,0 +1,22 @@
1
+ import re
2
+
3
+ # Used to check our model's performance on multiple-choice tasks. This could also be done in a more involved way, e.g. with LLM-as-a-judge
4
+ def check_multiple_choice_with_regex(model_outputs, correct_answers):
5
+ results = []
6
+ for model_output, correct_answer in zip(model_outputs, correct_answers):
7
+ correct_answer = correct_answer.upper()
8
+
9
+ # Look for the answer letter at the beginning of a line or as the last word
10
+ patterns = [
11
+ rf"\b{correct_answer}\b", # Word boundary around the answer letter
12
+ rf"\b{correct_answer}[.,)]", # Answer followed by punctuation
13
+ rf"\(.*{correct_answer}.*\)", # Answer within parentheses
14
+ ]
15
+
16
+ match_found = False
17
+ for pattern in patterns:
18
+ if re.search(pattern, model_output):
19
+ match_found = True
20
+ break # Exit inner loop once a match is found
21
+ results.append(match_found)
22
+ return results
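For reference, a tiny example of how the checker above behaves (assuming the repository root is on the Python path):

```python
from models.utils import check_multiple_choice_with_regex

outputs = ["The correct answer is B.", "I think it is (C)", "Definitely A", "blah"]
answers = ["b", "c", "d", "a"]

print(check_multiple_choice_with_regex(outputs, answers))  # [True, True, False, False]
```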
models/vision_language_model.py ADDED
@@ -0,0 +1,237 @@
1
+ import json
2
+ import os
3
+ import tempfile
4
+ from dataclasses import asdict
5
+ from typing import Optional
6
+
7
+
8
+ from models.vision_transformer import ViT
9
+ from models.language_model import LanguageModel
10
+ from models.modality_projector import ModalityProjector
11
+ from models.config import VLMConfig
12
+
13
+ import torch
14
+ import torch.nn as nn
15
+ import torch.nn.functional as F
16
+ from safetensors.torch import load_model, save_model
17
+
18
+ class VisionLanguageModel(nn.Module):
19
+ def __init__(self, cfg: VLMConfig, load_backbone=True):
20
+ super().__init__()
21
+ self.cfg = cfg
22
+ if load_backbone:
23
+ print("Loading from backbone weights")
24
+ self.vision_encoder = ViT.from_pretrained(cfg)
25
+ self.decoder = LanguageModel.from_pretrained(cfg)
26
+ else:
27
+ self.vision_encoder = ViT(cfg)
28
+ self.decoder = LanguageModel(cfg)
29
+ self.MP = ModalityProjector(cfg)
30
+ self.load_backbone = load_backbone
31
+
32
+ def forward(self, input_ids, image, attention_mask=None, targets=None):
33
+ image_embd = self.vision_encoder(image)
34
+ image_embd = self.MP(image_embd)
35
+
36
+ token_embd = self.decoder.token_embedding(input_ids)
37
+
38
+ combined_embd = torch.cat((image_embd, token_embd), dim=1) # Concatenate image embeddings to token embeddings
39
+
40
+ # Adjust attention mask to account for image tokens
41
+ if attention_mask is not None:
42
+ # Create mask of 1s for image tokens (all image tokens should be attended to)
43
+ batch_size = image_embd.size(0)
44
+ img_seq_len = image_embd.size(1)
45
+ image_attention_mask = torch.ones((batch_size, img_seq_len), device=attention_mask.device, dtype=attention_mask.dtype)
46
+
47
+ # Combine image and token attention masks
48
+ attention_mask = torch.cat((image_attention_mask, attention_mask), dim=1)
49
+
50
+ logits = self.decoder(combined_embd, attention_mask) # Not logits yet, but easier to return like this
51
+
52
+ loss = None
53
+ if targets is not None:
54
+ # Only use the token part of the logits for loss computation
55
+ logits = self.decoder.head(logits)
56
+ logits = logits[:, image_embd.size(1):, :]
57
+ loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1), ignore_index=-100)
58
+
59
+ return logits, loss
60
+
61
+ @torch.no_grad()
62
+ def generate(self, input_ids, image, attention_mask=None, max_new_tokens=5):
63
+ # Process image through vision encoder and projection
64
+ image_embd = self.vision_encoder(image)
65
+ image_embd = self.MP(image_embd)
66
+
67
+ # Embed initial tokens
68
+ token_embd = self.decoder.token_embedding(input_ids)
69
+
70
+ # Concatenate image embeddings with token embeddings
71
+ combined_embd = torch.cat((image_embd, token_embd), dim=1)
72
+
73
+ batch_size = image_embd.size(0)
74
+ img_seq_len = image_embd.size(1)
75
+ # Adjust attention mask to account for image tokens
76
+ if attention_mask is not None:
77
+ # Create mask of 1s for image tokens (all image tokens should be attended to)
78
+ image_attention_mask = torch.ones((batch_size, img_seq_len), device=attention_mask.device, dtype=attention_mask.dtype)
79
+ attention_mask = torch.cat((image_attention_mask, attention_mask), dim=1)
80
+
81
+ # Generate from combined embeddings using the decoder
82
+ # We need to use the decoder's forward function and not its generate method
83
+ # because we want to keep track of the image prefix
84
+ outputs = combined_embd
85
+ generated_tokens = torch.zeros((batch_size, max_new_tokens), device=input_ids.device, dtype=input_ids.dtype)
86
+
87
+ # Note: Here you could implement improvements such as KV caching
88
+ for i in range(max_new_tokens):
89
+ model_out = self.decoder(outputs, attention_mask)
90
+
91
+ # Get predictions for the last token only (normally this is the embedding, not the logits)
92
+ last_token_logits = model_out[:, -1, :]
93
+
94
+ # Apply head to get logits (if model is in embedding mode)
95
+ if not self.decoder.lm_use_tokens:
96
+ last_token_logits = self.decoder.head(last_token_logits)
97
+
98
+ probs = torch.softmax(last_token_logits, dim=-1)
99
+ next_token = torch.multinomial(probs, num_samples=1)
100
+
101
+ generated_tokens[:, i] = next_token.squeeze(-1)
102
+
103
+ # Convert to embedding and append
104
+ next_embd = self.decoder.token_embedding(next_token)
105
+ outputs = torch.cat((outputs, next_embd), dim=1)
106
+
107
+ if attention_mask is not None:
108
+ attention_mask = torch.cat((attention_mask, torch.ones((batch_size, 1), device=attention_mask.device)), dim=1)
109
+
110
+ return generated_tokens
111
+
112
+ @classmethod
113
+ def from_pretrained(
114
+ cls, repo_id_or_path: str, *, revision: Optional[str] = None
115
+ ) -> "VisionLanguageModel":
116
+ """
117
+ Load a VisionLanguageModel from a local directory or a repo on the Hugging Face Hub.
118
+
119
+ Args:
120
+ repo_id_or_path (str): The path to the local directory or the Hugging Face Hub repo ID.
121
+
122
+ Returns:
123
+ VisionLanguageModel: The loaded model.
124
+ """
125
+ # If local folder exists => load from there
126
+ if os.path.exists(repo_id_or_path):
127
+ config_path = os.path.join(repo_id_or_path, "config.json")
128
+ weights_path = os.path.join(repo_id_or_path, "model.safetensors")
129
+
130
+ if not os.path.exists(config_path):
131
+ raise ValueError(
132
+ f"Config file not found at {config_path}. Please provide a valid path."
133
+ )
134
+ if not os.path.exists(weights_path):
135
+ raise ValueError(
136
+ f"Weights file not found at {weights_path}. Please provide a valid path."
137
+ )
138
+ # Otherwise, assume it's a Hugging Face Hub repo
139
+ else:
140
+ from huggingface_hub import hf_hub_download
141
+
142
+ config_path = hf_hub_download(
143
+ repo_id=repo_id_or_path, filename="config.json", revision=revision
144
+ )
145
+ weights_path = hf_hub_download(
146
+ repo_id=repo_id_or_path, filename="model.safetensors", revision=revision
147
+ )
148
+
149
+ # Load config
150
+ with open(config_path, "r") as f:
151
+ cfg = VLMConfig(**json.load(f))
152
+
153
+ # Initialize model without loading the backbone
154
+ model = cls(cfg, load_backbone=False)
155
+
156
+ # Load safetensors weights
157
+ load_model(model, weights_path)
158
+
159
+ # Done!
160
+ return model
161
+
162
+ def save_pretrained(self, save_directory: str) -> None:
163
+ """
164
+ Save the model and configuration to a directory.
165
+
166
+ Args:
167
+ save_directory (str): The directory to save the model and config.
168
+ """
169
+ # Create directory if it doesn't exist
170
+ os.makedirs(save_directory, exist_ok=True)
171
+
172
+ # Save config
173
+ with open(os.path.join(save_directory, "config.json"), "w") as f:
174
+ f.write(json.dumps(asdict(self.cfg), indent=4))
175
+
176
+ # Save weights as safetensors
177
+ save_model(self, os.path.join(save_directory, "model.safetensors"))
178
+
179
+ def push_to_hub(self, repo_id: str, private: bool = False) -> None:
180
+ """
181
+ Push the model and configuration to the Hugging Face Hub.
182
+
183
+ Args:
184
+ repo_id (str): The repo ID on the Hugging Face Hub.
185
+ """
186
+ from huggingface_hub import create_repo, upload_folder
187
+
188
+ # Create repo
189
+ repo_url = create_repo(repo_id=repo_id, private=private, exist_ok=True)
190
+ repo_id = repo_url.repo_id
191
+ print("Created repo: ", repo_url)
192
+
193
+ with tempfile.TemporaryDirectory() as save_path:
194
+ # Save to tmp directory
195
+ self.save_pretrained(save_path)
196
+
197
+ # Save model card
198
+ with open(os.path.join(save_path, "README.md"), "w") as f:
199
+ f.write(MODEL_CARD_TEMPLATE.format(repo_id=repo_id))
200
+
201
+ # Upload
202
+ return upload_folder(
203
+ repo_id=repo_id,
204
+ repo_type="model",
205
+ folder_path=save_path,
206
+ commit_message="Upload nanoVLM using push_to_hub",
207
+ )
208
+
209
+
210
+ MODEL_CARD_TEMPLATE = """
211
+ ---
212
+ # For reference on model card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
213
+ # Doc / guide: https://huggingface.co/docs/hub/model-cards
214
+ library_name: nanovlm
215
+ license: mit
216
+ pipeline_tag: image-text-to-text
217
+ tags:
218
+ - vision-language
219
+ - multimodal
220
+ - research
221
+ ---
222
+
223
+ **nanoVLM** is a minimal and lightweight Vision-Language Model (VLM) designed for efficient training and experimentation. Built using pure PyTorch, the entire model architecture and training logic fits within ~750 lines of code. It combines a ViT-based image encoder (SigLIP-B/16-224-85M) with a lightweight causal language model (SmolLM2-135M), resulting in a compact 222M parameter model.
224
+
225
+ For more information, check out the base model on https://huggingface.co/lusxvr/nanoVLM-222M.
226
+
227
+ **Usage:**
228
+
229
+ Clone the nanoVLM repository: https://github.com/huggingface/nanoVLM.
230
+ Follow the install instructions and run the following code:
231
+
232
+ ```python
233
+ from models.vision_language_model import VisionLanguageModel
234
+
235
+ model = VisionLanguageModel.from_pretrained("{repo_id}")
236
+ ```
237
+ """
models/vision_transformer.py ADDED
@@ -0,0 +1,251 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+ # https://github.com/huggingface/transformers/blob/main/src/transformers/models/siglip/modeling_siglip.py#L245
7
+ class ViTPatchEmbeddings(nn.Module):
8
+ def __init__(self, cfg):
9
+ super().__init__()
10
+
11
+ self.img_size = cfg.vit_img_size
12
+ self.patch_size = cfg.vit_patch_size
13
+ self.num_patches = (self.img_size // self.patch_size) ** 2
14
+ self.cls_flag = cfg.vit_cls_flag
15
+ self.embd_dim = cfg.vit_hidden_dim
16
+
17
+ # Conv layer to extract the patches
18
+ self.conv = nn.Conv2d(
19
+ in_channels=3,
20
+ out_channels=self.embd_dim,
21
+ kernel_size=self.patch_size,
22
+ stride=self.patch_size,
23
+ padding="valid",
24
+ )
25
+
26
+ if self.cls_flag:
27
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embd_dim))
28
+ self.position_embedding = nn.Parameter(torch.rand(1, self.num_patches + 1, self.embd_dim))
29
+ else:
30
+ self.position_embedding = nn.Parameter(torch.rand(1, self.num_patches, self.embd_dim))
31
+
32
+
33
+ def forward(self, x):
34
+ x = self.conv(x) # extract patches
35
+ x = x.flatten(2) # flatten the patches into a single dimension
36
+ x = x.transpose(1, 2) # transpose to (batch_size, num_patches, hidden_dim)
37
+
38
+ # Add CLS token (according to original ViT Paper) and position embeddings
39
+ if self.cls_flag:
40
+ cls_token = self.cls_token.expand(x.shape[0], -1, -1)
41
+ x = torch.cat((cls_token, x), dim=1)
42
+ x = x + self.position_embedding
43
+ return x
44
+
45
+ # https://github.com/huggingface/transformers/blob/main/src/transformers/models/siglip/modeling_siglip.py#L381
46
+ # https://github.com/karpathy/nanoGPT/blob/master/model.py#L29
47
+ class ViTMultiHeadAttention(nn.Module):
48
+ def __init__(self, cfg):
49
+ super().__init__()
50
+
51
+ self.n_heads = cfg.vit_n_heads
52
+ self.embd_dim = cfg.vit_hidden_dim
53
+ assert self.embd_dim % self.n_heads == 0, "embd_dim must be divisible by num_heads"
54
+ self.head_dim = self.embd_dim // self.n_heads
55
+ self.dropout = cfg.vit_dropout
56
+
57
+ # Combined projections for all heads
58
+ self.qkv_proj = nn.Linear(self.embd_dim, 3 * self.embd_dim, bias=True)
59
+ self.out_proj = nn.Linear(self.embd_dim, self.embd_dim, bias=True)
60
+
61
+ # Dropout layers
62
+ self.attn_dropout = nn.Dropout(self.dropout)
63
+ self.resid_dropout = nn.Dropout(self.dropout)
64
+
65
+ # Use scaled dot product attention if available
66
+ self.sdpa = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
67
+ if not self.sdpa:
68
+ print("Warning: scaled dot product attention not available. Using standard attention in ViT.")
69
+
70
+ def forward(self, x):
71
+ B, T, C = x.size()
72
+
73
+ qkv = self.qkv_proj(x)
74
+ q, k, v = qkv.split(C, dim=2)
75
+ # Reshape [B, T, C] -> [B, T, n_heads, head_dim] and transpose -> [B, n_heads, T, head_dim]
76
+ q = q.view(B, T, self.n_heads, self.head_dim).transpose(1, 2) # (B, n_heads, T, head_dim)
77
+ k = k.view(B, T, self.n_heads, self.head_dim).transpose(1, 2) # (B, n_heads, T, head_dim)
78
+ v = v.view(B, T, self.n_heads, self.head_dim).transpose(1, 2) # (B, n_heads, T, head_dim)
79
+
80
+ if self.sdpa:
81
+ y = torch.nn.functional.scaled_dot_product_attention(
82
+ q, k, v,
83
+ attn_mask=None,
84
+ dropout_p=self.dropout if self.training else 0.0,
85
+ is_causal=False # ViT attention is bidirectional
86
+ )
87
+ else:
88
+ attn = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
89
+ attn = F.softmax(attn, dim=-1)
90
+ attn = self.attn_dropout(attn)
91
+ y = attn @ v # (B, n_heads, T, T) x (B, n_heads, T, head_dim) -> (B, n_heads, T, head_dim)
92
+
93
+ # Transpose back from [B, n_heads, T, head_dim] to [B, T, n_heads * head_dim] and combine all heads to [B, T, C]
94
+ y = y.transpose(1, 2).contiguous().view(B, T, C)
95
+ y = self.out_proj(y)
96
+ y = self.resid_dropout(y)
97
+
98
+ return y
99
+
100
+ # https://github.com/huggingface/transformers/blob/main/src/transformers/models/siglip/modeling_siglip.py#L453
101
+ class ViTMLP(nn.Module):
102
+ def __init__(self, cfg):
103
+ super().__init__()
104
+ self.activation_fn = nn.GELU(approximate='tanh')
105
+ self.fc1 = nn.Linear(cfg.vit_hidden_dim, cfg.vit_inter_dim)
106
+ self.fc2 = nn.Linear(cfg.vit_inter_dim, cfg.vit_hidden_dim)
107
+ self.dropout = nn.Dropout(cfg.vit_dropout)
108
+
109
+ def forward(self, x):
110
+ x = self.fc1(x)
111
+ x = self.activation_fn(x)
112
+ x = self.fc2(x)
113
+ x = self.dropout(x)
114
+ return x
115
+
116
+ # https://github.com/karpathy/nanoGPT/blob/master/model.py#L94
117
+ class ViTBlock(nn.Module):
118
+ def __init__(self, cfg):
119
+ super().__init__()
120
+ self.ln1 = nn.LayerNorm(cfg.vit_hidden_dim, eps=cfg.vit_ln_eps)
121
+ self.attn = ViTMultiHeadAttention(cfg)
122
+ self.ln2 = nn.LayerNorm(cfg.vit_hidden_dim, eps=cfg.vit_ln_eps)
123
+ self.mlp = ViTMLP(cfg)
124
+
125
+ def forward(self, x):
126
+ x = x + self.attn(self.ln1(x))
127
+ x = x + self.mlp(self.ln2(x))
128
+ return x
129
+
130
+
131
+ class ViT(nn.Module):
132
+ def __init__(self, cfg):
133
+ super().__init__()
134
+ self.cfg = cfg
135
+ self.patch_embedding = ViTPatchEmbeddings(cfg)
136
+ self.cls_flag = cfg.vit_cls_flag
137
+ self.dropout = nn.Dropout(cfg.vit_dropout)
138
+ self.blocks = nn.ModuleList([ViTBlock(cfg) for _ in range(cfg.vit_n_blocks)])
139
+ self.layer_norm = nn.LayerNorm(cfg.vit_hidden_dim, eps=cfg.vit_ln_eps)
140
+
141
+ self.apply(self._init_weights)
142
+
143
+ def _init_weights(self, module):
144
+ if isinstance(module, nn.Linear):
145
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
146
+ if module.bias is not None:
147
+ torch.nn.init.zeros_(module.bias)
148
+ elif isinstance(module, nn.LayerNorm):
149
+ module.bias.data.zero_()
150
+ module.weight.data.fill_(1.0)
151
+ elif isinstance(module, nn.Conv2d):
152
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
153
+ if module.bias is not None:
154
+ torch.nn.init.zeros_(module.bias)
155
+
156
+ def forward(self, x):
157
+ x = self.patch_embedding(x)
158
+ x = self.dropout(x)
159
+ for block in self.blocks:
160
+ x = block(x)
161
+
162
+ if self.cls_flag:
163
+ x = self.layer_norm(x[:, 0])
164
+ else:
165
+ x = self.layer_norm(x)
166
+ #x = x.mean(dim=1)
167
+
168
+ return x
169
+
170
+ # Load the model from a pretrained HuggingFace model (we don't want to have to train the Vision Backbone from scratch)
171
+ @classmethod
172
+ def from_pretrained(cls, cfg):
173
+ from transformers import SiglipVisionConfig
174
+ from huggingface_hub import hf_hub_download
175
+ import safetensors
176
+
177
+ hf_config = SiglipVisionConfig.from_pretrained(cfg.vit_model_type)
178
+ cfg.vit_dropout=hf_config.attention_dropout
179
+ cfg.vit_hidden_dim=hf_config.hidden_size
180
+ cfg.vit_img_size=hf_config.image_size
181
+ cfg.vit_inter_dim=hf_config.intermediate_size
182
+ cfg.vit_ln_eps=hf_config.layer_norm_eps
183
+ cfg.vit_n_heads=hf_config.num_attention_heads
184
+ cfg.vit_n_blocks=hf_config.num_hidden_layers
185
+ cfg.vit_patch_size=hf_config.patch_size
186
+ model = cls(cfg)
187
+ safetensors_file = hf_hub_download(repo_id=cfg.vit_model_type, filename="model.safetensors")
188
+
189
+ sd = model.state_dict()
190
+
191
+ mapping = {
192
+ 'vision_model.embeddings.patch_embedding.weight': 'patch_embedding.conv.weight',
193
+ 'vision_model.embeddings.patch_embedding.bias': 'patch_embedding.conv.bias',
194
+ 'vision_model.embeddings.position_embedding.weight': 'patch_embedding.position_embedding',
195
+ 'vision_model.post_layernorm.weight': 'layer_norm.weight',
196
+ 'vision_model.post_layernorm.bias': 'layer_norm.bias',
197
+ }
198
+
199
+ for i in range(cfg.vit_n_blocks):
200
+ # Layer norms
201
+ mapping[f'vision_model.encoder.layers.{i}.layer_norm1.weight'] = f'blocks.{i}.ln1.weight'
202
+ mapping[f'vision_model.encoder.layers.{i}.layer_norm1.bias'] = f'blocks.{i}.ln1.bias'
203
+ mapping[f'vision_model.encoder.layers.{i}.layer_norm2.weight'] = f'blocks.{i}.ln2.weight'
204
+ mapping[f'vision_model.encoder.layers.{i}.layer_norm2.bias'] = f'blocks.{i}.ln2.bias'
205
+
206
+ # MLP
207
+ mapping[f'vision_model.encoder.layers.{i}.mlp.fc1.weight'] = f'blocks.{i}.mlp.fc1.weight'
208
+ mapping[f'vision_model.encoder.layers.{i}.mlp.fc1.bias'] = f'blocks.{i}.mlp.fc1.bias'
209
+ mapping[f'vision_model.encoder.layers.{i}.mlp.fc2.weight'] = f'blocks.{i}.mlp.fc2.weight'
210
+ mapping[f'vision_model.encoder.layers.{i}.mlp.fc2.bias'] = f'blocks.{i}.mlp.fc2.bias'
211
+
212
+ # Output projection
213
+ mapping[f'vision_model.encoder.layers.{i}.self_attn.out_proj.weight'] = f'blocks.{i}.attn.out_proj.weight'
214
+ mapping[f'vision_model.encoder.layers.{i}.self_attn.out_proj.bias'] = f'blocks.{i}.attn.out_proj.bias'
215
+
216
+ with safetensors.safe_open(filename=safetensors_file, framework="pt", device="cpu") as f:
217
+ for hf_key, our_key in mapping.items():
218
+ if hf_key in f.keys() and our_key in sd:
219
+ tensor = f.get_tensor(hf_key)
220
+ if tensor.shape == sd[our_key].shape:
221
+ sd[our_key].copy_(tensor)
222
+ else:
223
+ if 'position_embedding' in hf_key:
224
+ sd[our_key].copy_(tensor.unsqueeze(0))
225
+ else:
226
+ print(f"Shape mismatch for {hf_key} -> {our_key}: {tensor.shape} vs {sd[our_key].shape}")
227
+ else:
228
+ if hf_key not in f.keys():
229
+ print(f"Warning: Key {hf_key} not found in safetensors file")
230
+ if our_key not in sd:
231
+ print(f"Warning: Key {our_key} not found in model state dict")
232
+
233
+ # Manually handle QKV concatenation since our implementation combines Q, K, V into one
234
+ for i in range(model.cfg.vit_n_blocks):
235
+ q_weight = f.get_tensor(f'vision_model.encoder.layers.{i}.self_attn.q_proj.weight')
236
+ k_weight = f.get_tensor(f'vision_model.encoder.layers.{i}.self_attn.k_proj.weight')
237
+ v_weight = f.get_tensor(f'vision_model.encoder.layers.{i}.self_attn.v_proj.weight')
238
+
239
+ qkv_weight = torch.cat((q_weight, k_weight, v_weight), dim=0)
240
+ sd[f'blocks.{i}.attn.qkv_proj.weight'].copy_(qkv_weight)
241
+
242
+ q_bias = f.get_tensor(f'vision_model.encoder.layers.{i}.self_attn.q_proj.bias')
243
+ k_bias = f.get_tensor(f'vision_model.encoder.layers.{i}.self_attn.k_proj.bias')
244
+ v_bias = f.get_tensor(f'vision_model.encoder.layers.{i}.self_attn.v_proj.bias')
245
+
246
+ qkv_bias = torch.cat((q_bias, k_bias, v_bias), dim=0)
247
+ sd[f'blocks.{i}.attn.qkv_proj.bias'].copy_(qkv_bias)
248
+
249
+ model.load_state_dict(sd)
250
+ print(f"Successfully loaded {cfg.vit_model_type} weights from safetensors. Model has {sum(p.numel() for p in model.parameters()):,} parameters.")
251
+ return model
requirements.txt ADDED
@@ -0,0 +1,80 @@
1
+ aiofiles==24.1.0
2
+ annotated-types==0.7.0
3
+ anyio==4.9.0
4
+ certifi==2025.4.26
5
+ charset-normalizer==3.4.2
6
+ click==8.1.8
7
+ fastapi==0.115.12
8
+ ffmpy==0.5.0
9
+ filelock==3.18.0
10
+ fsspec==2025.3.2
11
+ gradio==5.29.1
12
+ gradio-client==1.10.1
13
+ groovy==0.1.2
14
+ h11==0.16.0
15
+ httpcore==1.0.9
16
+ httpx==0.28.1
17
+ huggingface-hub==0.31.2
18
+ idna==3.10
19
+ jinja2==3.1.6
20
+ markdown-it-py==3.0.0
21
+ markupsafe==3.0.2
22
+ mdurl==0.1.2
23
+ mpmath==1.3.0
24
+ networkx==3.4.2
25
+ numpy==2.2.5
26
+ nvidia-cublas-cu12==12.6.4.1
27
+ nvidia-cuda-cupti-cu12==12.6.80
28
+ nvidia-cuda-nvrtc-cu12==12.6.77
29
+ nvidia-cuda-runtime-cu12==12.6.77
30
+ nvidia-cudnn-cu12==9.5.1.17
31
+ nvidia-cufft-cu12==11.3.0.4
32
+ nvidia-cufile-cu12==1.11.1.6
33
+ nvidia-curand-cu12==10.3.7.77
34
+ nvidia-cusolver-cu12==11.7.1.2
35
+ nvidia-cusparse-cu12==12.5.4.2
36
+ nvidia-cusparselt-cu12==0.6.3
37
+ nvidia-nccl-cu12==2.26.2
38
+ nvidia-nvjitlink-cu12==12.6.85
39
+ nvidia-nvtx-cu12==12.6.77
40
+ orjson==3.10.18
41
+ packaging==25.0
42
+ pandas==2.2.3
43
+ pillow==11.2.1
44
+ psutil==5.9.8
45
+ pydantic==2.11.4
46
+ pydantic-core==2.33.2
47
+ pydub==0.25.1
48
+ pygments==2.19.1
49
+ python-dateutil==2.9.0.post0
50
+ python-multipart==0.0.20
51
+ pytz==2025.2
52
+ pyyaml==6.0.2
53
+ regex==2024.11.6
54
+ requests==2.32.3
55
+ rich==14.0.0
56
+ ruff==0.11.10
57
+ safehttpx==0.1.6
58
+ safetensors==0.5.3
59
+ semantic-version==2.10.0
60
+ setuptools==80.7.1
61
+ shellingham==1.5.4
62
+ six==1.17.0
63
+ sniffio==1.3.1
64
+ spaces==0.36.0
65
+ starlette==0.46.2
66
+ sympy==1.14.0
67
+ tokenizers==0.21.1
68
+ tomlkit==0.13.2
69
+ torch==2.7.0
70
+ torchvision==0.22.0
71
+ tqdm==4.67.1
72
+ transformers==4.51.3
73
+ triton==3.3.0
74
+ typer==0.15.4
75
+ typing-extensions==4.13.2
76
+ typing-inspection==0.4.0
77
+ tzdata==2025.2
78
+ urllib3==2.4.0
79
+ uvicorn==0.34.2
80
+ websockets==15.0.1