Fill-Mask
Transformers
PyTorch
Safetensors
English
nomic_bert
custom_code

Update modeling_hf_nomic_bert.py

#6 by zpn - opened
Files changed (1)
  1. modeling_hf_nomic_bert.py +127 -90
modeling_hf_nomic_bert.py CHANGED
@@ -3,24 +3,26 @@
3
  # https://github.com/mlcommons/training_results_v2.0/blob/main/HazyResearch/benchmarks/bert/implementations/pytorch/modeling.py
4
  # https://github.com/mlcommons/training_results_v2.1/blob/main/Azure-HazyResearch/benchmarks/bert/implementations/ND96amsr_A100_v4/modeling.py
5
 
 
6
  import logging
7
 
8
  # Inspired by https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py
9
  import math
10
- import numpy as np
11
- import collections
12
  import os
13
  import re
14
  from collections import OrderedDict
15
  from functools import partial
16
  from typing import List, Optional, Tuple, Union
17
 
 
18
  import torch
19
  import torch.nn as nn
20
  import torch.nn.functional as F
21
  from einops import rearrange, repeat
22
  from safetensors.torch import load_file as safe_load_file
23
- from transformers import GPT2Config, PreTrainedModel, ViTModel, ViTConfig
 
 
24
  from transformers.models.bert.modeling_bert import (
25
  BaseModelOutputWithPoolingAndCrossAttentions,
26
  MaskedLMOutput,
@@ -28,11 +30,14 @@ from transformers.models.bert.modeling_bert import (
28
  )
29
  from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME
30
  from transformers.utils.hub import cached_file, get_checkpoint_shard_files
31
- from transformers.modeling_outputs import BaseModelOutputWithPast
32
- from torch.nn.modules.utils import _pair
33
 
34
  from .configuration_hf_nomic_bert import NomicBertConfig
35
 
 
 
 
 
 
36
  logger = logging.getLogger(__name__)
37
 
38
 
@@ -66,9 +71,7 @@ def state_dict_from_pretrained(model_name, safe_serialization=False, device=None
66
  else: # Try loading from HF hub instead of from local files
67
  resolved_archive_file = None
68
  for weight_name in [WEIGHTS_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_INDEX_NAME]:
69
- resolved_archive_file = cached_file(
70
- model_name, weight_name, _raise_exceptions_for_missing_entries=False
71
- )
72
  if resolved_archive_file is not None:
73
  if weight_name in [SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME]:
74
  load_safe = True
@@ -273,18 +276,20 @@ def remap_bert_state_dict(
273
 
274
  return state_dict
275
 
276
-
277
  def _trunc_normal_(tensor, mean, std, a, b):
278
  # Cut & paste from PyTorch official master until it's in a few official releases - RW
279
  # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
280
  def norm_cdf(x):
281
  # Computes standard normal cumulative distribution function
282
- return (1. + math.erf(x / math.sqrt(2.))) / 2.
283
 
284
  if (mean < a - 2 * std) or (mean > b + 2 * std):
285
- print("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
286
- "The distribution of values may be incorrect.",
287
- stacklevel=2)
 
 
288
 
289
  # Values are generated by using a truncated uniform distribution and
290
  # then using the inverse CDF for the normal distribution.
@@ -301,14 +306,15 @@ def _trunc_normal_(tensor, mean, std, a, b):
301
  tensor.erfinv_()
302
 
303
  # Transform to proper mean, std
304
- tensor.mul_(std * math.sqrt(2.))
305
  tensor.add_(mean)
306
 
307
  # Clamp to ensure it's in the proper range
308
  tensor.clamp_(min=a, max=b)
309
  return tensor
310
 
311
- def trunc_normal_tf_(tensor, mean=0., std=1., a=-2., b=2.):
 
312
  r"""Fills the input Tensor with values drawn from a truncated
313
  normal distribution. The values are effectively drawn from the
314
  normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
@@ -449,11 +455,13 @@ def _init_weights(module, initializer_range=0.02):
449
  if module.padding_idx is not None:
450
  nn.init.zeros_(module.weight[module.padding_idx])
451
 
 
452
  def _ntuple(n):
453
  def parse(x):
454
  if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
455
  return tuple(x)
456
  return tuple(repeat(x, n))
 
457
  return parse
458
 
459
 
@@ -481,7 +489,7 @@ def get_2d_sincos_pos_embed(embed_dim, grid_size, add_cls_token=False):
481
  position embeddings (with or without classification token)
482
  """
483
  grid_h = np.arange(grid_size, dtype=np.float32)
484
-
485
  grid_w = np.arange(grid_size, dtype=np.float32)
486
  grid = np.meshgrid(grid_w, grid_h) # here w goes first
487
  grid = np.stack(grid, axis=0)
@@ -525,6 +533,7 @@ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
525
  emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
526
  return emb
527
 
 
528
  def ndgrid(*tensors) -> Tuple[torch.Tensor, ...]:
529
  """generate N-D grid in dimension order.
530
 
@@ -548,18 +557,19 @@ def ndgrid(*tensors) -> Tuple[torch.Tensor, ...]:
548
  # the old behaviour of meshgrid was 'ij'
549
  return torch.meshgrid(*tensors)
550
 
 
551
  def build_fourier_pos_embed(
552
- feat_shape: List[int],
553
- bands: Optional[torch.Tensor] = None,
554
- num_bands: int = 64,
555
- max_res: int = 224,
556
- temperature: float = 10000.,
557
- linear_bands: bool = False,
558
- include_grid: bool = False,
559
- in_pixels: bool = True,
560
- ref_feat_shape: Optional[List[int]] = None,
561
- dtype: torch.dtype = torch.float32,
562
- device: Optional[torch.device] = None,
563
  ) -> List[torch.Tensor]:
564
  """
565
 
@@ -601,7 +611,7 @@ def build_fourier_pos_embed(
601
  dtype = bands.dtype
602
 
603
  if in_pixels:
604
- t = [torch.linspace(-1., 1., steps=s, device=device, dtype=torch.float32) for s in feat_shape]
605
  else:
606
  t = [torch.arange(s, device=device, dtype=torch.int64).to(torch.float32) for s in feat_shape]
607
 
@@ -619,16 +629,16 @@ def build_fourier_pos_embed(
619
 
620
 
621
  def build_rotary_pos_embed(
622
- feat_shape: List[int],
623
- bands: Optional[torch.Tensor] = None,
624
- dim: int = 64,
625
- max_res: int = 224,
626
- temperature: float = 10000.,
627
- linear_bands: bool = False,
628
- in_pixels: bool = True,
629
- ref_feat_shape: Optional[List[int]] = None,
630
- dtype: torch.dtype = torch.float32,
631
- device: Optional[torch.device] = None,
632
  ):
633
  """
634
 
@@ -666,22 +676,23 @@ def build_rotary_pos_embed(
666
  cos_emb = cos_emb.reshape(num_spatial_dim, -1).repeat_interleave(2, -1)
667
  return sin_emb, cos_emb
668
 
 
669
  def freq_bands(
670
- num_bands: int,
671
- temperature: float = 10000.,
672
- step: int = 2,
673
- device: Optional[torch.device] = None,
674
  ) -> torch.Tensor:
675
  exp = torch.arange(0, num_bands, step, dtype=torch.int64, device=device).to(torch.float32) / num_bands
676
- bands = 1. / (temperature ** exp)
677
  return bands
678
 
679
-
680
  def pixel_freq_bands(
681
- num_bands: int,
682
- max_freq: float = 224.,
683
- linear_bands: bool = True,
684
- device: Optional[torch.device] = None,
685
  ):
686
  if linear_bands:
687
  bands = torch.linspace(1.0, max_freq / 2, num_bands, dtype=torch.float32, device=device)
@@ -689,18 +700,21 @@ def pixel_freq_bands(
689
  bands = 2 ** torch.linspace(0, math.log(max_freq, 2) - 1, num_bands, dtype=torch.float32, device=device)
690
  return bands * torch.pi
691
 
 
692
  def rot(x):
693
  return torch.stack([-x[..., 1::2], x[..., ::2]], -1).reshape(x.shape)
694
 
 
695
  def apply_rot_embed_cat(x: torch.Tensor, emb):
696
  sin_emb, cos_emb = emb.tensor_split(2, -1)
697
  if sin_emb.ndim == 3:
698
  return x * cos_emb.unsqueeze(1).expand_as(x) + rot(x) * sin_emb.unsqueeze(1).expand_as(x)
699
  return x * cos_emb + rot(x) * sin_emb
700
 
 
701
  # taken from https://github.com/huggingface/pytorch-image-models/blob/cb0e4391beedcc5ac3ae4bce16561b95c326f32c/timm/layers/pos_embed_sincos.py#L363
702
  class NomicVisionRotaryEmbeddingCat(nn.Module):
703
- """ Rotary position embedding w/ concatenatd sin & cos
704
 
705
  The following impl/resources were referenced for this impl:
706
  * https://github.com/lucidrains/vit-pytorch/blob/6f3a5fcf0bca1c5ec33a35ef48d97213709df4ba/vit_pytorch/rvt.py
@@ -708,14 +722,14 @@ class NomicVisionRotaryEmbeddingCat(nn.Module):
708
  """
709
 
710
  def __init__(
711
- self,
712
- dim,
713
- max_res=224,
714
- temperature=10000,
715
- in_pixels=True,
716
- linear_bands: bool = False,
717
- feat_shape: Optional[List[int]] = None,
718
- ref_feat_shape: Optional[List[int]] = None,
719
  ):
720
  super().__init__()
721
  self.dim = dim
@@ -782,6 +796,7 @@ class NomicVisionRotaryEmbeddingCat(nn.Module):
782
  pos_embed = self.get_embed(x.shape[2:])
783
  return apply_rot_embed_cat(x, pos_embed)
784
 
 
785
  class NomicVisionPatchEmbeddings(nn.Module):
786
  def __init__(
787
  self,
@@ -803,13 +818,19 @@ class NomicVisionPatchEmbeddings(nn.Module):
803
  self.sinusoidal_pos_embedding = False
804
  self.no_embed_class = getattr(config, "no_embed_class", False)
805
 
806
- self.cls_token = nn.Parameter(torch.zeros(1, 1, config.n_embd)) if not getattr(config, "no_cls_token", False) else None
 
 
807
  if config.learned_pos_embedding:
808
  # this is the default in DINO
809
  self.learned_pos_embedding = True
810
  # hack for timm dinov2 with registers
811
  num_patches = self.num_patches if getattr(config, "register_tokens", 0) > 0 else self.num_patches + 1
812
- self.pos_embed = nn.Parameter(torch.randn(1, num_patches, config.n_embd) * 0.02) if getattr(config, "use_pos_embed", True) else None
 
 
 
 
813
  elif getattr(config, "sinusoidal_pos_embedding", False):
814
  self.sinusoidal_pos_embedding = True
815
  if getattr(config, "use_pos_embed", True):
@@ -819,12 +840,16 @@ class NomicVisionPatchEmbeddings(nn.Module):
819
  else:
820
  self.pos_embed = None
821
  else:
822
- self.pos_embed = nn.Parameter(torch.randn(1, self.num_patches + 1, config.n_embd) * 0.02) if getattr(config, "use_pos_embed", True) else None
 
 
 
 
823
 
824
  if getattr(config, "register_tokens", 0) > 0:
825
  self.reg_token = nn.Parameter(torch.randn(1, config.register_tokens, config.n_embd) * 0.02)
826
  else:
827
- self.reg_token = None
828
 
829
  if config.mask_token:
830
  self.mask_token = nn.Parameter(torch.zeros(1, config.n_embd))
@@ -843,7 +868,6 @@ class NomicVisionPatchEmbeddings(nn.Module):
843
  else:
844
  self.rope = None
845
 
846
-
847
  def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
848
  """
849
  This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
@@ -913,7 +937,7 @@ class NomicVisionPatchEmbeddings(nn.Module):
913
  embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
914
  else:
915
  if self.pos_embed is not None:
916
- embeddings = embeddings + self.pos_embed
917
  if to_cat:
918
  embeddings = torch.cat(to_cat + [embeddings], dim=1)
919
  else:
@@ -924,7 +948,7 @@ class NomicVisionPatchEmbeddings(nn.Module):
924
  embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
925
  else:
926
  if self.pos_embed is not None:
927
- embeddings = embeddings + self.pos_embed
928
 
929
  embeddings = self.patch_dropout(embeddings)
930
 
@@ -1350,8 +1374,12 @@ class NomicBertAttention(nn.Module):
1350
  qkv = rearrange(qkv, "b h three s d -> b s three h d")
1351
  elif rope is not None:
1352
  q, k, v = qkv.permute(0, 3, 1, 2, 4).unbind(dim=-2)
1353
- q = torch.cat([q[:, :, :self.num_prefix_tokens], apply_rot_embed_cat(q[:, :, self.num_prefix_tokens:], rope)], dim=2).type_as(q)
1354
- k = torch.cat([k[:, :, :self.num_prefix_tokens], apply_rot_embed_cat(k[:, :, self.num_prefix_tokens:], rope)], dim=2).type_as(q)
 
 
 
 
1355
 
1356
  qkv = torch.stack([q, k, v], dim=-2)
1357
  qkv = rearrange(qkv, "b h s three d -> b s three h d")
@@ -1361,15 +1389,20 @@ class NomicBertAttention(nn.Module):
1361
  query = query.permute(0, 2, 1, 3)
1362
  key = key.permute(0, 2, 1, 3)
1363
  value = value.permute(0, 2, 1, 3)
 
 
 
 
 
 
 
 
1364
 
1365
- attention_scores = torch.matmul(query, key.transpose(-1, -2)) / self.norm_factor
1366
- if attention_mask is not None:
1367
- attention_scores = attention_scores + attention_mask
1368
 
1369
- attentions_probs = F.softmax(attention_scores, dim=-1)
1370
- attentions_probs = self.drop(attentions_probs)
1371
 
1372
- attn_output = torch.matmul(attentions_probs, value)
1373
  attn_output = rearrange(attn_output.permute(0, 2, 1, 3), "... h d -> ... (h d)")
1374
 
1375
  attn_output = self.out_proj(attn_output)
@@ -1807,6 +1840,7 @@ class NomicBertForSequenceClassification(NomicBertPreTrainedModel):
1807
  attentions=outputs.attentions,
1808
  )
1809
 
 
1810
  def hf_vit_config_to_vit_config(vit_config: ViTConfig) -> GPT2Config:
1811
  return GPT2Config(
1812
  n_embd=vit_config.hidden_size,
@@ -1814,7 +1848,7 @@ def hf_vit_config_to_vit_config(vit_config: ViTConfig) -> GPT2Config:
1814
  n_head=vit_config.num_attention_heads,
1815
  n_inner=vit_config.intermediate_size,
1816
  activation_function=vit_config.hidden_act,
1817
- vocab_size=0, # no vocab since using patches
1818
  n_positions=0, # No absolute position embedding
1819
  resid_pdrop=0.0, # No dropout
1820
  embd_pdrop=getattr(vit_config, "dropout", 0.0),
@@ -1850,15 +1884,12 @@ def hf_vit_config_to_vit_config(vit_config: ViTConfig) -> GPT2Config:
1850
  mask_token=False,
1851
  learned_pos_embedding=False,
1852
  patch_dropout=0,
1853
- sinusoidal_pos_embedding=vit_config.model_type == "vit_mae"
1854
  )
1855
 
1856
-
1857
  class NomicAttentionPooling(nn.Module):
1858
- def __init__(
1859
- self,
1860
- config
1861
- ):
1862
  super().__init__()
1863
  self.embed_dim = config.n_embd
1864
  self.use_flash_attn = config.use_flash_attn
@@ -1879,7 +1910,7 @@ class NomicAttentionPooling(nn.Module):
1879
 
1880
  self.Wq = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_proj_bias)
1881
  self.Wkv = nn.Linear(self.embed_dim, kv_dim, bias=config.qkv_proj_bias)
1882
-
1883
  self.latent = nn.Parameter(torch.zeros(1, 1, self.embed_dim))
1884
 
1885
  self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_proj_bias)
@@ -1887,7 +1918,7 @@ class NomicAttentionPooling(nn.Module):
1887
  self.drop = nn.Dropout(config.attn_pdrop)
1888
 
1889
  def init_weights(self):
1890
- trunc_normal_tf_(self.latent, std=self.embed_dim ** -0.5)
1891
 
1892
  def forward(
1893
  self,
@@ -1938,7 +1969,7 @@ class NomicAttentionPooling(nn.Module):
1938
 
1939
  return attn_output
1940
 
1941
-
1942
  class NomicMultiHeadAttentionPooling(nn.Module):
1943
  def __init__(
1944
  self,
@@ -1993,15 +2024,16 @@ class NomicMultiHeadAttentionPooling(nn.Module):
1993
  """
1994
 
1995
  attn_outputs = self.attn(
1996
- hidden_states,
1997
- attention_mask=attention_mask,
1998
- )
1999
 
2000
  normed = self.norm1(attn_outputs)
2001
  hidden_states = hidden_states + self.mlp(normed)
2002
 
2003
  return hidden_states
2004
 
 
2005
  class NomicVisionPreTrainedModel(PreTrainedModel):
2006
  """An abstract class to handle weights initialization and
2007
  a simple interface for dowloading and loading pretrained models.
@@ -2025,6 +2057,7 @@ class NomicVisionPreTrainedModel(PreTrainedModel):
2025
  )
2026
  self.config = config
2027
 
 
2028
  class NomicVisionModel(NomicVisionPreTrainedModel):
2029
  def __init__(self, config):
2030
  super().__init__(config)
@@ -2035,7 +2068,9 @@ class NomicVisionModel(NomicVisionPreTrainedModel):
2035
  self.selector = NomicMultiHeadAttentionPooling(config)
2036
 
2037
  self.global_pool = getattr(config, "global_pool", None)
2038
- self.num_prefix_tokens = (1 if not getattr(config, "no_cls_token", False) else 0) + getattr(config, "register_tokens", 0)
 
 
2039
 
2040
  self.apply(partial(_init_weights, initializer_range=config.initializer_range))
2041
 
@@ -2052,20 +2087,22 @@ class NomicVisionModel(NomicVisionPreTrainedModel):
2052
 
2053
  original_dtype = embeddings.dtype
2054
 
2055
- hidden_states = embeddings
2056
  # unused but easier to pass to gradient checkpointing as words
2057
  residual = None
2058
  for layer in self.layers:
2059
  # need to pass none for backwards compatability
2060
- hidden_states, _, residual = layer(hidden_states, None, residual=residual, is_padded_inputs=False, rope=rope)
 
 
2061
 
2062
  hidden_states = hidden_states + residual
2063
  if self.global_pool == "avg":
2064
- hidden_states = hidden_states[:, self.num_prefix_tokens:].mean(dim=1)
2065
 
2066
  pooled_output = self.selector(hidden_states)
2067
 
2068
  return BaseModelOutputWithPast(
2069
  last_hidden_state=pooled_output,
2070
  hidden_states=hidden_states,
2071
- )
 
3
  # https://github.com/mlcommons/training_results_v2.0/blob/main/HazyResearch/benchmarks/bert/implementations/pytorch/modeling.py
4
  # https://github.com/mlcommons/training_results_v2.1/blob/main/Azure-HazyResearch/benchmarks/bert/implementations/ND96amsr_A100_v4/modeling.py
5
 
6
+ import collections
7
  import logging
8
 
9
  # Inspired by https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py
10
  import math
 
 
11
  import os
12
  import re
13
  from collections import OrderedDict
14
  from functools import partial
15
  from typing import List, Optional, Tuple, Union
16
 
17
+ import numpy as np
18
  import torch
19
  import torch.nn as nn
20
  import torch.nn.functional as F
21
  from einops import rearrange, repeat
22
  from safetensors.torch import load_file as safe_load_file
23
+ from torch.nn.modules.utils import _pair
24
+ from transformers import GPT2Config, PreTrainedModel, ViTConfig, ViTModel
25
+ from transformers.modeling_outputs import BaseModelOutputWithPast
26
  from transformers.models.bert.modeling_bert import (
27
  BaseModelOutputWithPoolingAndCrossAttentions,
28
  MaskedLMOutput,
 
30
  )
31
  from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME
32
  from transformers.utils.hub import cached_file, get_checkpoint_shard_files
 
 
33
 
34
  from .configuration_hf_nomic_bert import NomicBertConfig
35
 
36
+ try:
37
+ from torch.nn.functional import scaled_dot_product_attention
38
+ except ImportError:
39
+ scaled_dot_product_attention = None
40
+
41
  logger = logging.getLogger(__name__)
42
 
43
 
 
71
  else: # Try loading from HF hub instead of from local files
72
  resolved_archive_file = None
73
  for weight_name in [WEIGHTS_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_INDEX_NAME]:
74
+ resolved_archive_file = cached_file(model_name, weight_name, _raise_exceptions_for_missing_entries=False)
 
 
75
  if resolved_archive_file is not None:
76
  if weight_name in [SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME]:
77
  load_safe = True
 
276
 
277
  return state_dict
278
 
279
+
280
  def _trunc_normal_(tensor, mean, std, a, b):
281
  # Cut & paste from PyTorch official master until it's in a few official releases - RW
282
  # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
283
  def norm_cdf(x):
284
  # Computes standard normal cumulative distribution function
285
+ return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
286
 
287
  if (mean < a - 2 * std) or (mean > b + 2 * std):
288
+ print(
289
+ "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
290
+ "The distribution of values may be incorrect.",
291
+ stacklevel=2,
292
+ )
293
 
294
  # Values are generated by using a truncated uniform distribution and
295
  # then using the inverse CDF for the normal distribution.
 
306
  tensor.erfinv_()
307
 
308
  # Transform to proper mean, std
309
+ tensor.mul_(std * math.sqrt(2.0))
310
  tensor.add_(mean)
311
 
312
  # Clamp to ensure it's in the proper range
313
  tensor.clamp_(min=a, max=b)
314
  return tensor
315
 
316
+
317
+ def trunc_normal_tf_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
318
  r"""Fills the input Tensor with values drawn from a truncated
319
  normal distribution. The values are effectively drawn from the
320
  normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
 
455
  if module.padding_idx is not None:
456
  nn.init.zeros_(module.weight[module.padding_idx])
457
 
458
+
459
  def _ntuple(n):
460
  def parse(x):
461
  if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
462
  return tuple(x)
463
  return tuple(repeat(x, n))
464
+
465
  return parse
466
 
467
 
 
489
  position embeddings (with or without classification token)
490
  """
491
  grid_h = np.arange(grid_size, dtype=np.float32)
492
+
493
  grid_w = np.arange(grid_size, dtype=np.float32)
494
  grid = np.meshgrid(grid_w, grid_h) # here w goes first
495
  grid = np.stack(grid, axis=0)
 
533
  emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
534
  return emb
535
 
536
+
537
  def ndgrid(*tensors) -> Tuple[torch.Tensor, ...]:
538
  """generate N-D grid in dimension order.
539
 
 
557
  # the old behaviour of meshgrid was 'ij'
558
  return torch.meshgrid(*tensors)
559
 
560
+
561
  def build_fourier_pos_embed(
562
+ feat_shape: List[int],
563
+ bands: Optional[torch.Tensor] = None,
564
+ num_bands: int = 64,
565
+ max_res: int = 224,
566
+ temperature: float = 10000.0,
567
+ linear_bands: bool = False,
568
+ include_grid: bool = False,
569
+ in_pixels: bool = True,
570
+ ref_feat_shape: Optional[List[int]] = None,
571
+ dtype: torch.dtype = torch.float32,
572
+ device: Optional[torch.device] = None,
573
  ) -> List[torch.Tensor]:
574
  """
575
 
 
611
  dtype = bands.dtype
612
 
613
  if in_pixels:
614
+ t = [torch.linspace(-1.0, 1.0, steps=s, device=device, dtype=torch.float32) for s in feat_shape]
615
  else:
616
  t = [torch.arange(s, device=device, dtype=torch.int64).to(torch.float32) for s in feat_shape]
617
 
 
629
 
630
 
631
  def build_rotary_pos_embed(
632
+ feat_shape: List[int],
633
+ bands: Optional[torch.Tensor] = None,
634
+ dim: int = 64,
635
+ max_res: int = 224,
636
+ temperature: float = 10000.0,
637
+ linear_bands: bool = False,
638
+ in_pixels: bool = True,
639
+ ref_feat_shape: Optional[List[int]] = None,
640
+ dtype: torch.dtype = torch.float32,
641
+ device: Optional[torch.device] = None,
642
  ):
643
  """
644
 
 
676
  cos_emb = cos_emb.reshape(num_spatial_dim, -1).repeat_interleave(2, -1)
677
  return sin_emb, cos_emb
678
 
679
+
680
  def freq_bands(
681
+ num_bands: int,
682
+ temperature: float = 10000.0,
683
+ step: int = 2,
684
+ device: Optional[torch.device] = None,
685
  ) -> torch.Tensor:
686
  exp = torch.arange(0, num_bands, step, dtype=torch.int64, device=device).to(torch.float32) / num_bands
687
+ bands = 1.0 / (temperature**exp)
688
  return bands
689
 
690
+
691
  def pixel_freq_bands(
692
+ num_bands: int,
693
+ max_freq: float = 224.0,
694
+ linear_bands: bool = True,
695
+ device: Optional[torch.device] = None,
696
  ):
697
  if linear_bands:
698
  bands = torch.linspace(1.0, max_freq / 2, num_bands, dtype=torch.float32, device=device)
 
700
  bands = 2 ** torch.linspace(0, math.log(max_freq, 2) - 1, num_bands, dtype=torch.float32, device=device)
701
  return bands * torch.pi
702
 
703
+
704
  def rot(x):
705
  return torch.stack([-x[..., 1::2], x[..., ::2]], -1).reshape(x.shape)
706
 
707
+
708
  def apply_rot_embed_cat(x: torch.Tensor, emb):
709
  sin_emb, cos_emb = emb.tensor_split(2, -1)
710
  if sin_emb.ndim == 3:
711
  return x * cos_emb.unsqueeze(1).expand_as(x) + rot(x) * sin_emb.unsqueeze(1).expand_as(x)
712
  return x * cos_emb + rot(x) * sin_emb
713
 
714
+
715
  # taken from https://github.com/huggingface/pytorch-image-models/blob/cb0e4391beedcc5ac3ae4bce16561b95c326f32c/timm/layers/pos_embed_sincos.py#L363
716
  class NomicVisionRotaryEmbeddingCat(nn.Module):
717
+ """Rotary position embedding w/ concatenatd sin & cos
718
 
719
  The following impl/resources were referenced for this impl:
720
  * https://github.com/lucidrains/vit-pytorch/blob/6f3a5fcf0bca1c5ec33a35ef48d97213709df4ba/vit_pytorch/rvt.py
 
722
  """
723
 
724
  def __init__(
725
+ self,
726
+ dim,
727
+ max_res=224,
728
+ temperature=10000,
729
+ in_pixels=True,
730
+ linear_bands: bool = False,
731
+ feat_shape: Optional[List[int]] = None,
732
+ ref_feat_shape: Optional[List[int]] = None,
733
  ):
734
  super().__init__()
735
  self.dim = dim
 
796
  pos_embed = self.get_embed(x.shape[2:])
797
  return apply_rot_embed_cat(x, pos_embed)
798
 
799
+
800
  class NomicVisionPatchEmbeddings(nn.Module):
801
  def __init__(
802
  self,
 
818
  self.sinusoidal_pos_embedding = False
819
  self.no_embed_class = getattr(config, "no_embed_class", False)
820
 
821
+ self.cls_token = (
822
+ nn.Parameter(torch.zeros(1, 1, config.n_embd)) if not getattr(config, "no_cls_token", False) else None
823
+ )
824
  if config.learned_pos_embedding:
825
  # this is the default in DINO
826
  self.learned_pos_embedding = True
827
  # hack for timm dinov2 with registers
828
  num_patches = self.num_patches if getattr(config, "register_tokens", 0) > 0 else self.num_patches + 1
829
+ self.pos_embed = (
830
+ nn.Parameter(torch.randn(1, num_patches, config.n_embd) * 0.02)
831
+ if getattr(config, "use_pos_embed", True)
832
+ else None
833
+ )
834
  elif getattr(config, "sinusoidal_pos_embedding", False):
835
  self.sinusoidal_pos_embedding = True
836
  if getattr(config, "use_pos_embed", True):
 
840
  else:
841
  self.pos_embed = None
842
  else:
843
+ self.pos_embed = (
844
+ nn.Parameter(torch.randn(1, self.num_patches + 1, config.n_embd) * 0.02)
845
+ if getattr(config, "use_pos_embed", True)
846
+ else None
847
+ )
848
 
849
  if getattr(config, "register_tokens", 0) > 0:
850
  self.reg_token = nn.Parameter(torch.randn(1, config.register_tokens, config.n_embd) * 0.02)
851
  else:
852
+ self.reg_token = None
853
 
854
  if config.mask_token:
855
  self.mask_token = nn.Parameter(torch.zeros(1, config.n_embd))
 
868
  else:
869
  self.rope = None
870
 
 
871
  def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
872
  """
873
  This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
 
937
  embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
938
  else:
939
  if self.pos_embed is not None:
940
+ embeddings = embeddings + self.pos_embed
941
  if to_cat:
942
  embeddings = torch.cat(to_cat + [embeddings], dim=1)
943
  else:
 
948
  embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
949
  else:
950
  if self.pos_embed is not None:
951
+ embeddings = embeddings + self.pos_embed
952
 
953
  embeddings = self.patch_dropout(embeddings)
954
 
 
1374
  qkv = rearrange(qkv, "b h three s d -> b s three h d")
1375
  elif rope is not None:
1376
  q, k, v = qkv.permute(0, 3, 1, 2, 4).unbind(dim=-2)
1377
+ q = torch.cat(
1378
+ [q[:, :, : self.num_prefix_tokens], apply_rot_embed_cat(q[:, :, self.num_prefix_tokens :], rope)], dim=2
1379
+ ).type_as(q)
1380
+ k = torch.cat(
1381
+ [k[:, :, : self.num_prefix_tokens], apply_rot_embed_cat(k[:, :, self.num_prefix_tokens :], rope)], dim=2
1382
+ ).type_as(q)
1383
 
1384
  qkv = torch.stack([q, k, v], dim=-2)
1385
  qkv = rearrange(qkv, "b h s three d -> b s three h d")
 
1389
  query = query.permute(0, 2, 1, 3)
1390
  key = key.permute(0, 2, 1, 3)
1391
  value = value.permute(0, 2, 1, 3)
1392
+ if scaled_dot_product_attention is not None:
1393
+ attn_output = F.scaled_dot_product_attention(
1394
+ query, key, value, attn_mask=attention_mask, dropout_p=self.drop.p, is_causal=False
1395
+ )
1396
+ else:
1397
+ attention_scores = torch.matmul(query, key.transpose(-1, -2)) / self.norm_factor
1398
+ if attention_mask is not None:
1399
+ attention_scores = attention_scores + attention_mask
1400
 
1401
+ attentions_probs = F.softmax(attention_scores, dim=-1)
1402
+ attentions_probs = self.drop(attentions_probs)
 
1403
 
1404
+ attn_output = torch.matmul(attentions_probs, value)
 
1405
 
 
1406
  attn_output = rearrange(attn_output.permute(0, 2, 1, 3), "... h d -> ... (h d)")
1407
 
1408
  attn_output = self.out_proj(attn_output)
 
1840
  attentions=outputs.attentions,
1841
  )
1842
 
1843
+
1844
  def hf_vit_config_to_vit_config(vit_config: ViTConfig) -> GPT2Config:
1845
  return GPT2Config(
1846
  n_embd=vit_config.hidden_size,
 
1848
  n_head=vit_config.num_attention_heads,
1849
  n_inner=vit_config.intermediate_size,
1850
  activation_function=vit_config.hidden_act,
1851
+ vocab_size=0, # no vocab since using patches
1852
  n_positions=0, # No absolute position embedding
1853
  resid_pdrop=0.0, # No dropout
1854
  embd_pdrop=getattr(vit_config, "dropout", 0.0),
 
1884
  mask_token=False,
1885
  learned_pos_embedding=False,
1886
  patch_dropout=0,
1887
+ sinusoidal_pos_embedding=vit_config.model_type == "vit_mae",
1888
  )
1889
 
1890
+
1891
  class NomicAttentionPooling(nn.Module):
1892
+ def __init__(self, config):
 
 
 
1893
  super().__init__()
1894
  self.embed_dim = config.n_embd
1895
  self.use_flash_attn = config.use_flash_attn
 
1910
 
1911
  self.Wq = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_proj_bias)
1912
  self.Wkv = nn.Linear(self.embed_dim, kv_dim, bias=config.qkv_proj_bias)
1913
+
1914
  self.latent = nn.Parameter(torch.zeros(1, 1, self.embed_dim))
1915
 
1916
  self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_proj_bias)
 
1918
  self.drop = nn.Dropout(config.attn_pdrop)
1919
 
1920
  def init_weights(self):
1921
+ trunc_normal_tf_(self.latent, std=self.embed_dim**-0.5)
1922
 
1923
  def forward(
1924
  self,
 
1969
 
1970
  return attn_output
1971
 
1972
+
1973
  class NomicMultiHeadAttentionPooling(nn.Module):
1974
  def __init__(
1975
  self,
 
2024
  """
2025
 
2026
  attn_outputs = self.attn(
2027
+ hidden_states,
2028
+ attention_mask=attention_mask,
2029
+ )
2030
 
2031
  normed = self.norm1(attn_outputs)
2032
  hidden_states = hidden_states + self.mlp(normed)
2033
 
2034
  return hidden_states
2035
 
2036
+
2037
  class NomicVisionPreTrainedModel(PreTrainedModel):
2038
  """An abstract class to handle weights initialization and
2039
  a simple interface for dowloading and loading pretrained models.
 
2057
  )
2058
  self.config = config
2059
 
2060
+
2061
  class NomicVisionModel(NomicVisionPreTrainedModel):
2062
  def __init__(self, config):
2063
  super().__init__(config)
 
2068
  self.selector = NomicMultiHeadAttentionPooling(config)
2069
 
2070
  self.global_pool = getattr(config, "global_pool", None)
2071
+ self.num_prefix_tokens = (1 if not getattr(config, "no_cls_token", False) else 0) + getattr(
2072
+ config, "register_tokens", 0
2073
+ )
2074
 
2075
  self.apply(partial(_init_weights, initializer_range=config.initializer_range))
2076
 
 
2087
 
2088
  original_dtype = embeddings.dtype
2089
 
2090
+ hidden_states = embeddings
2091
  # unused but easier to pass to gradient checkpointing as words
2092
  residual = None
2093
  for layer in self.layers:
2094
  # need to pass none for backwards compatability
2095
+ hidden_states, _, residual = layer(
2096
+ hidden_states, None, residual=residual, is_padded_inputs=False, rope=rope
2097
+ )
2098
 
2099
  hidden_states = hidden_states + residual
2100
  if self.global_pool == "avg":
2101
+ hidden_states = hidden_states[:, self.num_prefix_tokens :].mean(dim=1)
2102
 
2103
  pooled_output = self.selector(hidden_states)
2104
 
2105
  return BaseModelOutputWithPast(
2106
  last_hidden_state=pooled_output,
2107
  hidden_states=hidden_states,
2108
+ )
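
The functional core of this diff is the attention path in NomicBertAttention: when torch.nn.functional.scaled_dot_product_attention can be imported it is used with the additive attention_mask, and otherwise the original matmul / softmax / dropout path runs. Below is a minimal, self-contained sketch of that fallback pattern, not the model's actual module: the helper name attend, the sqrt(head_dim) scaling standing in for self.norm_factor, and the gating of dropout_p on a training flag are my assumptions (the diff passes self.drop.p unconditionally, and SDPA applies dropout_p regardless of a module's train/eval state).

import math

import torch
import torch.nn.functional as F

# Guard the import the same way the diff does: torch builds before 2.0
# do not ship torch.nn.functional.scaled_dot_product_attention.
try:
    from torch.nn.functional import scaled_dot_product_attention
except ImportError:
    scaled_dot_product_attention = None


def attend(query, key, value, attention_mask=None, dropout_p=0.0, training=False):
    """Toy stand-in for the non-flash attention branch.

    query/key/value: (batch, heads, seq, head_dim). attention_mask is an
    additive float mask broadcastable to (batch, heads, seq, seq), with
    large negative values at masked positions.
    """
    if scaled_dot_product_attention is not None:
        # SDPA accepts the same additive float mask; apply dropout only
        # while training, since SDPA has no notion of eval mode.
        return F.scaled_dot_product_attention(
            query, key, value,
            attn_mask=attention_mask,
            dropout_p=dropout_p if training else 0.0,
            is_causal=False,
        )

    # Manual fallback: scores -> additive mask -> softmax -> dropout -> values.
    norm_factor = math.sqrt(query.shape[-1])
    scores = torch.matmul(query, key.transpose(-1, -2)) / norm_factor
    if attention_mask is not None:
        scores = scores + attention_mask
    probs = torch.softmax(scores, dim=-1)
    probs = F.dropout(probs, p=dropout_p, training=training)
    return torch.matmul(probs, value)


if __name__ == "__main__":
    torch.manual_seed(0)
    q = torch.randn(2, 4, 8, 16)
    k = torch.randn(2, 4, 8, 16)
    v = torch.randn(2, 4, 8, 16)
    mask = torch.zeros(2, 1, 1, 8)
    mask[:, :, :, -2:] = torch.finfo(torch.float32).min  # mask the last two keys
    print(attend(q, k, v, attention_mask=mask).shape)  # torch.Size([2, 4, 8, 16])

On torch >= 2.0 the SDPA branch runs; on older builds the manual branch produces an output of the same shape, so either path can back the same module.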
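A smaller note on the _trunc_normal_ hunk: both the removed and the added versions pass stacklevel=2 to print, which print() does not accept, so the out-of-range warning branch would raise a TypeError if it ever fired. The PyTorch helper the comment says this was cut and pasted from emits the message via warnings.warn, which does take stacklevel. A sketch of that guard follows; the wrapper name _warn_if_outside is mine, the message is the one from the diff.

import warnings


def _warn_if_outside(mean, std, a, b):
    # Same guard as in _trunc_normal_, but routed through warnings.warn,
    # which (unlike print) accepts the stacklevel keyword.
    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn(
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",
            stacklevel=2,
        )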