soldni committed
Commit e5a7913
1 Parent(s): 3cf55ab

Update modeling_molmo.py

Files changed (1)
  1. modeling_molmo.py +184 -205
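
Note: modeling_molmo.py is the standalone modeling file shipped with the checkpoint repo, so it is normally loaded through transformers remote code rather than imported directly. A minimal loading sketch, assuming a Hub repo whose config maps MolmoForCausalLM via auto_map (the repo id below is a placeholder, not taken from this commit):

# Hedged sketch: placeholder repo id; trust_remote_code pulls in modeling_molmo.py.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "allenai/Molmo-7B-D-0924",  # placeholder repo id
    trust_remote_code=True,
    torch_dtype="auto",
)
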
modeling_molmo.py CHANGED
@@ -32,13 +32,13 @@ import einops
32
  from transformers import PreTrainedModel
33
  from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput
34
 
35
- from olmo.util import resource_path
36
  from .configuration_molmo import (
37
  MolmoConfig,
38
  VisionBackboneConfig,
39
  VisionBackboneType,
40
  ImagePooling2DType,
41
- ImageProjectType,
42
  AttentionType,
43
  MolmoConfigurationError,
44
  )
@@ -54,6 +54,20 @@ else:
54
  log = logging.getLogger(__name__)
55
 
56
57
  def ensure_finite_(x: torch.Tensor, check_neg_inf: bool = True, check_pos_inf: bool = False):
58
  """
59
  Modify ``x`` in place to replace ``float("-inf")`` with the minimum value of the dtype when ``check_neg_inf``
@@ -106,7 +120,7 @@ class Embedding(nn.Module):
106
  def reset_parameters(self):
107
  nn.init.normal_(self.embedding, std=self.initializer_range)
108
  nn.init.normal_(self.new_embedding, std=self.new_embed_initializer_range)
109
-
110
  def forward(self, x: torch.Tensor) -> torch.Tensor:
111
  return F.embedding(x, torch.cat([self.embedding, self.new_embedding], dim=0))
112
 
@@ -131,7 +145,7 @@ class Dropout(nn.Dropout):
131
  if self.p == 0.0 and (self.mask_p is None or self.mask_p == 0.0):
132
  return input
133
  else:
134
- if self.mask_p > 0. and self.training:
135
  assert drop_mask is not None
136
  drop_mask = drop_mask.to(input.dtype)
137
  keep_prob = 1.0 - self.p
@@ -143,7 +157,7 @@ class Dropout(nn.Dropout):
143
  multiplier = input.new_empty(dropout_shape).bernoulli_(keep_prob)
144
  multiplier.div_(keep_prob)
145
  return input * multiplier
146
- elif self.p > 0. and len(self.broadcast_dims) > 0 and self.training:
147
  keep_prob = 1.0 - self.p
148
  dropout_shape = list(input.shape)
149
  for dim in self.broadcast_dims:
@@ -212,7 +226,6 @@ class LayerNorm(LayerNormBase):
212
  else:
213
  return tensor
214
 
215
-
216
  def forward(self, x: torch.Tensor) -> torch.Tensor:
217
  if self.low_precision:
218
  module_device = x.device
@@ -227,7 +240,7 @@ class LayerNorm(LayerNormBase):
227
  )
228
  else:
229
  return F.layer_norm(x, self.normalized_shape, weight=self.weight, bias=self.bias, eps=self.eps)
230
-
231
  def reset_parameters(self):
232
  if self.weight is not None:
233
  torch.nn.init.ones_(self.weight) # type: ignore
@@ -239,6 +252,7 @@ class RMSLayerNorm(LayerNormBase):
239
  """
240
  RMS layer norm, a simplified :class:`LayerNorm` implementation
241
  """
 
242
  def __init__(
243
  self,
244
  config: MolmoConfig,
@@ -263,7 +277,7 @@ class RMSLayerNorm(LayerNormBase):
263
  return self.weight * x
264
  else:
265
  return x
266
-
267
  def _cast_if_autocast_enabled(self, tensor: torch.Tensor, dtype: Optional[torch.dtype] = None) -> torch.Tensor:
268
  # NOTE: `is_autocast_enabled()` only checks for CUDA autocast, so we use the separate function
269
  # `is_autocast_cpu_enabled()` for CPU autocast.
@@ -274,7 +288,7 @@ class RMSLayerNorm(LayerNormBase):
274
  return tensor.to(dtype=dtype if dtype is not None else torch.get_autocast_cpu_dtype())
275
  else:
276
  return tensor
277
-
278
  def reset_parameters(self):
279
  if self.weight is not None:
280
  torch.nn.init.ones_(self.weight) # type: ignore
@@ -293,8 +307,7 @@ class RotaryEmbedding(nn.Module):
293
  self.__cache = cache
294
  # Warm up cache.
295
  self.get_rotary_embedding(
296
- config.max_position_embeddings or config.max_sequence_length,
297
- _non_meta_init_device(config)
298
  )
299
 
300
  def get_rotary_embedding(self, seq_len: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -313,8 +326,14 @@ class RotaryEmbedding(nn.Module):
313
  return pos_sin[:, :, :seq_len, :], pos_cos[:, :, :seq_len, :]
314
 
315
  with torch.autocast(device.type, enabled=False):
316
- dim = self.config.head_dim if self.config.head_dim is not None else self.config.d_model // self.config.n_heads
317
- inv_freq = 1.0 / (self.config.rope_theta ** (torch.arange(0, dim, 2, device=device, dtype=torch.float) / dim))
318
  seq = torch.arange(seq_len, device=device, dtype=torch.float)
319
  freqs = einsum("i , j -> i j", seq, inv_freq)
320
  if self.config.rope_impl == "cockatoo":
@@ -346,10 +365,7 @@ class RotaryEmbedding(nn.Module):
346
  return ((t * pos_cos) + (self.rotate_half(t) * pos_sin)).to(t.dtype)
347
 
348
  def forward(
349
- self,
350
- q: torch.Tensor,
351
- k: torch.Tensor,
352
- position_ids: Optional[torch.Tensor] = None
353
  ) -> Tuple[torch.Tensor, torch.Tensor]:
354
  if self.config.rope_full_precision:
355
  q_, k_ = q.float(), k.float()
@@ -360,7 +376,7 @@ class RotaryEmbedding(nn.Module):
360
  batch_size = q_.shape[0]
361
  query_len, key_len = q_.shape[-2], k_.shape[-2] # could be different if layer_past not None
362
  if position_ids is not None:
363
- freqs_cis_len = (self.config.max_position_embeddings or self.config.max_sequence_length)
364
  else:
365
  freqs_cis_len = key_len
366
  pos_sin, pos_cos = self.get_rotary_embedding(freqs_cis_len, q_.device)
@@ -368,12 +384,8 @@ class RotaryEmbedding(nn.Module):
368
  pos_cos = pos_cos.type_as(q_)
369
  if position_ids is not None:
370
  assert query_len == key_len, "Query and key lengths must be equal when using position IDs."
371
- pos_sin = pos_sin[0, 0][position_ids].view(
372
- (batch_size, 1, key_len, pos_sin.shape[-1])
373
- )
374
- pos_cos = pos_cos[0, 0][position_ids].view(
375
- (batch_size, 1, key_len, pos_cos.shape[-1])
376
- )
377
  q_ = self.apply_rotary_pos_emb(
378
  pos_sin[:, :, key_len - query_len : key_len, :],
379
  pos_cos[:, :, key_len - query_len : key_len, :],
@@ -466,11 +478,7 @@ def get_causal_attention_bias(cache: BufferCache, seq_len: int, device: torch.de
466
 
467
 
468
  class MolmoAttention(nn.Module):
469
- def __init__(
470
- self,
471
- config: MolmoConfig,
472
- cache: BufferCache
473
- ):
474
  super().__init__()
475
  self.config = config
476
  self.__cache = cache
@@ -478,8 +486,7 @@ class MolmoAttention(nn.Module):
478
  self.k_norm: Optional[LayerNormBase] = None
479
  self.q_norm: Optional[LayerNormBase] = None
480
  self.hidden_size = (
481
- config.mlp_hidden_size if config.mlp_hidden_size is not None \
482
- else config.mlp_ratio * config.d_model
483
  )
484
 
485
  if config.attention_layer_norm:
@@ -508,29 +515,25 @@ class MolmoAttention(nn.Module):
508
  config.n_kv_heads * head_dim,
509
  )
510
  self.att_proj = nn.Linear(
511
- config.d_model, sum(self.fused_dims),
 
512
  bias=config.include_bias or config.qkv_bias,
513
- device=config.init_device
514
- )
515
- self.attn_out = nn.Linear(
516
- input_dim, config.d_model,
517
- bias=config.include_bias,
518
- device=config.init_device
519
  )
520
- self.attn_norm = RMSLayerNorm(
521
- config,
522
- size=config.d_model,
523
- eps=config.layer_norm_eps)
524
-
525
- self.flash_attn_func = None
526
  if self.config.attention_type == AttentionType.flash:
527
  try:
528
  from flash_attn import flash_attn_func
 
529
  self.flash_attn_func = flash_attn_func
530
  except ModuleNotFoundError:
531
  pass
532
 
533
- def attention(self,
 
534
  q: torch.Tensor,
535
  k: torch.Tensor,
536
  v: torch.Tensor,
@@ -541,7 +544,7 @@ class MolmoAttention(nn.Module):
541
  use_cache: bool = False,
542
  ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
543
  B, T, C = q.size() # batch size, sequence length, d_model
544
- dtype = k.dtype
545
 
546
  # Optionally apply layer norm to keys and queries.
547
  if self.q_norm is not None and self.k_norm is not None:
@@ -658,15 +661,7 @@ class MolmoAttention(nn.Module):
658
  is_causal=is_causal,
659
  )
660
 
661
- def forward(
662
- self,
663
- x,
664
- attention_bias,
665
- position_ids,
666
- drop_mask,
667
- layer_past,
668
- use_cache
669
- ):
670
  if not self.config.norm_after:
671
  atten_in = self.attn_norm(x)
672
  else:
@@ -678,54 +673,45 @@ class MolmoAttention(nn.Module):
678
  qkv.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
679
 
680
  q, k, v = qkv.split(self.fused_dims, dim=-1)
681
-
682
  # Get attention scores.
683
  att, cache = self.attention(
684
- q, k, v,
685
  attention_bias,
686
  position_ids=position_ids,
687
  drop_mask=drop_mask,
688
  layer_past=layer_past,
689
- use_cache=use_cache
690
  )
691
-
692
  if self.config.norm_after:
693
  att = self.attn_norm(att)
694
-
695
  return att, cache
696
 
697
 
698
  class MolmoMLP(nn.Module):
699
- def __init__(
700
- self,
701
- config: MolmoConfig
702
- ):
703
  # Feed-forward input projection.
704
  super().__init__()
705
  self.config = config
706
  self.hidden_size = (
707
- config.mlp_hidden_size if config.mlp_hidden_size is not None \
708
- else config.mlp_ratio * config.d_model
709
  )
710
  self.act = SwiGLU(config)
711
  self.ff_proj = nn.Linear(
712
- config.d_model,
713
- self.hidden_size,
714
- bias=config.include_bias,
715
- device=config.init_device
716
- )
717
  self.ff_out = nn.Linear(
718
  int(self.act.output_multiplier * self.hidden_size),
719
  config.d_model,
720
  bias=config.include_bias,
721
  device=config.init_device,
722
  )
723
- self.ff_norm = RMSLayerNorm(
724
- config,
725
- size=config.d_model,
726
- eps=config.layer_norm_eps
727
- )
728
-
729
  def forward(self, x):
730
  if not self.config.norm_after:
731
  x = self.ff_norm(x)
@@ -744,12 +730,8 @@ class MolmoDecoderLayer(nn.Module):
744
  """
745
  A base class for transformer block implementations.
746
  """
747
- def __init__(
748
- self,
749
- layer_id: int,
750
- config: MolmoConfig,
751
- cache: BufferCache
752
- ):
753
  super().__init__()
754
  self.self_attn = MolmoAttention(config, cache)
755
  self.mlp = MolmoMLP(config)
@@ -763,10 +745,7 @@ class MolmoDecoderLayer(nn.Module):
763
  assert config.d_model % config.n_heads == 0
764
 
765
  # Dropout.
766
- self.dropout = Dropout(
767
- config.residual_dropout,
768
- mask_p=config.response_residual_dropout
769
- )
770
 
771
  def forward(
772
  self,
@@ -787,12 +766,12 @@ class MolmoDecoderLayer(nn.Module):
787
  """
788
 
789
  att, cache = self.self_attn(
790
- x,
791
  attention_bias=attention_bias,
792
  position_ids=position_ids,
793
  drop_mask=drop_mask,
794
  layer_past=layer_past,
795
- use_cache=use_cache
796
  )
797
  x = x + self.dropout(att, drop_mask=drop_mask)
798
  og_x = x
@@ -822,7 +801,7 @@ class MultiHeadDotProductAttention(nn.Module):
822
  super().__init__()
823
  self.config = config
824
  self.use_bias = use_bias
825
-
826
  v_cfg = config.vision_backbone
827
  self.embed_dim = v_cfg.image_emb_dim
828
  self.num_heads = v_cfg.image_num_heads
@@ -862,7 +841,7 @@ class MultiHeadDotProductAttention(nn.Module):
862
  if v_cfg.attention_dropout > 0:
863
  self.attention_dropout = Dropout(v_cfg.attention_dropout, broadcast_dims=(0, 1))
864
  self.residual_dropout = Dropout(v_cfg.residual_dropout)
865
-
866
  def reset_parameters(self):
867
  nn.init.normal_(self.wq.weight, std=self.initializer_range)
868
  nn.init.normal_(self.wk.weight, std=self.initializer_range)
@@ -879,15 +858,15 @@ class MultiHeadDotProductAttention(nn.Module):
879
 
880
  def _merge_heads(self, hidden_states) -> torch.Tensor:
881
  return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
882
-
883
- def forward(self, inputs_q: torch.Tensor, inputs_kv: Optional[torch.Tensor] = None) -> torch.Tensor:
884
  if inputs_kv is not None:
885
  inputs_k = inputs_kv
886
  inputs_v = inputs_kv
887
  else:
888
  inputs_k = inputs_q
889
  inputs_v = inputs_q
890
-
891
  xq, xk, xv = self.wq(inputs_q), self.wk(inputs_k), self.wv(inputs_v)
892
 
893
  xq = self._split_heads(xq, self.num_heads)
@@ -918,7 +897,7 @@ class MultiHeadDotProductAttention(nn.Module):
918
  xk.transpose(1, 2).contiguous(),
919
  xv.transpose(1, 2).contiguous(),
920
  is_causal=False,
921
- dropout_p=self.config.vision_backbone.attention_dropout
922
  ).transpose(1, 2)
923
  else:
924
  raise NotImplementedError(self.config.attention_type)
@@ -940,7 +919,7 @@ class MultiHeadAttentionPool(nn.Module):
940
  output_layer: bool = True,
941
  mean_residual: bool = False,
942
  query: str = "mean",
943
- is_vit_layer: Optional[bool] = True
944
  ):
945
  super().__init__()
946
  self.config = config
@@ -950,7 +929,7 @@ class MultiHeadAttentionPool(nn.Module):
950
  self.output_layer = output_layer
951
  self.mean_residual = mean_residual
952
  self.query = query
953
-
954
  v_cfg = config.vision_backbone
955
  input_dim = v_cfg.image_emb_dim
956
  self.embed_dim = v_cfg.image_emb_dim * factor
@@ -985,7 +964,9 @@ class MultiHeadAttentionPool(nn.Module):
985
  if query == "vector":
986
  self.attention_query = nn.Parameter(
987
  torch.zeros(
988
- 1, self.num_key_value_heads * self.head_dim, device=config.init_device,
989
  ),
990
  )
991
 
@@ -1024,7 +1005,6 @@ class MultiHeadAttentionPool(nn.Module):
1024
  return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
1025
 
1026
  def forward(self, inputs_kv: torch.Tensor) -> torch.Tensor:
1027
-
1028
  xk, xv = self.wk(inputs_kv), self.wv(inputs_kv)
1029
 
1030
  if self.query == "mean":
@@ -1093,14 +1073,14 @@ class ViTMLP(nn.Module):
1093
  bias=True,
1094
  device=config.init_device,
1095
  )
1096
-
1097
  def reset_parameters(self):
1098
  v_cfg = self.config.vision_backbone
1099
  nn.init.trunc_normal_(self.w1.weight, std=math.sqrt(1 / v_cfg.image_emb_dim), a=-2.0, b=2.0)
1100
  nn.init.trunc_normal_(self.w2.weight, std=math.sqrt(1 / v_cfg.image_mlp_dim), a=-2.0, b=2.0)
1101
  nn.init.zeros_(self.w1.bias)
1102
  nn.init.zeros_(self.w2.bias)
1103
-
1104
  def forward(self, x: torch.Tensor) -> torch.Tensor:
1105
  x = self.w1(x)
1106
  x = self.act(x)
@@ -1111,7 +1091,7 @@ class ViTMLP(nn.Module):
1111
  class MLP(nn.Module):
1112
  def __init__(self, config: MolmoConfig, input_dim: int, dropout: float = 0.0):
1113
  super().__init__()
1114
- self.config = config
1115
  self.hidden_size = (
1116
  config.mlp_hidden_size if config.mlp_hidden_size is not None else config.mlp_ratio * config.d_model
1117
  )
@@ -1135,15 +1115,15 @@ class MLP(nn.Module):
1135
  bias=False,
1136
  device=config.init_device,
1137
  )
1138
- #`MLP` assume the activation takes two inputs, so it must be a 'llama' version.
1139
  self.act = LlamaSwiGLU(config)
1140
  self.dropout = Dropout(dropout)
1141
-
1142
  def reset_parameters(self):
1143
  nn.init.normal_(self.w1.weight, std=self.initializer_range)
1144
  nn.init.normal_(self.w2.weight, std=self.initializer_range)
1145
  nn.init.normal_(self.w3.weight, std=self.initializer_range)
1146
-
1147
  def forward(self, x: torch.Tensor) -> torch.Tensor:
1148
  x = self.w2(self.act(self.w1(x), self.w3(x)))
1149
  x = self.dropout(x)
@@ -1154,26 +1134,26 @@ class Residual(nn.Module):
1154
  def __init__(self, submodule: nn.Module):
1155
  super().__init__()
1156
  self.submodule = submodule
1157
-
1158
  def reset_parameters(self):
1159
  self.submodule.reset_parameters()
1160
-
1161
  def forward(self, x: torch.Tensor) -> torch.Tensor:
1162
  return x + self.submodule(x)
1163
 
1164
 
1165
  class LayerNormFp32(nn.LayerNorm):
1166
- """Subclass torch's LayerNorm to handle fp16 (by casting to float32 and back).
1167
- Derived from https://github.com/mlfoundations/open_clip/blob/main/src/open_clip/transformer.py.
1168
- """
1169
-
1170
- def forward(self, x: torch.Tensor) -> torch.Tensor:
1171
- orig_type = x.dtype
1172
- if self.training:
1173
- x = F.layer_norm(x.to(torch.float32), self.normalized_shape, self.weight, self.bias, self.eps)
1174
- else:
1175
- x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
1176
- return x.to(orig_type)
1177
 
1178
 
1179
  class ResidualAttentionBlock(nn.Module):
@@ -1200,7 +1180,7 @@ class ResidualAttentionBlock(nn.Module):
1200
  self.feed_forward.reset_parameters()
1201
  self.attention_norm.reset_parameters()
1202
  self.ffn_norm.reset_parameters()
1203
-
1204
  def forward(self, x: torch.Tensor) -> torch.Tensor:
1205
  x = x + self.attention(self.attention_norm(x))
1206
  x = x + self.feed_forward(self.ffn_norm(x))
@@ -1213,10 +1193,8 @@ class BlockCollection(nn.Module):
1213
  self.config = config
1214
 
1215
  v_cfg = config.vision_backbone
1216
- self.resblocks = nn.ModuleList([
1217
- ResidualAttentionBlock(config) for _ in range(v_cfg.image_num_layers)
1218
- ])
1219
-
1220
  def reset_parameters(self):
1221
  for r in self.resblocks:
1222
  r.reset_parameters()
@@ -1240,7 +1218,7 @@ class VisionTransformer(nn.Module):
1240
 
1241
  v_cfg = config.vision_backbone
1242
  # class embeddings and positional embeddings
1243
- self.scale = v_cfg.image_emb_dim ** -0.5
1244
  self.class_embedding = nn.Parameter(
1245
  torch.zeros(v_cfg.image_emb_dim, device=config.init_device),
1246
  )
@@ -1264,14 +1242,14 @@ class VisionTransformer(nn.Module):
1264
  )
1265
 
1266
  self.transformer = BlockCollection(config)
1267
-
1268
  def reset_parameters(self):
1269
  nn.init.normal_(self.class_embedding, std=self.scale)
1270
  nn.init.normal_(self.positional_embedding, std=self.scale)
1271
  nn.init.normal_(self.patch_embedding.weight, std=0.02)
1272
  self.pre_ln.reset_parameters()
1273
  self.transformer.reset_parameters()
1274
-
1275
  def add_pos_emb(self, x: torch.Tensor, patch_num: int) -> torch.Tensor:
1276
  cls_emb = self.positional_embedding[0:1]
1277
  pos_emb = self.positional_embedding[1:]
@@ -1279,7 +1257,7 @@ class VisionTransformer(nn.Module):
1279
  pos_emb = pos_emb.reshape(
1280
  (int(math.sqrt(pos_emb.shape[0])), int(math.sqrt(pos_emb.shape[0])), pos_emb.shape[1])
1281
  )
1282
-
1283
  (patch_num_0, patch_num_1) = patch_num
1284
 
1285
  if pos_emb.shape[0] != patch_num_0 or pos_emb.shape[1] != patch_num_1:
@@ -1287,7 +1265,11 @@ class VisionTransformer(nn.Module):
1287
  # antialias: default True in jax.image.resize
1288
  pos_emb = pos_emb.unsqueeze(0).permute(0, 3, 1, 2)
1289
  pos_emb = F.interpolate(
1290
- pos_emb, size=(patch_num_0, patch_num_1), mode="bicubic", align_corners=False, antialias=True,
1291
  )
1292
  pos_emb = pos_emb.permute(0, 2, 3, 1).squeeze(0)
1293
 
@@ -1355,7 +1337,7 @@ class MolmoVisionBackbone(nn.Module):
1355
  input_dim = nlayers * config.vision_backbone.image_emb_dim
1356
  else:
1357
  raise NotImplementedError(f"Unknown image pooling 2D method: {config.image_pooling_2d}")
1358
-
1359
  self.input_dim = input_dim
1360
 
1361
  self.image_projector = MLP(config, input_dim)
@@ -1380,9 +1362,11 @@ class MolmoVisionBackbone(nn.Module):
1380
  self.image_projector.reset_parameters()
1381
 
1382
  @abstractmethod
1383
- def forward(self, images: torch.Tensor, image_masks: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
1384
  raise NotImplementedError
1385
-
1386
 
1387
  class MolmoPretrainedVisionBackbone(MolmoVisionBackbone):
1388
  def __init__(self, config: MolmoConfig):
@@ -1408,13 +1392,11 @@ class MolmoPretrainedVisionBackbone(MolmoVisionBackbone):
1408
 
1409
  self.pad_embed = None
1410
  if config.image_padding_embed:
1411
- image_dim = v_cfg.image_emb_dim*len(self.config.vit_layers)
1412
  if config.image_padding_embed in ["pad_embed", "regress"]:
1413
- self.pad_embed = nn.Parameter(
1414
- torch.zeros((image_dim,), device=config.init_device))
1415
  elif config.image_padding_embed == "pad_and_partial_pad":
1416
- self.pad_embed = nn.Parameter(
1417
- torch.zeros((2, image_dim), device=config.init_device))
1418
  else:
1419
  raise ValueError(config.image_padding_embed)
1420
 
@@ -1423,7 +1405,8 @@ class MolmoPretrainedVisionBackbone(MolmoVisionBackbone):
1423
  if self.config.vit_load_path:
1424
  vit_load_path = Path(self.config.vit_load_path)
1425
  state_dict_path = resource_path(
1426
- vit_load_path.parent, vit_load_path.name,
 
1427
  local_cache=vit_load_path.parent,
1428
  )
1429
  assert state_dict_path.is_file(), f"Model file {str(state_dict_path)} not found"
@@ -1441,7 +1424,7 @@ class MolmoPretrainedVisionBackbone(MolmoVisionBackbone):
1441
  self.image_vit.reset_parameters()
1442
  if self.config.use_cls_feature:
1443
  nn.init.xavier_uniform_(self.cls_projector.weight)
1444
-
1445
  def encode_image(self, images: torch.Tensor) -> torch.Tensor:
1446
  """
1447
  : param images: (batch_size, num_crops, num_patch, n_pixels)
@@ -1469,15 +1452,17 @@ class MolmoPretrainedVisionBackbone(MolmoVisionBackbone):
1469
  if self.num_prefix_tokens > 0:
1470
  cls_embed = image_features[:, 0]
1471
  image_features = image_features[:, 1:]
1472
-
1473
  image_features = image_features * mask
1474
  image_features = image_features.view(B, T, N, -1)
1475
 
1476
  cls_embed = cls_embed.view(B, T, -1) if cls_embed is not None else None
1477
 
1478
  return image_features, cls_embed
1479
-
1480
- def forward(self, images: torch.Tensor, image_masks: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
  cfg = self.config
1482
 
1483
  # image_features: (batch_size, num_crops(=num_image), num_patch, nximage_emb_dim)
@@ -1493,12 +1478,16 @@ class MolmoPretrainedVisionBackbone(MolmoVisionBackbone):
1493
  image_features = image_features + pad_embed * torch.unsqueeze(all_pad, -1)
1494
  elif cfg.image_padding_embed == "regress":
1495
  pad_embed = self.pad_embed[None, None, None, :]
1496
- image_features = image_features + pad_embed * torch.unsqueeze(torch.maximum(image_masks, torch.zeros_like(image_masks)), -1)
1497
  elif cfg.image_padding_embed == "pad_and_partial_pad":
1498
  og_dtype = image_features.dtype
1499
  pad_embed = self.pad_embed[:, None, None, None, :]
1500
  all_pad = image_masks == 0
1501
- partial_pad = torch.logical_and(image_masks < 1, torch.logical_not(all_pad)).to(dtype=torch.float32)
1502
  all_pad = all_pad.to(dtype=torch.float32)
1503
  image_features = image_features + pad_embed[0] * torch.unsqueeze(all_pad, -1)
1504
  image_features = image_features + pad_embed[1] * torch.unsqueeze(partial_pad, -1)
@@ -1509,7 +1498,7 @@ class MolmoPretrainedVisionBackbone(MolmoVisionBackbone):
1509
  image_features = self.image_feature_dropout(image_features)
1510
  if cls_embed is not None:
1511
  cls_embed = self.image_feature_dropout(cls_embed)
1512
-
1513
  image_features = image_features.reshape(
1514
  (batch_size, num_image) + cfg.vision_backbone.image_num_patch + (-1,),
1515
  )
@@ -1520,11 +1509,11 @@ class MolmoPretrainedVisionBackbone(MolmoVisionBackbone):
1520
  image_features,
1521
  (0, 0, 0, 1, 0, 1, 0, 0, 0, 0),
1522
  )
1523
-
1524
  # image pooling
1525
  image_features = einops.rearrange(
1526
  image_features,
1527
- 'b n (h dh) (w dw) c -> (b n h w) (dh dw) c',
1528
  dh=cfg.image_pooling_h,
1529
  dw=cfg.image_pooling_w,
1530
  )
@@ -1546,7 +1535,7 @@ class MolmoPretrainedVisionBackbone(MolmoVisionBackbone):
1546
  image_features = module(image_features)
1547
  else:
1548
  image_features = self.image_projector(image_features)
1549
-
1550
  if self.config.use_cls_feature:
1551
  cls_embed = self.cls_projector(cls_embed)
1552
  if cfg.image_projector == ImageProjectType.mlpx2:
@@ -1554,7 +1543,7 @@ class MolmoPretrainedVisionBackbone(MolmoVisionBackbone):
1554
  cls_embed = module(cls_embed)
1555
  else:
1556
  cls_embed = self.image_projector(cls_embed)
1557
-
1558
  # image_features: (batch_size, num_image, num_patch, d_model)
1559
  # cls_embed: (batch_size, num_image, d_model)
1560
  return image_features, cls_embed
@@ -1579,11 +1568,7 @@ class MolmoPretrainedModel(PreTrainedModel):
1579
 
1580
 
1581
  class MolmoModel(MolmoPretrainedModel):
1582
- def __init__(
1583
- self,
1584
- config: MolmoConfig,
1585
- init_params: bool = True
1586
- ):
1587
  super().__init__(config)
1588
  self.config = config
1589
  self.__cache = BufferCache()
@@ -1616,10 +1601,10 @@ class MolmoModel(MolmoPretrainedModel):
1616
  config.d_model,
1617
  device=config.init_device,
1618
  initializer_range=config.initializer_range,
1619
- new_embed_initializer_range=config.new_embedding_init_range
1620
  )
1621
  else:
1622
- wte=nn.Embedding(
1623
  config.embedding_size or config.vocab_size, config.d_model, device=config.init_device
1624
  )
1625
 
@@ -1627,26 +1612,20 @@ class MolmoModel(MolmoPretrainedModel):
1627
  dict(
1628
  wte=wte,
1629
  emb_drop=Dropout(config.embedding_dropout),
1630
- ln_f=RMSLayerNorm(
1631
- config,
1632
- size=config.d_model,
1633
- eps=config.layer_norm_eps),
1634
  )
1635
  )
1636
 
1637
- layers = [
1638
- MolmoDecoderLayer(i, config, self.__cache) \
1639
- for i in range(config.n_layers)
1640
- ]
1641
  self.transformer.update({"layers": nn.ModuleList(layers)})
1642
-
1643
  self.vision_backbone: Optional[MolmoVisionBackbone] = None
1644
  if config.vision_backbone is not None:
1645
  self.vision_backbone = MolmoVisionBackbone.build(config)
1646
 
1647
  if self.vision_backbone is not None:
1648
  self.vision_backbone.reset_with_pretrained_weights()
1649
-
1650
  @property
1651
  def device(self) -> torch.device:
1652
  device: torch.device = self.transformer.wte.weight.device # type: ignore
@@ -1655,7 +1634,6 @@ class MolmoModel(MolmoPretrainedModel):
1655
  else:
1656
  return device
1657
 
1658
-
1659
  def forward(
1660
  self,
1661
  input_ids: torch.LongTensor,
@@ -1716,7 +1694,9 @@ class MolmoModel(MolmoPretrainedModel):
1716
  has_image = images is not None
1717
 
1718
  assert not (has_image and input_embeddings is not None), "Cannot provide both images and input embeddings."
1719
- assert not (has_image and past_key_values is not None), "Cached key and values should not be used with images."
1720
 
1721
  batch_size, seq_len = input_ids.size() if input_embeddings is None else input_embeddings.size()[:2]
1722
  if past_key_values is None:
@@ -1730,16 +1710,17 @@ class MolmoModel(MolmoPretrainedModel):
1730
 
1731
  if self.config.use_position_ids and attention_mask is None:
1732
  attention_mask = input_ids != -1
1733
-
1734
  if subsegment_ids is not None:
1735
  assert not use_cache, "Subsegment_ids cannot be used with cache."
1736
  subsegment_mask = subsegment_ids.unsqueeze(2) <= subsegment_ids.unsqueeze(1)
1737
  attention_mask = (
1738
- subsegment_mask.to(attention_mask.dtype) *
1739
- attention_mask.unsqueeze(2) *
1740
- attention_mask.unsqueeze(1))
 
1741
  if position_ids is None:
1742
- raise ValueError(f"Positioned ids must be given if using subsegment_ids")
1743
  else:
1744
  if self.config.use_position_ids and position_ids is None:
1745
  position_ids = torch.clamp(
@@ -1776,10 +1757,8 @@ class MolmoModel(MolmoPretrainedModel):
1776
 
1777
  if self.config.use_cls_feature:
1778
  x = torch.cat([x[:, :1], cls_embed, x[:, 1:-num_image]], dim=1)
1779
-
1780
- valid_images = torch.any(
1781
- (image_input_idx >= 0).view(batch_size, num_image, num_patch), dim=-1
1782
- )
1783
  valid_images = valid_images.to(attention_mask.dtype)
1784
  attention_mask = torch.cat(
1785
  [attention_mask[:, :1], valid_images, attention_mask[:, 1:-num_image]],
@@ -1796,13 +1775,13 @@ class MolmoModel(MolmoPretrainedModel):
1796
 
1797
  # normalized
1798
  if self.config.normalize_input_embeds:
1799
- x = x * (self.config.d_model ** 0.5)
1800
 
1801
  # Transform the attention mask into what the blocks expect.
1802
  if attention_mask is not None:
1803
  # shape: (batch_size, 1, 1, seq_len)
1804
  if len(attention_mask.shape) == 2:
1805
- attention_mask = attention_mask[:, :past_length + seq_len]
1806
  attention_mask = attention_mask.to(dtype=torch.float).view(batch_size, -1)[:, None, None, :]
1807
  else:
1808
  attention_mask = attention_mask.unsqueeze(1).to(dtype=torch.float)
@@ -1852,16 +1831,23 @@ class MolmoModel(MolmoPretrainedModel):
1852
 
1853
  layer_past = None if past_key_values is None else past_key_values[block_idx]
1854
  # shape: (batch_size, seq_len, d_model)
1855
- x, cache = layer(x, attention_bias=attention_bias, position_ids=position_ids, drop_mask=response_mask, layer_past=layer_past, use_cache=use_cache)
1856
 
1857
  if attn_key_values is not None:
1858
  assert cache is not None
1859
  attn_key_values.append(cache)
1860
-
1861
  if images is not None and self.config.use_cls_feature:
1862
  assert num_image is not None
1863
  x = torch.cat(
1864
- [x[:, :1], x[:, num_image+1:], torch.zeros_like(x[:, :num_image])],
1865
  dim=1,
1866
  )
1867
 
@@ -1869,7 +1855,8 @@ class MolmoModel(MolmoPretrainedModel):
1869
  # shape: (batch_size, 1, d_model)
1870
  if append_last_valid_logits is not None:
1871
  last_valid_output = x[
1872
- torch.arange(x.shape[0], device=x.device), append_last_valid_logits.to(x.device)]
 
1873
  x = last_valid_output.unsqueeze(1)
1874
  else:
1875
  x = x[:, -1, :].unsqueeze(1)
@@ -1886,23 +1873,20 @@ class MolmoModel(MolmoPretrainedModel):
1886
  return MolmoOutput(
1887
  last_hidden_states=x,
1888
  attn_key_values=attn_key_values,
1889
- hidden_states=tuple(all_hidden_states) \
1890
- if output_hidden_states else None
1891
- )
1892
 
1893
 
1894
  class MolmoForCausalLM(PreTrainedModel):
1895
  """
1896
  Extremely barebones HF model wrapper.
1897
  """
 
1898
  config_class = MolmoConfig
1899
  base_model_prefix = "model"
1900
  _no_split_modules = ["MolmoDecoderLayer"]
1901
 
1902
- def __init__(
1903
- self,
1904
- config: MolmoConfig
1905
- ):
1906
  super().__init__(config)
1907
  # model_config = create_model_config_from_pretrained_config(config)
1908
  # Initialize model (always on CPU to start with so we don't run out of GPU memory).
@@ -1972,7 +1956,7 @@ class MolmoForCausalLM(PreTrainedModel):
1972
  output_hidden_states=output_hidden_states,
1973
  append_last_valid_logits=append_last_valid_logits,
1974
  )
1975
-
1976
  x = outputs.last_hidden_states
1977
  if self.config.weight_tying:
1978
  logits = F.linear(x, self.model.transformer.wte.weight, None) # type: ignore
@@ -1981,15 +1965,16 @@ class MolmoForCausalLM(PreTrainedModel):
1981
 
1982
  if self.config.scale_logits:
1983
  logits.mul_(1 / math.sqrt(self.config.d_model))
1984
-
1985
  if self.config.final_logit_softcapping is not None:
1986
  logits = logits / self.config.final_logit_softcapping
1987
  logits = torch.tanh(logits)
1988
  logits = logits * self.config.final_logit_softcapping
1989
-
1990
  if not last_logits_only and append_last_valid_logits is not None:
1991
  last_valid_logit = logits[
1992
1993
  logits = torch.cat([logits[:, :-1], last_valid_logit[:, None]], dim=1)
1994
 
1995
  loss = None
@@ -2001,7 +1986,7 @@ class MolmoForCausalLM(PreTrainedModel):
2001
  labels.masked_fill_(~(loss_masks > 0), -100)
2002
  labels = labels.view(-1)
2003
  logits_for_loss = logits.to(torch.float32).view(-1, logits.size(-1))
2004
- loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='none')
2005
  loss = loss_fct(logits_for_loss, labels)
2006
  loss = loss.view(input_ids.shape[0], -1)
2007
  loss = loss * loss_masks
@@ -2063,10 +2048,7 @@ class MolmoForCausalLM(PreTrainedModel):
2063
  append_last_valid_logits: Optional[torch.Tensor] = None
2064
  if self.config.use_position_ids and attention_mask is None:
2065
  attention_mask = input_ids != -1
2066
- position_ids = torch.clamp(
2067
- torch.cumsum(attention_mask.to(torch.int32), dim=-1) - 1,
2068
- min=0
2069
- )
2070
  append_last_valid_logits = attention_mask.long().sum(dim=-1) - 1
2071
  attention_mask = torch.cat(
2072
  [attention_mask, attention_mask.new_ones((batch_size, max_new_tokens))],
@@ -2074,7 +2056,7 @@ class MolmoForCausalLM(PreTrainedModel):
2074
  )
2075
  if attention_mask is not None:
2076
  assert attention_mask.shape == (batch_size, mask_len)
2077
-
2078
  out = super().generate(
2079
  input_ids,
2080
  generation_config,
@@ -2088,7 +2070,7 @@ class MolmoForCausalLM(PreTrainedModel):
2088
  )
2089
 
2090
  return out
2091
-
2092
  def prepare_inputs_for_generation(
2093
  self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple]] = None, **kwargs
2094
  ):
@@ -2116,7 +2098,7 @@ class MolmoForCausalLM(PreTrainedModel):
2116
  model_inputs["image_masks"] = image_masks
2117
  model_inputs["image_input_idx"] = image_input_idx
2118
  model_inputs["append_last_valid_logits"] = append_last_valid_logits
2119
- else:
2120
  model_inputs = {"input_ids": input_ids, "past_key_values": past_key_values}
2121
 
2122
  model_inputs.update(kwargs)
@@ -2236,7 +2218,4 @@ class MolmoForCausalLM(PreTrainedModel):
2236
  # Tie weights again if needed
2237
  self.tie_weights()
2238
 
2239
- return model_embeds
2240
-
2241
- # Always register for multi-modal features
2242
- AutoModelForCausalLM.register(MolmoConfig, MolmoForCausalLM)
 
32
  from transformers import PreTrainedModel
33
  from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput
34
 
35
+ # from olmo.util import resource_path
36
  from .configuration_molmo import (
37
  MolmoConfig,
38
  VisionBackboneConfig,
39
  VisionBackboneType,
40
  ImagePooling2DType,
41
+ ImageProjectType,
42
  AttentionType,
43
  MolmoConfigurationError,
44
  )
 
54
  log = logging.getLogger(__name__)
55
 
56
 
57
+ def resource_path(
58
+ folder: Union[str, Path],
59
+ fname: str,
60
+ local_cache: Optional[Union[str, Path]] = None,
61
+ ) -> Path:
62
+ if local_cache is not None and (local_path := Path(local_cache) / fname).is_file():
63
+ log.info(f"Found local cache of {fname} at {local_path}")
64
+ return local_path
65
+ else:
66
+ from cached_path import cached_path
67
+
68
+ return cached_path(f"{str(folder).rstrip('/')}/{fname}")
69
+
70
+
71
  def ensure_finite_(x: torch.Tensor, check_neg_inf: bool = True, check_pos_inf: bool = False):
72
  """
73
  Modify ``x`` in place to replace ``float("-inf")`` with the minimum value of the dtype when ``check_neg_inf``
 
120
  def reset_parameters(self):
121
  nn.init.normal_(self.embedding, std=self.initializer_range)
122
  nn.init.normal_(self.new_embedding, std=self.new_embed_initializer_range)
123
+
124
  def forward(self, x: torch.Tensor) -> torch.Tensor:
125
  return F.embedding(x, torch.cat([self.embedding, self.new_embedding], dim=0))
126
 
 
145
  if self.p == 0.0 and (self.mask_p is None or self.mask_p == 0.0):
146
  return input
147
  else:
148
+ if self.mask_p > 0.0 and self.training:
149
  assert drop_mask is not None
150
  drop_mask = drop_mask.to(input.dtype)
151
  keep_prob = 1.0 - self.p
 
157
  multiplier = input.new_empty(dropout_shape).bernoulli_(keep_prob)
158
  multiplier.div_(keep_prob)
159
  return input * multiplier
160
+ elif self.p > 0.0 and len(self.broadcast_dims) > 0 and self.training:
161
  keep_prob = 1.0 - self.p
162
  dropout_shape = list(input.shape)
163
  for dim in self.broadcast_dims:
 
226
  else:
227
  return tensor
228
 
 
229
  def forward(self, x: torch.Tensor) -> torch.Tensor:
230
  if self.low_precision:
231
  module_device = x.device
 
240
  )
241
  else:
242
  return F.layer_norm(x, self.normalized_shape, weight=self.weight, bias=self.bias, eps=self.eps)
243
+
244
  def reset_parameters(self):
245
  if self.weight is not None:
246
  torch.nn.init.ones_(self.weight) # type: ignore
 
252
  """
253
  RMS layer norm, a simplified :class:`LayerNorm` implementation
254
  """
255
+
256
  def __init__(
257
  self,
258
  config: MolmoConfig,
 
277
  return self.weight * x
278
  else:
279
  return x
280
+
281
  def _cast_if_autocast_enabled(self, tensor: torch.Tensor, dtype: Optional[torch.dtype] = None) -> torch.Tensor:
282
  # NOTE: `is_autocast_enabled()` only checks for CUDA autocast, so we use the separate function
283
  # `is_autocast_cpu_enabled()` for CPU autocast.
 
288
  return tensor.to(dtype=dtype if dtype is not None else torch.get_autocast_cpu_dtype())
289
  else:
290
  return tensor
291
+
292
  def reset_parameters(self):
293
  if self.weight is not None:
294
  torch.nn.init.ones_(self.weight) # type: ignore
 
307
  self.__cache = cache
308
  # Warm up cache.
309
  self.get_rotary_embedding(
310
+ config.max_position_embeddings or config.max_sequence_length, _non_meta_init_device(config)
 
311
  )
312
 
313
  def get_rotary_embedding(self, seq_len: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
 
326
  return pos_sin[:, :, :seq_len, :], pos_cos[:, :, :seq_len, :]
327
 
328
  with torch.autocast(device.type, enabled=False):
329
+ dim = (
330
+ self.config.head_dim
331
+ if self.config.head_dim is not None
332
+ else self.config.d_model // self.config.n_heads
333
+ )
334
+ inv_freq = 1.0 / (
335
+ self.config.rope_theta ** (torch.arange(0, dim, 2, device=device, dtype=torch.float) / dim)
336
+ )
337
  seq = torch.arange(seq_len, device=device, dtype=torch.float)
338
  freqs = einsum("i , j -> i j", seq, inv_freq)
339
  if self.config.rope_impl == "cockatoo":
 
365
  return ((t * pos_cos) + (self.rotate_half(t) * pos_sin)).to(t.dtype)
366
 
367
  def forward(
368
+ self, q: torch.Tensor, k: torch.Tensor, position_ids: Optional[torch.Tensor] = None
369
  ) -> Tuple[torch.Tensor, torch.Tensor]:
370
  if self.config.rope_full_precision:
371
  q_, k_ = q.float(), k.float()
 
376
  batch_size = q_.shape[0]
377
  query_len, key_len = q_.shape[-2], k_.shape[-2] # could be different if layer_past not None
378
  if position_ids is not None:
379
+ freqs_cis_len = self.config.max_position_embeddings or self.config.max_sequence_length
380
  else:
381
  freqs_cis_len = key_len
382
  pos_sin, pos_cos = self.get_rotary_embedding(freqs_cis_len, q_.device)
 
384
  pos_cos = pos_cos.type_as(q_)
385
  if position_ids is not None:
386
  assert query_len == key_len, "Query and key lengths must be equal when using position IDs."
387
+ pos_sin = pos_sin[0, 0][position_ids].view((batch_size, 1, key_len, pos_sin.shape[-1]))
388
+ pos_cos = pos_cos[0, 0][position_ids].view((batch_size, 1, key_len, pos_cos.shape[-1]))
389
  q_ = self.apply_rotary_pos_emb(
390
  pos_sin[:, :, key_len - query_len : key_len, :],
391
  pos_cos[:, :, key_len - query_len : key_len, :],
 
478
 
479
 
480
  class MolmoAttention(nn.Module):
481
+ def __init__(self, config: MolmoConfig, cache: BufferCache):
482
  super().__init__()
483
  self.config = config
484
  self.__cache = cache
 
486
  self.k_norm: Optional[LayerNormBase] = None
487
  self.q_norm: Optional[LayerNormBase] = None
488
  self.hidden_size = (
489
+ config.mlp_hidden_size if config.mlp_hidden_size is not None else config.mlp_ratio * config.d_model
 
490
  )
491
 
492
  if config.attention_layer_norm:
 
515
  config.n_kv_heads * head_dim,
516
  )
517
  self.att_proj = nn.Linear(
518
+ config.d_model,
519
+ sum(self.fused_dims),
520
  bias=config.include_bias or config.qkv_bias,
521
+ device=config.init_device,
522
  )
523
+ self.attn_out = nn.Linear(input_dim, config.d_model, bias=config.include_bias, device=config.init_device)
524
+ self.attn_norm = RMSLayerNorm(config, size=config.d_model, eps=config.layer_norm_eps)
525
+
526
+ self.flash_attn_func = None
527
  if self.config.attention_type == AttentionType.flash:
528
  try:
529
  from flash_attn import flash_attn_func
530
+
531
  self.flash_attn_func = flash_attn_func
532
  except ModuleNotFoundError:
533
  pass
534
 
535
+ def attention(
536
+ self,
537
  q: torch.Tensor,
538
  k: torch.Tensor,
539
  v: torch.Tensor,
 
544
  use_cache: bool = False,
545
  ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
546
  B, T, C = q.size() # batch size, sequence length, d_model
547
+ dtype = k.dtype
548
 
549
  # Optionally apply layer norm to keys and queries.
550
  if self.q_norm is not None and self.k_norm is not None:
 
661
  is_causal=is_causal,
662
  )
663
 
664
+ def forward(self, x, attention_bias, position_ids, drop_mask, layer_past, use_cache):
665
  if not self.config.norm_after:
666
  atten_in = self.attn_norm(x)
667
  else:
 
673
  qkv.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
674
 
675
  q, k, v = qkv.split(self.fused_dims, dim=-1)
676
+
677
  # Get attention scores.
678
  att, cache = self.attention(
679
+ q,
680
+ k,
681
+ v,
682
  attention_bias,
683
  position_ids=position_ids,
684
  drop_mask=drop_mask,
685
  layer_past=layer_past,
686
+ use_cache=use_cache,
687
  )
688
+
689
  if self.config.norm_after:
690
  att = self.attn_norm(att)
691
+
692
  return att, cache
693
 
694
 
695
  class MolmoMLP(nn.Module):
696
+ def __init__(self, config: MolmoConfig):
697
  # Feed-forward input projection.
698
  super().__init__()
699
  self.config = config
700
  self.hidden_size = (
701
+ config.mlp_hidden_size if config.mlp_hidden_size is not None else config.mlp_ratio * config.d_model
 
702
  )
703
  self.act = SwiGLU(config)
704
  self.ff_proj = nn.Linear(
705
+ config.d_model, self.hidden_size, bias=config.include_bias, device=config.init_device
706
+ )
707
  self.ff_out = nn.Linear(
708
  int(self.act.output_multiplier * self.hidden_size),
709
  config.d_model,
710
  bias=config.include_bias,
711
  device=config.init_device,
712
  )
713
+ self.ff_norm = RMSLayerNorm(config, size=config.d_model, eps=config.layer_norm_eps)
714
+
715
  def forward(self, x):
716
  if not self.config.norm_after:
717
  x = self.ff_norm(x)
 
730
  """
731
  A base class for transformer block implementations.
732
  """
733
+
734
+ def __init__(self, layer_id: int, config: MolmoConfig, cache: BufferCache):
735
  super().__init__()
736
  self.self_attn = MolmoAttention(config, cache)
737
  self.mlp = MolmoMLP(config)
 
745
  assert config.d_model % config.n_heads == 0
746
 
747
  # Dropout.
748
+ self.dropout = Dropout(config.residual_dropout, mask_p=config.response_residual_dropout)
749
 
750
  def forward(
751
  self,
 
766
  """
767
 
768
  att, cache = self.self_attn(
769
+ x,
770
  attention_bias=attention_bias,
771
  position_ids=position_ids,
772
  drop_mask=drop_mask,
773
  layer_past=layer_past,
774
+ use_cache=use_cache,
775
  )
776
  x = x + self.dropout(att, drop_mask=drop_mask)
777
  og_x = x
 
801
  super().__init__()
802
  self.config = config
803
  self.use_bias = use_bias
804
+
805
  v_cfg = config.vision_backbone
806
  self.embed_dim = v_cfg.image_emb_dim
807
  self.num_heads = v_cfg.image_num_heads
 
841
  if v_cfg.attention_dropout > 0:
842
  self.attention_dropout = Dropout(v_cfg.attention_dropout, broadcast_dims=(0, 1))
843
  self.residual_dropout = Dropout(v_cfg.residual_dropout)
844
+
845
  def reset_parameters(self):
846
  nn.init.normal_(self.wq.weight, std=self.initializer_range)
847
  nn.init.normal_(self.wk.weight, std=self.initializer_range)
 
858
 
859
  def _merge_heads(self, hidden_states) -> torch.Tensor:
860
  return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
861
+
862
+ def forward(self, inputs_q: torch.Tensor, inputs_kv: Optional[torch.Tensor] = None) -> torch.Tensor:
863
  if inputs_kv is not None:
864
  inputs_k = inputs_kv
865
  inputs_v = inputs_kv
866
  else:
867
  inputs_k = inputs_q
868
  inputs_v = inputs_q
869
+
870
  xq, xk, xv = self.wq(inputs_q), self.wk(inputs_k), self.wv(inputs_v)
871
 
872
  xq = self._split_heads(xq, self.num_heads)
 
897
  xk.transpose(1, 2).contiguous(),
898
  xv.transpose(1, 2).contiguous(),
899
  is_causal=False,
900
+ dropout_p=self.config.vision_backbone.attention_dropout,
901
  ).transpose(1, 2)
902
  else:
903
  raise NotImplementedError(self.config.attention_type)
 
919
  output_layer: bool = True,
920
  mean_residual: bool = False,
921
  query: str = "mean",
922
+ is_vit_layer: Optional[bool] = True,
923
  ):
924
  super().__init__()
925
  self.config = config
 
929
  self.output_layer = output_layer
930
  self.mean_residual = mean_residual
931
  self.query = query
932
+
933
  v_cfg = config.vision_backbone
934
  input_dim = v_cfg.image_emb_dim
935
  self.embed_dim = v_cfg.image_emb_dim * factor
 
964
  if query == "vector":
965
  self.attention_query = nn.Parameter(
966
  torch.zeros(
967
+ 1,
968
+ self.num_key_value_heads * self.head_dim,
969
+ device=config.init_device,
970
  ),
971
  )
972
 
 
1005
  return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
1006
 
1007
  def forward(self, inputs_kv: torch.Tensor) -> torch.Tensor:
 
1008
  xk, xv = self.wk(inputs_kv), self.wv(inputs_kv)
1009
 
1010
  if self.query == "mean":
 
1073
  bias=True,
1074
  device=config.init_device,
1075
  )
1076
+
1077
  def reset_parameters(self):
1078
  v_cfg = self.config.vision_backbone
1079
  nn.init.trunc_normal_(self.w1.weight, std=math.sqrt(1 / v_cfg.image_emb_dim), a=-2.0, b=2.0)
1080
  nn.init.trunc_normal_(self.w2.weight, std=math.sqrt(1 / v_cfg.image_mlp_dim), a=-2.0, b=2.0)
1081
  nn.init.zeros_(self.w1.bias)
1082
  nn.init.zeros_(self.w2.bias)
1083
+
1084
  def forward(self, x: torch.Tensor) -> torch.Tensor:
1085
  x = self.w1(x)
1086
  x = self.act(x)
 
1091
  class MLP(nn.Module):
1092
  def __init__(self, config: MolmoConfig, input_dim: int, dropout: float = 0.0):
1093
  super().__init__()
1094
+ self.config = config
1095
  self.hidden_size = (
1096
  config.mlp_hidden_size if config.mlp_hidden_size is not None else config.mlp_ratio * config.d_model
1097
  )
 
1115
  bias=False,
1116
  device=config.init_device,
1117
  )
1118
+ # `MLP` assume the activation takes two inputs, so it must be a 'llama' version.
1119
  self.act = LlamaSwiGLU(config)
1120
  self.dropout = Dropout(dropout)
1121
+
1122
  def reset_parameters(self):
1123
  nn.init.normal_(self.w1.weight, std=self.initializer_range)
1124
  nn.init.normal_(self.w2.weight, std=self.initializer_range)
1125
  nn.init.normal_(self.w3.weight, std=self.initializer_range)
1126
+
1127
  def forward(self, x: torch.Tensor) -> torch.Tensor:
1128
  x = self.w2(self.act(self.w1(x), self.w3(x)))
1129
  x = self.dropout(x)
 
1134
  def __init__(self, submodule: nn.Module):
1135
  super().__init__()
1136
  self.submodule = submodule
1137
+
1138
  def reset_parameters(self):
1139
  self.submodule.reset_parameters()
1140
+
1141
  def forward(self, x: torch.Tensor) -> torch.Tensor:
1142
  return x + self.submodule(x)
1143
 
1144
 
1145
  class LayerNormFp32(nn.LayerNorm):
1146
+ """Subclass torch's LayerNorm to handle fp16 (by casting to float32 and back).
1147
+ Derived from https://github.com/mlfoundations/open_clip/blob/main/src/open_clip/transformer.py.
1148
+ """
1149
+
1150
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1151
+ orig_type = x.dtype
1152
+ if self.training:
1153
+ x = F.layer_norm(x.to(torch.float32), self.normalized_shape, self.weight, self.bias, self.eps)
1154
+ else:
1155
+ x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
1156
+ return x.to(orig_type)
1157
 
1158
 
1159
  class ResidualAttentionBlock(nn.Module):
 
1180
  self.feed_forward.reset_parameters()
1181
  self.attention_norm.reset_parameters()
1182
  self.ffn_norm.reset_parameters()
1183
+
1184
  def forward(self, x: torch.Tensor) -> torch.Tensor:
1185
  x = x + self.attention(self.attention_norm(x))
1186
  x = x + self.feed_forward(self.ffn_norm(x))
 
1193
  self.config = config
1194
 
1195
  v_cfg = config.vision_backbone
1196
+ self.resblocks = nn.ModuleList([ResidualAttentionBlock(config) for _ in range(v_cfg.image_num_layers)])
1197
+
1198
  def reset_parameters(self):
1199
  for r in self.resblocks:
1200
  r.reset_parameters()
 
1218
 
1219
  v_cfg = config.vision_backbone
1220
  # class embeddings and positional embeddings
1221
+ self.scale = v_cfg.image_emb_dim**-0.5
1222
  self.class_embedding = nn.Parameter(
1223
  torch.zeros(v_cfg.image_emb_dim, device=config.init_device),
1224
  )
 
1242
  )
1243
 
1244
  self.transformer = BlockCollection(config)
1245
+
1246
  def reset_parameters(self):
1247
  nn.init.normal_(self.class_embedding, std=self.scale)
1248
  nn.init.normal_(self.positional_embedding, std=self.scale)
1249
  nn.init.normal_(self.patch_embedding.weight, std=0.02)
1250
  self.pre_ln.reset_parameters()
1251
  self.transformer.reset_parameters()
1252
+
1253
  def add_pos_emb(self, x: torch.Tensor, patch_num: int) -> torch.Tensor:
1254
  cls_emb = self.positional_embedding[0:1]
1255
  pos_emb = self.positional_embedding[1:]
 
1257
  pos_emb = pos_emb.reshape(
1258
  (int(math.sqrt(pos_emb.shape[0])), int(math.sqrt(pos_emb.shape[0])), pos_emb.shape[1])
1259
  )
1260
+
1261
  (patch_num_0, patch_num_1) = patch_num
1262
 
1263
  if pos_emb.shape[0] != patch_num_0 or pos_emb.shape[1] != patch_num_1:
 
1265
  # antialias: default True in jax.image.resize
1266
  pos_emb = pos_emb.unsqueeze(0).permute(0, 3, 1, 2)
1267
  pos_emb = F.interpolate(
1268
+ pos_emb,
1269
+ size=(patch_num_0, patch_num_1),
1270
+ mode="bicubic",
1271
+ align_corners=False,
1272
+ antialias=True,
1273
  )
1274
  pos_emb = pos_emb.permute(0, 2, 3, 1).squeeze(0)
1275
 
 
1337
  input_dim = nlayers * config.vision_backbone.image_emb_dim
1338
  else:
1339
  raise NotImplementedError(f"Unknown image pooling 2D method: {config.image_pooling_2d}")
1340
+
1341
  self.input_dim = input_dim
1342
 
1343
  self.image_projector = MLP(config, input_dim)
 
1362
  self.image_projector.reset_parameters()
1363
 
1364
  @abstractmethod
1365
+ def forward(
1366
+ self, images: torch.Tensor, image_masks: torch.Tensor
1367
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
1368
  raise NotImplementedError
1369
+
1370
 
1371
  class MolmoPretrainedVisionBackbone(MolmoVisionBackbone):
1372
  def __init__(self, config: MolmoConfig):
 
1392
 
1393
  self.pad_embed = None
1394
  if config.image_padding_embed:
1395
+ image_dim = v_cfg.image_emb_dim * len(self.config.vit_layers)
1396
  if config.image_padding_embed in ["pad_embed", "regress"]:
1397
+ self.pad_embed = nn.Parameter(torch.zeros((image_dim,), device=config.init_device))
 
1398
  elif config.image_padding_embed == "pad_and_partial_pad":
1399
+ self.pad_embed = nn.Parameter(torch.zeros((2, image_dim), device=config.init_device))
 
1400
  else:
1401
  raise ValueError(config.image_padding_embed)
1402
 
 
1405
  if self.config.vit_load_path:
1406
  vit_load_path = Path(self.config.vit_load_path)
1407
  state_dict_path = resource_path(
1408
+ vit_load_path.parent,
1409
+ vit_load_path.name,
1410
  local_cache=vit_load_path.parent,
1411
  )
1412
  assert state_dict_path.is_file(), f"Model file {str(state_dict_path)} not found"
 
1424
  self.image_vit.reset_parameters()
1425
  if self.config.use_cls_feature:
1426
  nn.init.xavier_uniform_(self.cls_projector.weight)
1427
+
1428
  def encode_image(self, images: torch.Tensor) -> torch.Tensor:
1429
  """
1430
  : param images: (batch_size, num_crops, num_patch, n_pixels)
 
1452
  if self.num_prefix_tokens > 0:
1453
  cls_embed = image_features[:, 0]
1454
  image_features = image_features[:, 1:]
1455
+
1456
  image_features = image_features * mask
1457
  image_features = image_features.view(B, T, N, -1)
1458
 
1459
  cls_embed = cls_embed.view(B, T, -1) if cls_embed is not None else None
1460
 
1461
  return image_features, cls_embed
1462
+
1463
+ def forward(
1464
+ self, images: torch.Tensor, image_masks: torch.Tensor
1465
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
1466
  cfg = self.config
1467
 
1468
  # image_features: (batch_size, num_crops(=num_image), num_patch, nximage_emb_dim)
 
1478
  image_features = image_features + pad_embed * torch.unsqueeze(all_pad, -1)
1479
  elif cfg.image_padding_embed == "regress":
1480
  pad_embed = self.pad_embed[None, None, None, :]
1481
+ image_features = image_features + pad_embed * torch.unsqueeze(
1482
+ torch.maximum(image_masks, torch.zeros_like(image_masks)), -1
1483
+ )
1484
  elif cfg.image_padding_embed == "pad_and_partial_pad":
1485
  og_dtype = image_features.dtype
1486
  pad_embed = self.pad_embed[:, None, None, None, :]
1487
  all_pad = image_masks == 0
1488
+ partial_pad = torch.logical_and(image_masks < 1, torch.logical_not(all_pad)).to(
1489
+ dtype=torch.float32
1490
+ )
1491
  all_pad = all_pad.to(dtype=torch.float32)
1492
  image_features = image_features + pad_embed[0] * torch.unsqueeze(all_pad, -1)
1493
  image_features = image_features + pad_embed[1] * torch.unsqueeze(partial_pad, -1)
 
1498
  image_features = self.image_feature_dropout(image_features)
1499
  if cls_embed is not None:
1500
  cls_embed = self.image_feature_dropout(cls_embed)
1501
+
1502
  image_features = image_features.reshape(
1503
  (batch_size, num_image) + cfg.vision_backbone.image_num_patch + (-1,),
1504
  )
 
1509
  image_features,
1510
  (0, 0, 0, 1, 0, 1, 0, 0, 0, 0),
1511
  )
1512
+
1513
  # image pooling
1514
  image_features = einops.rearrange(
1515
  image_features,
1516
+ "b n (h dh) (w dw) c -> (b n h w) (dh dw) c",
1517
  dh=cfg.image_pooling_h,
1518
  dw=cfg.image_pooling_w,
1519
  )
 
1535
  image_features = module(image_features)
1536
  else:
1537
  image_features = self.image_projector(image_features)
1538
+
1539
  if self.config.use_cls_feature:
1540
  cls_embed = self.cls_projector(cls_embed)
1541
  if cfg.image_projector == ImageProjectType.mlpx2:
 
1543
  cls_embed = module(cls_embed)
1544
  else:
1545
  cls_embed = self.image_projector(cls_embed)
1546
+
1547
  # image_features: (batch_size, num_image, num_patch, d_model)
1548
  # cls_embed: (batch_size, num_image, d_model)
1549
  return image_features, cls_embed
 
1568
 
1569
 
1570
  class MolmoModel(MolmoPretrainedModel):
1571
+ def __init__(self, config: MolmoConfig, init_params: bool = True):
1572
  super().__init__(config)
1573
  self.config = config
1574
  self.__cache = BufferCache()
 
1601
  config.d_model,
1602
  device=config.init_device,
1603
  initializer_range=config.initializer_range,
1604
+ new_embed_initializer_range=config.new_embedding_init_range,
1605
  )
1606
  else:
1607
+ wte = nn.Embedding(
1608
  config.embedding_size or config.vocab_size, config.d_model, device=config.init_device
1609
  )
1610
 
 
1612
  dict(
1613
  wte=wte,
1614
  emb_drop=Dropout(config.embedding_dropout),
1615
+ ln_f=RMSLayerNorm(config, size=config.d_model, eps=config.layer_norm_eps),
1616
  )
1617
  )
1618
 
1619
+ layers = [MolmoDecoderLayer(i, config, self.__cache) for i in range(config.n_layers)]
1620
  self.transformer.update({"layers": nn.ModuleList(layers)})
1621
+
1622
  self.vision_backbone: Optional[MolmoVisionBackbone] = None
1623
  if config.vision_backbone is not None:
1624
  self.vision_backbone = MolmoVisionBackbone.build(config)
1625
 
1626
  if self.vision_backbone is not None:
1627
  self.vision_backbone.reset_with_pretrained_weights()
1628
+
1629
  @property
1630
  def device(self) -> torch.device:
1631
  device: torch.device = self.transformer.wte.weight.device # type: ignore
 
1634
  else:
1635
  return device
1636
 
 
1637
  def forward(
1638
  self,
1639
  input_ids: torch.LongTensor,
 
1694
  has_image = images is not None
1695
 
1696
  assert not (has_image and input_embeddings is not None), "Cannot provide both images and input embeddings."
1697
+ assert not (
1698
+ has_image and past_key_values is not None
1699
+ ), "Cached key and values should not be used with images."
1700
 
1701
  batch_size, seq_len = input_ids.size() if input_embeddings is None else input_embeddings.size()[:2]
1702
  if past_key_values is None:
 
1710
 
1711
  if self.config.use_position_ids and attention_mask is None:
1712
  attention_mask = input_ids != -1
1713
+
1714
  if subsegment_ids is not None:
1715
  assert not use_cache, "Subsegment_ids cannot be used with cache."
1716
  subsegment_mask = subsegment_ids.unsqueeze(2) <= subsegment_ids.unsqueeze(1)
1717
  attention_mask = (
1718
+ subsegment_mask.to(attention_mask.dtype)
1719
+ * attention_mask.unsqueeze(2)
1720
+ * attention_mask.unsqueeze(1)
1721
+ )
1722
  if position_ids is None:
1723
+ raise ValueError("Positioned ids must be given if using subsegment_ids")
1724
  else:
1725
  if self.config.use_position_ids and position_ids is None:
1726
  position_ids = torch.clamp(
 
1757
 
1758
  if self.config.use_cls_feature:
1759
  x = torch.cat([x[:, :1], cls_embed, x[:, 1:-num_image]], dim=1)
1760
+
1761
+ valid_images = torch.any((image_input_idx >= 0).view(batch_size, num_image, num_patch), dim=-1)
1762
  valid_images = valid_images.to(attention_mask.dtype)
1763
  attention_mask = torch.cat(
1764
  [attention_mask[:, :1], valid_images, attention_mask[:, 1:-num_image]],
 
1775
 
1776
  # normalized
1777
  if self.config.normalize_input_embeds:
1778
+ x = x * (self.config.d_model**0.5)
1779
 
1780
  # Transform the attention mask into what the blocks expect.
1781
  if attention_mask is not None:
1782
  # shape: (batch_size, 1, 1, seq_len)
1783
  if len(attention_mask.shape) == 2:
1784
+ attention_mask = attention_mask[:, : past_length + seq_len]
1785
  attention_mask = attention_mask.to(dtype=torch.float).view(batch_size, -1)[:, None, None, :]
1786
  else:
1787
  attention_mask = attention_mask.unsqueeze(1).to(dtype=torch.float)
 
1831
 
1832
  layer_past = None if past_key_values is None else past_key_values[block_idx]
1833
  # shape: (batch_size, seq_len, d_model)
1834
+ x, cache = layer(
1835
+ x,
1836
+ attention_bias=attention_bias,
1837
+ position_ids=position_ids,
1838
+ drop_mask=response_mask,
1839
+ layer_past=layer_past,
1840
+ use_cache=use_cache,
1841
+ )
1842
 
1843
  if attn_key_values is not None:
1844
  assert cache is not None
1845
  attn_key_values.append(cache)
1846
+
1847
  if images is not None and self.config.use_cls_feature:
1848
  assert num_image is not None
1849
  x = torch.cat(
1850
+ [x[:, :1], x[:, num_image + 1 :], torch.zeros_like(x[:, :num_image])],
1851
  dim=1,
1852
  )
1853
 
 
1855
  # shape: (batch_size, 1, d_model)
1856
  if append_last_valid_logits is not None:
1857
  last_valid_output = x[
1858
+ torch.arange(x.shape[0], device=x.device), append_last_valid_logits.to(x.device)
1859
+ ]
1860
  x = last_valid_output.unsqueeze(1)
1861
  else:
1862
  x = x[:, -1, :].unsqueeze(1)
 
1873
  return MolmoOutput(
1874
  last_hidden_states=x,
1875
  attn_key_values=attn_key_values,
1876
+ hidden_states=tuple(all_hidden_states) if output_hidden_states else None,
1877
+ )
 
1878
 
1879
 
1880
  class MolmoForCausalLM(PreTrainedModel):
1881
  """
1882
  Extremely barebones HF model wrapper.
1883
  """
1884
+
1885
  config_class = MolmoConfig
1886
  base_model_prefix = "model"
1887
  _no_split_modules = ["MolmoDecoderLayer"]
1888
 
1889
+ def __init__(self, config: MolmoConfig):
1890
  super().__init__(config)
1891
  # model_config = create_model_config_from_pretrained_config(config)
1892
  # Initialize model (always on CPU to start with so we don't run out of GPU memory).
 
1956
  output_hidden_states=output_hidden_states,
1957
  append_last_valid_logits=append_last_valid_logits,
1958
  )
1959
+
1960
  x = outputs.last_hidden_states
1961
  if self.config.weight_tying:
1962
  logits = F.linear(x, self.model.transformer.wte.weight, None) # type: ignore
 
1965
 
1966
  if self.config.scale_logits:
1967
  logits.mul_(1 / math.sqrt(self.config.d_model))
1968
+
1969
  if self.config.final_logit_softcapping is not None:
1970
  logits = logits / self.config.final_logit_softcapping
1971
  logits = torch.tanh(logits)
1972
  logits = logits * self.config.final_logit_softcapping
1973
+
1974
  if not last_logits_only and append_last_valid_logits is not None:
1975
  last_valid_logit = logits[
1976
+ torch.arange(logits.shape[0], device=logits.device), append_last_valid_logits
1977
+ ]
1978
  logits = torch.cat([logits[:, :-1], last_valid_logit[:, None]], dim=1)
1979
 
1980
  loss = None
 
1986
  labels.masked_fill_(~(loss_masks > 0), -100)
1987
  labels = labels.view(-1)
1988
  logits_for_loss = logits.to(torch.float32).view(-1, logits.size(-1))
1989
+ loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction="none")
1990
  loss = loss_fct(logits_for_loss, labels)
1991
  loss = loss.view(input_ids.shape[0], -1)
1992
  loss = loss * loss_masks
 
2048
  append_last_valid_logits: Optional[torch.Tensor] = None
2049
  if self.config.use_position_ids and attention_mask is None:
2050
  attention_mask = input_ids != -1
2051
+ position_ids = torch.clamp(torch.cumsum(attention_mask.to(torch.int32), dim=-1) - 1, min=0)
2052
  append_last_valid_logits = attention_mask.long().sum(dim=-1) - 1
2053
  attention_mask = torch.cat(
2054
  [attention_mask, attention_mask.new_ones((batch_size, max_new_tokens))],
 
2056
  )
2057
  if attention_mask is not None:
2058
  assert attention_mask.shape == (batch_size, mask_len)
2059
+
2060
  out = super().generate(
2061
  input_ids,
2062
  generation_config,
 
2070
  )
2071
 
2072
  return out
2073
+
2074
  def prepare_inputs_for_generation(
2075
  self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple]] = None, **kwargs
2076
  ):
 
2098
  model_inputs["image_masks"] = image_masks
2099
  model_inputs["image_input_idx"] = image_input_idx
2100
  model_inputs["append_last_valid_logits"] = append_last_valid_logits
2101
+ else:
2102
  model_inputs = {"input_ids": input_ids, "past_key_values": past_key_values}
2103
 
2104
  model_inputs.update(kwargs)
 
2218
  # Tie weights again if needed
2219
  self.tie_weights()
2220
 
2221
+ return model_embeds
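
Beyond the formatting cleanup, the commit inlines resource_path (previously imported from olmo.util) so the modeling file carries no olmo dependency, restores the ImageProjectType import, and drops the module-level AutoModelForCausalLM.register call. A minimal usage sketch of the inlined resource_path helper, assuming the cached_path package is installed and that the helper is imported from this modeling module; the folder, file name, and cache directory below are placeholders:

# Hedged sketch of the inlined resource_path helper (placeholder paths).
from pathlib import Path

# If /tmp/vit_cache/model.pt already exists, that local file is returned directly.
ckpt = resource_path(
    "https://example.org/checkpoints/vit",  # placeholder remote folder
    "model.pt",
    local_cache=Path("/tmp/vit_cache"),
)

# On a cache miss the helper falls back to cached_path("<folder>/<fname>") and
# returns the downloaded file's local Path.
print(ckpt)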