Upload modeling_molmo.py with huggingface_hub
modeling_molmo.py +7 -15
modeling_molmo.py
CHANGED
@@ -725,17 +725,6 @@ def _expand_token(token, batch_size: int):
     return token.view(1, 1, -1).expand(batch_size, -1, -1)
 
 
-class LayerNormFp32(nn.LayerNorm):
-    """Subclass torch's LayerNorm to handle fp16 (by casting to float32 and back).
-    Derived from https://github.com/mlfoundations/open_clip/blob/main/src/open_clip/transformer.py.
-    """
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        orig_type = x.dtype
-        x = F.layer_norm(x.to(torch.float32), self.normalized_shape, self.weight, self.bias, self.eps)
-        return x.to(orig_type)
-
-
 class ViTMLP(nn.Module):
     def __init__(self, config: FullMolmoConfig):
         super().__init__()
@@ -855,10 +844,9 @@ class VisionTransformer(nn.Module):
             device=config.init_device,
         )
 
-        self.pre_ln = LayerNormFp32(
+        self.pre_ln = nn.LayerNorm(
             v_cfg.image_emb_dim,
             eps=v_cfg.image_norm_eps,
-            device=config.init_device,
         )
 
         self.transformer = BlockCollection(config)
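For context on the two changes above: the deleted LayerNormFp32 upcast its input to float32 before calling F.layer_norm and cast the result back, while the plain nn.LayerNorm now used for pre_ln normalizes in whatever dtype the activations and parameters carry (or whatever autocast selects). A minimal sketch of the numerical difference between the two paths, assuming a CUDA device and fp16 activations; the shapes and the helper name are illustrative, not taken from the file:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(0)
dim = 1024                                         # illustrative embedding size
x = torch.randn(2, 16, dim, device="cuda", dtype=torch.float16)

ln = nn.LayerNorm(dim).to("cuda", torch.float16)   # plain LayerNorm, fp16 params

def layer_norm_fp32(x: torch.Tensor, ln: nn.LayerNorm) -> torch.Tensor:
    # Mirrors the removed LayerNormFp32.forward: upcast, normalize, cast back.
    # (The params are upcast here as well so this toy call stays single-dtype.)
    y = F.layer_norm(
        x.to(torch.float32),
        ln.normalized_shape,
        ln.weight.to(torch.float32),
        ln.bias.to(torch.float32),
        ln.eps,
    )
    return y.to(x.dtype)

print((ln(x) - layer_norm_fp32(x, ln)).abs().max())   # small half-precision gap
```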
@@ -1013,6 +1001,8 @@ class MultiHeadDotProductAttention(nn.Module):
             attn_output = torch.einsum("...hqk,...khd->...qhd", attn_weights.to(xv.dtype), xv)
 
         elif self.config.attention_type == "sdpa":
+            if self.config.float32_attention and not torch.is_autocast_enabled():
+                xv = xv.to(torch.float32)
             attn_output = F.scaled_dot_product_attention(
                 xq.transpose(1, 2).contiguous(),
                 xk.transpose(1, 2).contiguous(),
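The two added lines bring the SDPA branch in line with the manual einsum path above: when float32 attention is configured and autocast is not active, the values are upcast so they match the queries and keys (which that configuration presumably upcasts earlier in the method), since F.scaled_dot_product_attention generally expects q, k and v to share a dtype. A self-contained sketch of that dtype alignment; the tensor names and the flag are illustrative stand-ins for the fields used in the file:

```python
import torch
import torch.nn.functional as F

float32_attention = True                       # stand-in for config.float32_attention

xq = torch.randn(1, 8, 32, 64, dtype=torch.float32)   # queries/keys already in fp32
xk = torch.randn(1, 8, 32, 64, dtype=torch.float32)
xv = torch.randn(1, 8, 32, 64, dtype=torch.float16)   # values still in half precision

if float32_attention and not torch.is_autocast_enabled():
    xv = xv.to(torch.float32)                  # align with xq/xk before SDPA

out = F.scaled_dot_product_attention(xq, xk, xv)
print(out.dtype)                               # torch.float32
```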
@@ -1389,8 +1379,8 @@ class OLMoPretrainedVisionBackbone(OLMoVisionBackbone):
         elif cfg.image_padding_embed == "pad_and_partial_pad":
             pad_embed = self.pad_embed[:, None, None, None, :]
             all_pad = image_masks == 0
-            partial_pad = torch.logical_and(image_masks < 1, torch.logical_not(all_pad)).to(dtype=torch.float32)
-            all_pad = all_pad.to(dtype=torch.float32)
+            partial_pad = torch.logical_and(image_masks < 1, torch.logical_not(all_pad)).to(dtype=image_features.dtype)
+            all_pad = all_pad.to(dtype=image_features.dtype)
             image_features = image_features + pad_embed[0] * torch.unsqueeze(all_pad, -1)
             image_features = image_features + pad_embed[1] * torch.unsqueeze(partial_pad, -1)
         else:
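Casting the pad masks to image_features.dtype rather than float32 keeps the image_features updates that follow in the features' own precision; with a float32 mask, PyTorch's type promotion would silently upcast the whole feature tensor. A small sketch of the promotion behaviour being avoided, with illustrative shapes and bfloat16 features:

```python
import torch

image_features = torch.randn(2, 5, 144, 1024, dtype=torch.bfloat16)
pad_embed = torch.randn(1024, dtype=torch.bfloat16)   # one of the two pad embeddings
image_masks = torch.rand(2, 5, 144)                   # per-patch "valid" fraction
all_pad = image_masks == 0

# Previous cast: a float32 mask promotes the whole sum to float32.
old = image_features + pad_embed * all_pad.to(torch.float32).unsqueeze(-1)
print(old.dtype)   # torch.float32

# New cast: matching the features' dtype keeps the result in bfloat16.
new = image_features + pad_embed * all_pad.to(image_features.dtype).unsqueeze(-1)
print(new.dtype)   # torch.bfloat16
```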
@@ -1769,6 +1759,7 @@ class Molmo(nn.Module):
         for block_group in self.transformer.block_groups:
             block_group.reset_parameters()
 
+
     def forward(
         self,
         input_ids: torch.LongTensor,
@@ -2070,6 +2061,7 @@ class MolmoForCausalLM(PreTrainedModel):
         else:
             self.model = model
 
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,