chrisc36 committed
Commit 95eb07f
1 Parent(s): caeff8c

Add files using upload-large-folder tool

config.json CHANGED
@@ -1,11 +1,11 @@
 {
-  "_name_or_path": "/data/chris/hf/7b-v3",
   "architectures": [
-    "MOLMoForCausalLM"
+    "MolmoForCausalLM"
   ],
+  "attention_layer_norm": false,
   "auto_map": {
     "AutoConfig": "config_molmo.MolmoConfig",
-    "AutoModelForCausalLM": "modeling_molmo.MOLMoForCausalLM"
+    "AutoModelForCausalLM": "modeling_molmo.MolmoForCausalLM"
   },
   "clip_qkv": null,
   "embedding_size": 152064,
@@ -13,8 +13,10 @@
   "initializer_range": 0.02,
   "intermediate_size": 37888,
   "layer_norm_eps": 1e-06,
+  "layer_norm_type": "rms",
   "max_position_embeddings": 4096,
   "model_type": "molmo",
+  "norm_after": false,
   "num_attention_heads": 28,
   "num_hidden_layers": 28,
   "num_key_value_heads": 4,
@@ -27,4 +29,4 @@
   "use_position_ids": true,
   "vocab_size": 152064,
   "weight_tying": false
-}
+}
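
The new config adds `attention_layer_norm`, `layer_norm_type`, and `norm_after`, and renames the registered class to `MolmoForCausalLM`. A minimal sketch of checking these fields through the `auto_map` remote-code path (the repo id below is a placeholder, not taken from this commit):

from transformers import AutoConfig

# Placeholder repo id; substitute the actual Hugging Face repo this commit belongs to.
config = AutoConfig.from_pretrained("org/molmo-7b", trust_remote_code=True)
print(config.architectures)         # ["MolmoForCausalLM"]
print(config.layer_norm_type)       # "rms"
print(config.norm_after)            # False
print(config.attention_layer_norm)  # False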
config_molmo.py CHANGED
@@ -26,6 +26,9 @@ class MolmoConfig(PretrainedConfig):
         weight_tying: bool = False,
         use_position_ids: bool=True,
         tie_word_embeddings: bool=True,
+        attention_layer_norm: bool=False,
+        norm_after: bool = False,
+        layer_norm_type: str="rms",
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -38,18 +41,16 @@ class MolmoConfig(PretrainedConfig):
         self.layer_norm_eps = layer_norm_eps
         self.weight_tying = weight_tying
         self.use_position_ids = use_position_ids
-
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
+        self.attention_layer_norm = attention_layer_norm
         self.num_key_value_heads = num_key_value_heads
         self.initializer_range = initializer_range
         self.use_cache = use_cache
         self.rope_theta = rope_theta
         self.clip_qkv = clip_qkv
         self.qkv_bias = qkv_bias
+        self.norm_after = norm_after
         self.tie_word_embeddings = tie_word_embeddings
+        self.layer_norm_type = layer_norm_type

         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
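
The constructor now accepts the three new arguments and stores them on the config. A minimal sketch of passing them explicitly, assuming the remaining constructor arguments keep their defaults (the values here simply mirror the updated config.json above):

from config_molmo import MolmoConfig

# New arguments added in this commit; values mirror the updated config.json.
config = MolmoConfig(
    attention_layer_norm=False,
    norm_after=False,
    layer_norm_type="rms",
)
assert config.layer_norm_type == "rms"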
model-00007-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:225404ed62c82967fdbaf7a17a8024d0b1af37c55f7469b2196b5c7943c93955
-size 3799830480
+oid sha256:2c84ff3f7adcfdf9eec4247291ca1fcad02cf7005c84801f31223711df54846a
+size 3799846968
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
 {
   "metadata": {
-    "total_size": 32084084736
+    "total_size": 32084101120
   },
   "weight_map": {
     "model.transformer.blocks.0.att_proj.bias": "model-00001-of-00007.safetensors",
@@ -586,6 +586,7 @@
     "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
     "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
     "model.vision_backbone.image_vit.transformer.resblocks.9.ffn_norm.bias": "model-00007-of-00007.safetensors",
-    "model.vision_backbone.image_vit.transformer.resblocks.9.ffn_norm.weight": "model-00007-of-00007.safetensors"
+    "model.vision_backbone.image_vit.transformer.resblocks.9.ffn_norm.weight": "model-00007-of-00007.safetensors",
+    "model.vision_backbone.pad_embed": "model-00007-of-00007.safetensors"
   }
 }
modeling_molmo.py CHANGED
@@ -77,7 +77,7 @@ def ensure_finite_(x: torch.Tensor, check_neg_inf: bool = True, check_pos_inf: b
     x.masked_fill_(x == float("inf"), torch.finfo(x.dtype).max)


-class OLMoConfigurationError(Exception):
+class MolmoConfigurationError(Exception):
     pass


@@ -189,7 +189,7 @@ class RotaryEmbedding(nn.Module):
         return q_.type_as(q), k_.type_as(k)


-class OLMoBlock(nn.Module):
+class MolmoBlock(nn.Module):
     """
     A base class for transformer block implementations.
     """
@@ -420,17 +420,17 @@ class OLMoBlock(nn.Module):
     @classmethod
     def build(cls, layer_id: int, config: MolmoConfig, cache: BufferCache):
         if config.block_type == "sequential":
-            return OLMoSequentialBlock(layer_id, config, cache)
+            return MolmoSequentialBlock(layer_id, config, cache)
         elif config.block_type == "llama":
             return OLMoLlamaBlock(layer_id, config, cache)
         else:
             raise NotImplementedError(f"Unknown block type: '{config.block_type}'")


-class OLMoLlamaBlock(OLMoBlock):
+class OLMoLlamaBlock(MolmoBlock):
     """
     This is a transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
-    (plus another skip connection). This block is similar to `OLMoSequentialBlock`
+    (plus another skip connection). This block is similar to `MolmoSequentialBlock`
     but some operations have slightly different implementations to imitate the
     behavior of Llama.
     """
@@ -598,7 +598,7 @@ class OLMoLlamaBlock(OLMoBlock):
         return x, cache


-class OLMoSequentialBlock(OLMoBlock):
+class MolmoSequentialBlock(MolmoBlock):
     """
     This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
     (plus another skip connection).
@@ -825,7 +825,6 @@ class VisionBackboneConfig:
 class FullMolmoConfig:
     d_model: int = 768
     n_heads: int = 12
-    head_dim: int = 64
     n_kv_heads: Optional[int] = None
     qkv_bias: bool = False
     clip_qkv: Optional[float] = None
@@ -908,7 +907,7 @@ class FullMolmoConfig:
         if self.n_kv_heads == n_kv_heads_should_be:
             return n_kv_heads_should_be
         else:
-            raise OLMoConfigurationError(
+            raise MolmoConfigurationError(
                 "You can't set `multi_query_attention` and `n_kv_heads` at the same time."
             )

@@ -1897,7 +1896,7 @@ class LayerNorm(LayerNormBase):
         return F.layer_norm(x, self.normalized_shape, weight=self.weight, bias=self.bias, eps=self.eps)


-class MOLMo(nn.Module):
+class Molmo(nn.Module):
     def __init__(self, config: FullMolmoConfig, init_params: bool = True):
         super().__init__()
         self.config = config
@@ -1906,7 +1905,7 @@ class MOLMo(nn.Module):
         # Validate config.
         if self.config.embedding_size is not None and self.config.embedding_size != self.config.vocab_size:
             if self.config.embedding_size < self.config.vocab_size:
-                raise OLMoConfigurationError("embedding size should be at least as big as vocab size")
+                raise MolmoConfigurationError("embedding size should be at least as big as vocab size")
             elif self.config.embedding_size % 128 != 0:
                 import warnings

@@ -1939,7 +1938,7 @@ class MOLMo(nn.Module):
             )
         )

-        blocks = [OLMoBlock.build(i, config, self.__cache) for i in range(config.n_layers)]
+        blocks = [MolmoBlock.build(i, config, self.__cache) for i in range(config.n_layers)]
         if self.config.block_group_size > 1:
             raise NotImplementedError()
         else:
@@ -2018,16 +2017,20 @@ class MOLMo(nn.Module):
             which input IDs are masked. A `1` value in the mask means that
             the corresponding input ID should *not* be ignored. A `0` means
             that the corresponding input ID is masked.
+
             This has the same meaning as the `attention_mask` in HuggingFace's `transformers`
             library.
         :param attention_bias: A tensor of shape `(batch_size, 1, seq_len, seq_len)`,
             `(1, 1, seq_len, seq_len)`, or `(seq_len, seq_len)`. This is used
             to introduce causal or other biases.
+
             If the tensor is a bool or byte tensor, a `True` or `1` at `attention_bias[:, :, i, j]`
             indicates that the i-th element in the sequence is allowed to attend to the j-th
             element in the sequence.
+
             If the tensor is a float tensor, it will just be added to the attention
             scores before the softmax.
+
             The default is causal, which corresponds to a lower-diagonal byte matrix of ones.
         :param response_mask: A tensor of shape `(batch_size, seq_len)` that indicates
             the response mask. A `1` value in the mask means that the corresponding token
@@ -2258,20 +2261,24 @@ class MOLMo(nn.Module):
         return ModelOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None)  # type: ignore[arg-type]


-class MOLMoForCausalLM(PreTrainedModel):
+class MolmoForCausalLM(PreTrainedModel):
     config_class = MolmoConfig
     base_model_prefix = "model"
-    _no_split_modules = ["OLMoBlock"]
+    _no_split_modules = ["MolmoBlock"]

-    def __init__(self, config: MolmoConfig, model: Optional[MOLMo] = None, init_params: bool = False):
+    def __init__(self, config: MolmoConfig, model: Optional[Molmo] = None, init_params: bool = False):
         super().__init__(config)

         if not model:
             full_config = FullMolmoConfig(
+                attention_layer_norm=config.attention_layer_norm,
+                image_padding_embed="pad_and_partial_pad",
+                image_pooling_2d="attention-meanq",
                 rope_impl="llama",
                 vocab_size=config.vocab_size,
                 max_sequence_length=config.max_position_embeddings,
                 qkv_bias=config.qkv_bias,
+                norm_after=config.norm_after,
                 embedding_size=config.embedding_size,
                 attention_type="sdpa",
                 embedding_dropout=0,
@@ -2287,9 +2294,9 @@ class MOLMoForCausalLM(PreTrainedModel):
                 additional_vocab_size=128,
                 n_heads=config.num_attention_heads,
                 n_kv_heads=config.num_key_value_heads,
-                rope_theta=1000000.0,
-                layer_norm_eps=1e-6,
-                layer_norm_type="rms",
+                rope_theta=config.rope_theta,
+                layer_norm_eps=config.layer_norm_eps,
+                layer_norm_type=config.layer_norm_type,
                 pad_tokenizer=True,
                 vit_layers=[-2, -9],
                 vision_backbone=VisionBackboneConfig(
@@ -2312,7 +2319,7 @@ class MOLMoForCausalLM(PreTrainedModel):
                     initializer_range=0.02,
                 )
             )
-            self.model = MOLMo(full_config, init_params=init_params)
+            self.model = Molmo(full_config, init_params=init_params)
         else:
             self.model = model

@@ -2345,7 +2352,7 @@ class MOLMoForCausalLM(PreTrainedModel):
             use_cache = self.config.use_cache

         if output_attentions:
-            raise ValueError("output_attentions is not yet supported in OLMo")
+            raise ValueError("output_attentions is not yet supported in Molmo")

         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

@@ -2524,16 +2531,6 @@ class MOLMoForCausalLM(PreTrainedModel):
         model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens
         return model_kwargs

-    # TODO: these are required to make the implementation complete.
-    # def resize_position_embeddings(self, new_num_position_embeddings: int):
-    #     pass
-    #
-    # def get_position_embeddings(self) -> Union[nn.Embedding, Tuple[nn.Embedding]]:
-    #     pass
-    #
-    # def _reorder_cache(self, past_key_values, beam_idx):
-    #     pass
-
     def get_input_embeddings(self) -> torch.nn.Module:
         return self.model.transformer.wte

@@ -2555,11 +2552,13 @@ class MOLMoForCausalLM(PreTrainedModel):
     def tie_weights(self):
         """
         This function is intentionally left as a no-op.
+
         Weight tying is handled as follows:
         - When the model is initialized, the `ff_out` layer is conditionally defined based on the `weight_tying` configuration.
           See: `if not config.weight_tying: self.transformer.update(...)` in `olmo/model.py`.
         - When computing logits, the `wte` weights are used directly if `weight_tying` is enabled.
           See: `if self.config.weight_tying: logits = F.linear(x, self.transformer.wte.weight, None)` in the `forward` method.
+
         Therefore, there is no need to explicitly tie the weights in this function.
         """
         pass
@@ -2569,7 +2568,9 @@ class MOLMoForCausalLM(PreTrainedModel):
     ) -> torch.nn.Embedding:
         """
         Resizes input token embeddings matrix of the model if `new_num_tokens != config.embedding_size`.
+
         Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
+
         Arguments:
             new_num_tokens (`int`, *optional*):
                 The new number of tokens in the embedding matrix. Increasing the size will add newly initialized
@@ -2578,12 +2579,15 @@ class MOLMoForCausalLM(PreTrainedModel):
             pad_to_multiple_of (`int`, *optional*):
                 If set will pad the embedding matrix to a multiple of the provided value. If `new_num_tokens` is set to
                 `None` will just pad the embedding to a multiple of `pad_to_multiple_of`.
+
                 This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                 `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
                 details about this, or help on choosing the correct value for resizing, refer to this guide:
                 https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
+
         Return:
             `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
+
         Note:
             This method differs from the base class implementation by resizing the `embedding_size` attribute of the
             model configuration instead of the `vocab_size`. It also includes a warning if the resized `embedding_size`
@@ -2614,4 +2618,4 @@ class MOLMoForCausalLM(PreTrainedModel):


 # Always register for multi-modal features
-AutoModelForCausalLM.register(MolmoConfig, MOLMoForCausalLM)
+AutoModelForCausalLM.register(MolmoConfig, MolmoForCausalLM)
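
With the classes renamed to `Molmo`/`MolmoForCausalLM` and registered against `AutoModelForCausalLM`, the checkpoint loads through the usual remote-code path. A minimal sketch (the repo id is a placeholder and the dtype is an assumption, neither is specified by this diff):

import torch
from transformers import AutoModelForCausalLM

# Placeholder repo id; trust_remote_code=True makes transformers use the
# config_molmo.py / modeling_molmo.py shipped in the repository.
model = AutoModelForCausalLM.from_pretrained(
    "org/molmo-7b",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,  # assumed dtype; pick what fits your hardware
)
print(type(model).__name__)  # MolmoForCausalLM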
preprocessing_molmo.py CHANGED
@@ -2,9 +2,7 @@
 Processor class for Molmo.
 """

-from typing import List, Union, Optional
-
-from transformers.utils.constants import OPENAI_CLIP_STD, OPENAI_CLIP_MEAN
+from typing import Optional

 try:
     from typing import Unpack