# MolmoE-1B-0924 / constants.py
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
DEFAULT_IM_COL_TOKEN = "<im_col>"
IMAGE_PROMPT = "<|image|>"
EXTRA_TOKENS = (
    DEFAULT_IM_START_TOKEN,
    DEFAULT_IM_END_TOKEN,
    DEFAULT_IMAGE_PATCH_TOKEN,
    DEFAULT_IM_COL_TOKEN,
    IMAGE_PROMPT,
)
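# Illustrative sketch (not part of the original file): the image tokens above
# are only useful once the tokenizer knows about them. Assuming a Hugging Face
# tokenizer, registration might look like this.
def register_extra_tokens(tokenizer):
    """Add the image special tokens to a Hugging Face tokenizer so the
    vision placeholders are encoded as single, atomic tokens."""
    tokenizer.add_special_tokens(
        {"additional_special_tokens": list(EXTRA_TOKENS)}
    )
    return tokenizer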
VIT_STANDARD_CONFIGS = {
"dinov2-large": {
"image_emb_dim": 1024,
"image_mlp_dim": 4096,
'image_patch_size': 14,
'image_pos_patch_size': 14,
'image_num_layers': 24,
'image_num_heads': 16,
'image_num_key_value_heads': 16,
'image_head_dim': 64,
'image_mlp_activations': 'gelu',
'image_default_input_size': (224, 224),
'image_num_pos': 257,
'image_norm_eps': 1e-6,
"image_model_type": "dino"
},
"SigLIP-So400m-14-384": {
"image_emb_dim": 1152,
'image_num_layers': 27,
"image_mlp_dim": 4304,
'image_patch_size': 14,
'image_pos_patch_size': 14,
'image_num_heads': 16,
'image_num_key_value_heads': 16,
'image_head_dim': 72,
'image_mlp_activations': 'gelu',
# Although the name says "384", that appears to be an error on the
# author's part; the model actually only handles 378x378 inputs
'image_default_input_size': (378, 378),
'image_num_pos': 729,  # note: no CLS token
'image_norm_eps': 1e-6,
"image_model_type": "siglip",
"resize_mode": "siglip"
},
"DFN5B-CLIP-ViT-H-14-378": {
"image_emb_dim": 1280,
'image_patch_size': 14,
'image_pos_patch_size': 14,
'image_num_layers': 32,
'image_num_heads': 16,
'image_num_key_value_heads': 16,
'image_head_dim': 80,
'image_mlp_dim': 5120,
'image_dropout_rate': 0.0,
'image_mlp_activations': 'quick_gelu',
'image_default_input_size': (378, 378),
'image_num_pos': 730,
'image_norm_eps': 1e-5,
"image_model_type": "openai",
"resize_mode": "no_aspect_ratio"
},
'ViT-L/14-336': {
'image_patch_size': 14,
'image_pos_patch_size': 14,
'image_emb_dim': 1024,
'image_num_heads': 16,
# The original has 24 layers; as with ViT-L/14 below, the last layer is presumably unused
'image_num_layers': 23,
'image_head_dim': 64,
'image_mlp_dim': 4096,
'image_mlp_activations': 'quick_gelu',
'image_dropout_rate': 0.0,
'image_num_pos': 577,
'image_default_input_size': (336, 336),
'image_norm_eps': 1e-5,
'image_num_key_value_heads': 16,
"image_model_type": "openai"
},
'EVA02-L-14-336': {
'image_patch_size': 14,
'image_pos_patch_size': 14,
'image_emb_dim': 1024,
'image_num_heads': 16,
'image_num_layers': 24,
'image_head_dim': 64,
'image_mlp_dim': 2730,
'image_mlp_activations': 'silu',
'image_dropout_rate': 0.0,
'image_num_pos': 577,
'image_default_input_size': (336, 336),
'image_norm_eps': 1e-6,
'image_num_key_value_heads': 16,
"image_model_type": "eva"
},
'ViT-L/14': {
'image_patch_size': 14,
'image_pos_patch_size': 14,
'image_emb_dim': 1024,
'image_num_heads': 16,
# Note the original model has 24 layers, but we don't use the last layer
'image_num_layers': 23,
'image_head_dim': 64,
'image_mlp_dim': 4096,
'image_mlp_activations': 'quick_gelu',
'image_dropout_rate': 0.0,
'image_num_pos': 257,
'image_default_input_size': (224, 224),
'image_norm_eps': 1e-5,
'image_num_key_value_heads': 16,
"image_model_type": "openai"
},
'debug': {
'image_patch_size': 14,
'image_pos_patch_size': 14,
'image_emb_dim': 1024,
'image_num_heads': 16,
'image_num_layers': 1,
'image_head_dim': 64,
'image_mlp_dim': 1024,
'image_mlp_activations': 'quick_gelu',
'image_dropout_rate': 0.0,
'image_num_pos': 577,
'image_default_input_size': (336, 336),
'image_norm_eps': 1e-5,
'image_num_key_value_heads': 16,
"image_model_type": "openai"
}
}
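# Illustrative consistency check (not part of the original file): for every
# ViT config above, `image_num_pos` should equal the patch-grid area, plus one
# extra position for models that prepend a CLS token (SigLIP has none, which
# is what the "note: no CLS token" comment above refers to).
def _expected_num_pos(cfg):
    h, w = cfg["image_default_input_size"]
    patch = cfg["image_patch_size"]
    grid = (h // patch) * (w // patch)  # e.g. SigLIP: (378 // 14) ** 2 == 729
    has_cls_token = cfg["image_model_type"] != "siglip"
    return grid + int(has_cls_token)

if __debug__:
    for _name, _cfg in VIT_STANDARD_CONFIGS.items():
        assert _cfg["image_num_pos"] == _expected_num_pos(_cfg), _name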
OPEN_LLM_STANDARD_CONFIGS = {
"qwen1.5_7b": {
'vocab_size': 151936,
'hidden_size': 4096,
'intermediate_size': 11008,
'num_hidden_layers': 32,
'num_attention_heads': 32,
'num_key_value_heads': 32,
'max_sequence_length': 2048,
'max_position_embeddings': 32768,
'rope_theta': 1000000.0,
'initializer_range': 0.02,
'rms_norm_eps': 1e-6,
"qkv_bias": True,
'tie_word_embeddings': False,
'hidden_act': 'silu',
'norm_module': 'RMSNorm',
"tokenizer": "hf-Qwen/Qwen1.5-7B",
},
"qwen1.5_14b": {
'vocab_size': 152064,
'hidden_size': 5120,
'intermediate_size': 13696,
'num_hidden_layers': 40,
'num_attention_heads': 40,
'num_key_value_heads': 40,
'max_sequence_length': 2048,
'max_position_embeddings': 32768,
'rope_theta': 1000000.0,
'initializer_range': 0.02,
'rms_norm_eps': 1e-6,
"qkv_bias": True,
'tie_word_embeddings': False,
'hidden_act': 'silu',
'norm_module': 'RMSNorm',
"tokenizer": "hf-Qwen/Qwen1.5-14B",
},
"qwen1.5_32b": {
"vocab_size": 152064,
"hidden_size": 5120,
"intermediate_size": 27392,
"num_hidden_layers": 64,
"num_attention_heads": 40,
"num_key_value_heads": 8,
'max_sequence_length': 2048,
'max_position_embeddings': 32768,
"rope_theta": 1000000.0,
'initializer_range': 0.02,
"rms_norm_eps": 1e-6,
"qkv_bias": True,
"tie_word_embeddings": False,
'hidden_act': 'silu',
'norm_module': 'RMSNorm',
"tokenizer": "hf-Qwen/Qwen1.5-32B",
},
'llama_7b': {
'vocab_size': 32000,
'hidden_size': 4096,
'intermediate_size': 11008,
'num_hidden_layers': 32,
'num_attention_heads': 32,
'num_key_value_heads': 32,
'max_sequence_length': 2048,
'max_position_embeddings': 8192,
'rope_theta': 10000.0,
'initializer_range': 0.02,
'rms_norm_eps': 1e-5,
'tie_word_embeddings': False,
'hidden_act': 'silu',
'norm_module': 'RMSNorm',
"tokenizer": "llama"
},
'yi_6b': {
'vocab_size': 64000,
'hidden_size': 4096,
'intermediate_size': 11008,
'num_hidden_layers': 32,
'num_attention_heads': 32,
'num_key_value_heads': 4,
'max_sequence_length': 4096,
'max_position_embeddings': 4096,
'rope_theta': 5000000.0,
'initializer_range': 0.02,
'rms_norm_eps': 1e-5,
'tie_word_embeddings': False,
'hidden_act': 'silu',
'norm_module': 'RMSNorm',
"tokenizer": "yi"
},
'yi_9b': {
'vocab_size': 64000,
'hidden_size': 4096,
'intermediate_size': 11008,
'num_hidden_layers': 48,
'num_attention_heads': 32,
'num_key_value_heads': 4,
'max_sequence_length': 4096,
'max_position_embeddings': 4096,
'rope_theta': 10000.0,
'initializer_range': 0.02,
'rms_norm_eps': 1e-6,
'tie_word_embeddings': False,
'hidden_act': 'silu',
'norm_module': 'RMSNorm',
"tokenizer": "yi"
},
'yi_34b': {
'vocab_size': 64000,
'hidden_size': 7168,
'intermediate_size': 20480,
'num_hidden_layers': 60,
'num_attention_heads': 56,
'num_key_value_heads': 8,
'max_sequence_length': 4096,
'max_position_embeddings': 4096,
'rope_theta': 5000000.0,
'initializer_range': 0.02,
'rms_norm_eps': 1e-5,
'tie_word_embeddings': False,
'hidden_act': 'silu',
'norm_module': 'RMSNorm',
"tokenizer": "yi"
},
"olmo_1b": {
'vocab_size': 50304,
'hidden_size': 2048,
'intermediate_size': 8192,
'num_hidden_layers': 16,
'num_attention_heads': 16,
'num_key_value_heads': 16,
'max_sequence_length': 4096,
'max_position_embeddings': 32768,
'rope_theta': 10000.0,
'initializer_range': 0.02,
'rms_norm_eps': 1e-5,
'tie_word_embeddings': True,
'hidden_act': 'silu',
'norm_module': 'OlmoLayerNorm',
"tokenizer": "hf-allenai/OLMo-1B"
},
"olmo_7b": {
'vocab_size': 50304,
'hidden_size': 4096,
'intermediate_size': 22016 // 2,  # OLMo reports the fused SwiGLU width (22016); per-branch size is 11008
'num_hidden_layers': 32,
'num_attention_heads': 32,
'num_key_value_heads': 32,
'max_sequence_length': 4096,
'max_position_embeddings': 32768,
'rope_theta': 10000.0,
'initializer_range': 0.02,
'rms_norm_eps': 1e-5,
'tie_word_embeddings': False,
'hidden_act': 'silu',
'norm_module': 'OlmoLayerNorm',
"tokenizer": "hf-allenai/OLMo-7B",
},
"olmo_1.7_7b": {
'vocab_size': 50304,
'hidden_size': 4096,
'intermediate_size': 22016 // 2,  # fused SwiGLU width halved, as above
'num_hidden_layers': 32,
'num_attention_heads': 32,
'num_key_value_heads': 32,
'max_sequence_length': 4096,
'max_position_embeddings': 32768,
'rope_theta': 10000.0,
'initializer_range': 0.02,
'rms_norm_eps': 1e-5,
'tie_word_embeddings': False,
'hidden_act': 'silu',
"qkv_clip": 8,
'norm_module': 'OlmoLayerNorm',
"tokenizer": "hf-allenai/OLMo-1.7-7B",
},
'mistral_7b': {
'vocab_size': 32000,
'hidden_size': 4096,
'intermediate_size': 14336,
'num_hidden_layers': 32,
'num_attention_heads': 32,
'num_key_value_heads': 8,
'max_sequence_length': 4096,
'max_position_embeddings': 32768,
'rope_theta': 10000.0,
'initializer_range': 0.02,
'rms_norm_eps': 1e-5,
'tie_word_embeddings': False,
'hidden_act': 'silu',
'norm_module': 'RMSNorm',
"tokenizer": "mistral"
},
'mistral0.3_7b': {
'vocab_size': 32768,
'hidden_size': 4096,
'intermediate_size': 14336,
'num_hidden_layers': 32,
'num_attention_heads': 32,
'num_key_value_heads': 8,
'max_sequence_length': 4096,
'max_position_embeddings': 32768,
'rope_theta': 1000000.0,
'initializer_range': 0.02,
'rms_norm_eps': 1e-5,
'tie_word_embeddings': False,
'hidden_act': 'silu',
'norm_module': 'RMSNorm',
"tokenizer": "mistral0.3"
},
"mistral0.2_22b": {
'vocab_size': 32000,
'hidden_size': 6144,
'intermediate_size': 16384,
'num_hidden_layers': 56,
'num_attention_heads': 48,
'num_key_value_heads': 8,
'max_sequence_length': 4096,
'max_position_embeddings': 32768,
'rope_theta': 1000000.0,
'initializer_range': 0.02,
'rms_norm_eps': 1e-5,
'tie_word_embeddings': False,
'hidden_act': 'silu',
'norm_module': 'RMSNorm',
"tokenizer": "mistral"
},
'llama_13b': {
'vocab_size': 32000,
'hidden_size': 5120,
'intermediate_size': 13824,
'num_hidden_layers': 40,
'num_attention_heads': 40,
'num_key_value_heads': 40,
'max_sequence_length': 2048,
'max_position_embeddings': 8192,
'initializer_range': 0.02,
'rms_norm_eps': 1e-5,
'tie_word_embeddings': False,
'hidden_act': 'silu',
"norm_module": 'RMSNorm',
'rope_theta': 10000.0,
"tokenizer": "llama"
},
'llama_70b': {
'vocab_size': 32000,
'hidden_size': 8192,
'intermediate_size': 28672,
'num_hidden_layers': 80,
'num_attention_heads': 64,
'num_key_value_heads': 8,
'max_sequence_length': 8192,
'max_position_embeddings': 8192,
'rope_theta': 10000.0,
'initializer_range': 0.02,
'rms_norm_eps': 1e-5,
'tie_word_embeddings': False,
'hidden_act': 'silu',
"tokenizer": "llama"
},
'llama_70bflash': {
'vocab_size': 32000,
'hidden_size': 8192,
'intermediate_size': 28672,
'num_hidden_layers': 80,
'num_attention_heads': 64,
'num_key_value_heads': 8,
'max_sequence_length': 8192,
'max_position_embeddings': 8192,
'rope_theta': 10000.0,
'initializer_range': 0.02,
'rms_norm_eps': 1e-5,
'tie_word_embeddings': False,
'scan_attention': True,
'scan_mlp': True,
'hidden_act': 'silu',
"tokenizer": "llama"
},
'llama3_8b': {
'vocab_size': 128256,
'hidden_size': 4096,
'intermediate_size': 14336,
'num_hidden_layers': 32,
'num_attention_heads': 32,
'num_key_value_heads': 8,
'max_sequence_length': 8192,
'max_position_embeddings': 8192,
'rope_theta': 500000.0,
'initializer_range': 0.02,
'rms_norm_eps': 1e-5,
'tie_word_embeddings': False,
'hidden_act': 'silu',
'norm_module': 'RMSNorm',
"tokenizer": "hf-meta-llama/Meta-Llama-3-8B",
},
'llama3_70b': {
'vocab_size': 128256,
'hidden_size': 8192,
'intermediate_size': 28672,
'num_hidden_layers': 80,
'num_attention_heads': 64,
'num_key_value_heads': 8,
'max_sequence_length': 8192,
'max_position_embeddings': 8192,
'rope_theta': 500000.0,
'initializer_range': 0.02,
'rms_norm_eps': 1e-5,
'tie_word_embeddings': False,
'hidden_act': 'silu',
'norm_module': 'RMSNorm',
"tokenizer": "hf-meta-llama/Meta-Llama-3-70B",
},
'open_llama_3b': {
'vocab_size': 32000,
'hidden_size': 3200,
'intermediate_size': 8640,
'num_hidden_layers': 26,
'num_attention_heads': 32,
'max_sequence_length': 2048,
'initializer_range': 0.02,
'rms_norm_eps': 1e-6,
'max_position_embeddings': 2048,
'num_key_value_heads': 32,
'rope_theta': 10000.0,
'tie_word_embeddings': False,
'hidden_act': 'silu',
'norm_module': 'RMSNorm',
"tokenizer": "llama"
},
'gemma_2b': {
'vocab_size': 256000,
'hidden_size': 2048,
'intermediate_size': 16384,
'num_hidden_layers': 18,
'num_attention_heads': 8,
'max_sequence_length': 8192,
'initializer_range': 0.02,
'rms_norm_eps': 1e-6,
'max_position_embeddings': 8192,
'num_key_value_heads': 1,
'rope_theta': 10000.0,
'tie_word_embeddings': True,
'normalize_input_embeds': True,
'norm_module': 'GemmaRMSNorm',
'hidden_act': 'gelu',
"tokenizer": "gemma"
},
'gemma_7b': {
'vocab_size': 256000,
'hidden_size': 3072,
'intermediate_size': 24576,
'num_hidden_layers': 28,
'num_attention_heads': 16,
'max_sequence_length': 8192,
'initializer_range': 0.02,
'rms_norm_eps': 1e-6,
'max_position_embeddings': 8192,
'num_key_value_heads': 16,
'rope_theta': 10000.0,
'tie_word_embeddings': True,
'normalize_input_embeds': True,
'norm_module': 'GemmaRMSNorm',
'hidden_act': 'gelu',
"tokenizer": "gemma"
},
'tiny_llama_1b': {
'vocab_size': 32000,
'hidden_size': 2048,
'intermediate_size': 5632,
'num_hidden_layers': 22,
'num_attention_heads': 32,
'max_sequence_length': 2048,
'initializer_range': 0.02,
'rms_norm_eps': 1e-5,
'max_position_embeddings': 2048,
'num_key_value_heads': 4,
'rope_theta': 10000.0,
'tie_word_embeddings': False,
'hidden_act': 'silu',
'norm_module': 'RMSNorm',
"tokenizer": "llama"
},
'debug': { # A small model for debugging
'vocab_size': 32000,
'hidden_size': 512,
'intermediate_size': 512,
'num_hidden_layers': 1,
'num_attention_heads': 8,
'max_sequence_length': 4096,
'initializer_range': 0.02,
'rms_norm_eps': 1e-5,
'max_position_embeddings': 4096,
'num_key_value_heads': 8,
'rope_theta': 10000.0,
'tie_word_embeddings': False,
'hidden_act': 'silu',
'norm_module': 'RMSNorm',
"tokenizer": "llama"
},
'gemma2_9b': {
'vocab_size': 256000,
'hidden_size': 3584,
'head_dim': 256,
'intermediate_size': 14336,
'num_hidden_layers': 42,
'num_attention_heads': 16,
'max_sequence_length': 8192,
"query_pre_attn_scalar": 224,
'initializer_range': 0.02,
'rms_norm_eps': 1e-6,
'max_position_embeddings': 8192,
'num_key_value_heads': 8,
'rope_theta': 10000.0,
'tie_word_embeddings': False,
'normalize_input_embeds': True,
'norm_module': 'GemmaRMSNorm',
'hidden_act': 'gelu_tanh',
"tokenizer": "hf-google/gemma-2-9b",
"attn_logit_softcapping": 50.0,
"final_logit_softcapping": 30.0,
},
'gemma2_27b': {
'vocab_size': 256000,
'hidden_size': 4608,
'head_dim': 128,
'intermediate_size': 36864,
'num_hidden_layers': 46,
'num_attention_heads': 32,
'max_sequence_length': 8192,
"query_pre_attn_scalar": 144,
'initializer_range': 0.02,
'rms_norm_eps': 1e-6,
'max_position_embeddings': 8192,
'num_key_value_heads': 16,
'rope_theta': 10000.0,
'tie_word_embeddings': False,
'normalize_input_embeds': True,
'norm_module': 'GemmaRMSNorm',
'hidden_act': 'gelu_tanh',
"tokenizer": "hf-google/gemma-2-27b",
"attn_logit_softcapping": 50.0,
"final_logit_softcapping": 30.0,
},
}
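# Illustrative sketch (not part of the original file): a rough dense parameter
# count implied by one of the LLM configs above. It ignores norm weights and
# biases and assumes a gated MLP (gate, up, and down projections), so treat
# the result as an estimate only, e.g.
# _approx_param_count(OPEN_LLM_STANDARD_CONFIGS["llama3_8b"]) is about 8.0e9.
def _approx_param_count(cfg):
    hidden = cfg["hidden_size"]
    head_dim = cfg.get("head_dim", hidden // cfg["num_attention_heads"])
    q_proj = hidden * cfg["num_attention_heads"] * head_dim
    kv_proj = 2 * hidden * cfg["num_key_value_heads"] * head_dim
    o_proj = cfg["num_attention_heads"] * head_dim * hidden
    mlp = 3 * hidden * cfg["intermediate_size"]  # gate, up, down projections
    per_layer = q_proj + kv_proj + o_proj + mlp
    embeddings = cfg["vocab_size"] * hidden
    if not cfg["tie_word_embeddings"]:
        embeddings *= 2  # separate input and output embedding matrices
    return embeddings + cfg["num_hidden_layers"] * per_layer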