# Special tokens used to mark image patches and image boundaries in the text-token stream.
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
DEFAULT_IM_COL_TOKEN = "<im_col>"
IMAGE_PROMPT = "<|image|>"

# Extra (non-text) tokens, grouped for convenient registration with a tokenizer.
EXTRA_TOKENS = (
    DEFAULT_IM_START_TOKEN,
    DEFAULT_IM_END_TOKEN,
    DEFAULT_IMAGE_PATCH_TOKEN,
    DEFAULT_IM_COL_TOKEN,
    IMAGE_PROMPT,
)
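
# A minimal sketch (not part of the original module) of how these markers might be
# registered with a Hugging Face tokenizer so each maps to a dedicated id; the
# checkpoint name is purely illustrative:
#
#     from transformers import AutoTokenizer
#
#     tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B")
#     tokenizer.add_special_tokens({"additional_special_tokens": list(EXTRA_TOKENS)})
#     # Any model embedding matrix must then be resized to len(tokenizer).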

# Standard vision-transformer (ViT) image-backbone configurations, keyed by model name.
VIT_STANDARD_CONFIGS = {
    "dinov2-large": {
        "image_emb_dim": 1024,
        "image_mlp_dim": 4096,
        "image_patch_size": 14,
        "image_pos_patch_size": 14,
        "image_num_layers": 24,
        "image_num_heads": 16,
        "image_num_key_value_heads": 16,
        "image_head_dim": 64,
        "image_mlp_activations": "gelu",
        "image_default_input_size": (224, 224),
        "image_num_pos": 257,
        "image_norm_eps": 1e-6,
        "image_model_type": "dino",
    },
    "SigLIP-So400m-14-384": {
        "image_emb_dim": 1152,
        "image_num_layers": 27,
        "image_mlp_dim": 4304,
        "image_patch_size": 14,
        "image_pos_patch_size": 14,
        "image_num_heads": 16,
        "image_num_key_value_heads": 16,
        "image_head_dim": 72,
        "image_mlp_activations": "gelu",
        "image_default_input_size": (378, 378),
        "image_num_pos": 729,
        "image_norm_eps": 1e-6,
        "image_model_type": "siglip",
        "resize_mode": "siglip",
    },
    "DFN5B-CLIP-ViT-H-14-378": {
        "image_emb_dim": 1280,
        "image_patch_size": 14,
        "image_pos_patch_size": 14,
        "image_num_layers": 32,
        "image_num_heads": 16,
        "image_num_key_value_heads": 16,
        "image_head_dim": 80,
        "image_mlp_dim": 5120,
        "image_dropout_rate": 0.0,
        "image_mlp_activations": "quick_gelu",
        "image_default_input_size": (378, 378),
        "image_num_pos": 730,
        "image_norm_eps": 1e-5,
        "image_model_type": "openai",
        "resize_mode": "no_aspect_ratio",
    },
    "ViT-L/14-336": {
        "image_patch_size": 14,
        "image_pos_patch_size": 14,
        "image_emb_dim": 1024,
        "image_num_heads": 16,
        "image_num_layers": 23,
        "image_head_dim": 64,
        "image_mlp_dim": 4096,
        "image_mlp_activations": "quick_gelu",
        "image_dropout_rate": 0.0,
        "image_num_pos": 577,
        "image_default_input_size": (336, 336),
        "image_norm_eps": 1e-5,
        "image_num_key_value_heads": 16,
        "image_model_type": "openai",
    },
    "EVA02-L-14-336": {
        "image_patch_size": 14,
        "image_pos_patch_size": 14,
        "image_emb_dim": 1024,
        "image_num_heads": 16,
        "image_num_layers": 24,
        "image_head_dim": 64,
        "image_mlp_dim": 2730,
        "image_mlp_activations": "silu",
        "image_dropout_rate": 0.0,
        "image_num_pos": 577,
        "image_default_input_size": (336, 336),
        "image_norm_eps": 1e-6,
        "image_num_key_value_heads": 16,
        "image_model_type": "eva",
    },
    "ViT-L/14": {
        "image_patch_size": 14,
        "image_pos_patch_size": 14,
        "image_emb_dim": 1024,
        "image_num_heads": 16,
        "image_num_layers": 23,
        "image_head_dim": 64,
        "image_mlp_dim": 4096,
        "image_mlp_activations": "quick_gelu",
        "image_dropout_rate": 0.0,
        "image_num_pos": 257,
        "image_default_input_size": (224, 224),
        "image_norm_eps": 1e-5,
        "image_num_key_value_heads": 16,
        "image_model_type": "openai",
    },
    "debug": {
        "image_patch_size": 14,
        "image_pos_patch_size": 14,
        "image_emb_dim": 1024,
        "image_num_heads": 16,
        "image_num_layers": 1,
        "image_head_dim": 64,
        "image_mlp_dim": 1024,
        "image_mlp_activations": "quick_gelu",
        "image_dropout_rate": 0.0,
        "image_num_pos": 577,
        "image_default_input_size": (336, 336),
        "image_norm_eps": 1e-5,
        "image_num_key_value_heads": 16,
        "image_model_type": "openai",
    },
}
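
# A small sanity check, added here as an illustrative sketch rather than part of
# the original tables: the patch grid holds (input_size // patch_size) ** 2
# tokens, and "image_num_pos" is that count plus one position for the class
# token in backbones that prepend one (DINOv2, OpenAI CLIP, EVA), or exactly
# that count for SigLIP, which does not.
def _expected_image_num_pos(cfg: dict) -> int:
    """Recompute image_num_pos from the input and patch sizes."""
    side = cfg["image_default_input_size"][0] // cfg["image_patch_size"]
    cls_token = 0 if cfg["image_model_type"] == "siglip" else 1
    return side * side + cls_token

# Holds for every entry above, e.g. dinov2-large: (224 // 14) ** 2 + 1 == 257.
assert all(
    _expected_image_num_pos(cfg) == cfg["image_num_pos"]
    for cfg in VIT_STANDARD_CONFIGS.values()
)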

# Standard configurations for open-weight language-model backbones, keyed by model name.
OPEN_LLM_STANDARD_CONFIGS = {
    "qwen1.5_7b": {
        "vocab_size": 151936,
        "hidden_size": 4096,
        "intermediate_size": 11008,
        "num_hidden_layers": 32,
        "num_attention_heads": 32,
        "num_key_value_heads": 32,
        "max_sequence_length": 2048,
        "max_position_embeddings": 32768,
        "rope_theta": 1000000.0,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-6,
        "qkv_bias": True,
        "tie_word_embeddings": False,
        "hidden_act": "silu",
        "norm_module": "RMSNorm",
        "tokenizer": "hf-Qwen/Qwen1.5-7B",
    },
    "qwen1.5_14b": {
        "vocab_size": 152064,
        "hidden_size": 5120,
        "intermediate_size": 13696,
        "num_hidden_layers": 40,
        "num_attention_heads": 40,
        "num_key_value_heads": 40,
        "max_sequence_length": 2048,
        "max_position_embeddings": 32768,
        "rope_theta": 1000000.0,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-6,
        "qkv_bias": True,
        "tie_word_embeddings": False,
        "hidden_act": "silu",
        "norm_module": "RMSNorm",
        "tokenizer": "hf-Qwen/Qwen1.5-14B",
    },
    "qwen1.5_32b": {
        "vocab_size": 152064,
        "hidden_size": 5120,
        "intermediate_size": 27392,
        "num_hidden_layers": 64,
        "num_attention_heads": 40,
        "num_key_value_heads": 8,
        "max_sequence_length": 2048,
        "max_position_embeddings": 32768,
        "rope_theta": 1000000.0,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-6,
        "qkv_bias": True,
        "tie_word_embeddings": False,
        "hidden_act": "silu",
        "norm_module": "RMSNorm",
        "tokenizer": "hf-Qwen/Qwen1.5-32B",
    },
    "llama_7b": {
        "vocab_size": 32000,
        "hidden_size": 4096,
        "intermediate_size": 11008,
        "num_hidden_layers": 32,
        "num_attention_heads": 32,
        "num_key_value_heads": 32,
        "max_sequence_length": 2048,
        "max_position_embeddings": 8192,
        "rope_theta": 10000.0,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-5,
        "tie_word_embeddings": False,
        "hidden_act": "silu",
        "norm_module": "RMSNorm",
        "tokenizer": "llama",
    },
    "yi_6b": {
        "vocab_size": 64000,
        "hidden_size": 4096,
        "intermediate_size": 11008,
        "num_hidden_layers": 32,
        "num_attention_heads": 32,
        "num_key_value_heads": 4,
        "max_sequence_length": 4096,
        "max_position_embeddings": 4096,
        "rope_theta": 5000000.0,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-5,
        "tie_word_embeddings": False,
        "hidden_act": "silu",
        "norm_module": "RMSNorm",
        "tokenizer": "yi",
    },
    "yi_9b": {
        "vocab_size": 64000,
        "hidden_size": 4096,
        "intermediate_size": 11008,
        "num_hidden_layers": 48,
        "num_attention_heads": 32,
        "num_key_value_heads": 4,
        "max_sequence_length": 4096,
        "max_position_embeddings": 4096,
        "rope_theta": 10000.0,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-6,
        "tie_word_embeddings": False,
        "hidden_act": "silu",
        "norm_module": "RMSNorm",
        "tokenizer": "yi",
    },
    "yi_34b": {
        "vocab_size": 64000,
        "hidden_size": 7168,
        "intermediate_size": 20480,
        "num_hidden_layers": 60,
        "num_attention_heads": 56,
        "num_key_value_heads": 8,
        "max_sequence_length": 4096,
        "max_position_embeddings": 4096,
        "rope_theta": 5000000.0,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-5,
        "tie_word_embeddings": False,
        "hidden_act": "silu",
        "norm_module": "RMSNorm",
        "tokenizer": "yi",
    },
    "olmo_1b": {
        "vocab_size": 50304,
        "hidden_size": 2048,
        "intermediate_size": 8192,
        "num_hidden_layers": 16,
        "num_attention_heads": 16,
        "num_key_value_heads": 16,
        "max_sequence_length": 4096,
        "max_position_embeddings": 32768,
        "rope_theta": 10000.0,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-5,
        "tie_word_embeddings": True,
        "hidden_act": "silu",
        "norm_module": "OlmoLayerNorm",
        "tokenizer": "hf-allenai/OLMo-1B",
    },
    "olmo_7b": {
        "vocab_size": 50304,
        "hidden_size": 4096,
        "intermediate_size": 22016 // 2,  # == 11008
        "num_hidden_layers": 32,
        "num_attention_heads": 32,
        "num_key_value_heads": 32,
        "max_sequence_length": 4096,
        "max_position_embeddings": 32768,
        "rope_theta": 10000.0,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-5,
        "tie_word_embeddings": False,
        "hidden_act": "silu",
        "norm_module": "OlmoLayerNorm",
        "tokenizer": "hf-allenai/OLMo-7B",
    },
    "olmo_1.7_7b": {
        "vocab_size": 50304,
        "hidden_size": 4096,
        "intermediate_size": 22016 // 2,  # == 11008
        "num_hidden_layers": 32,
        "num_attention_heads": 32,
        "num_key_value_heads": 32,
        "max_sequence_length": 4096,
        "max_position_embeddings": 32768,
        "rope_theta": 10000.0,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-5,
        "tie_word_embeddings": False,
        "hidden_act": "silu",
        "qkv_clip": 8,
        "norm_module": "OlmoLayerNorm",
        "tokenizer": "hf-allenai/OLMo-1.7-7B",
    },
    "mistral_7b": {
        "vocab_size": 32000,
        "hidden_size": 4096,
        "intermediate_size": 14336,
        "num_hidden_layers": 32,
        "num_attention_heads": 32,
        "num_key_value_heads": 8,
        "max_sequence_length": 4096,
        "max_position_embeddings": 32768,
        "rope_theta": 10000.0,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-5,
        "tie_word_embeddings": False,
        "hidden_act": "silu",
        "norm_module": "RMSNorm",
        "tokenizer": "mistral",
    },
    "mistral0.3_7b": {
        "vocab_size": 32768,
        "hidden_size": 4096,
        "intermediate_size": 14336,
        "num_hidden_layers": 32,
        "num_attention_heads": 32,
        "num_key_value_heads": 8,
        "max_sequence_length": 4096,
        "max_position_embeddings": 32768,
        "rope_theta": 1000000.0,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-5,
        "tie_word_embeddings": False,
        "hidden_act": "silu",
        "norm_module": "RMSNorm",
        "tokenizer": "mistral0.3",
    },
    "mistral0.2_22b": {
        "vocab_size": 32000,
        "hidden_size": 6144,
        "intermediate_size": 16384,
        "num_hidden_layers": 56,
        "num_attention_heads": 48,
        "num_key_value_heads": 8,
        "max_sequence_length": 4096,
        "max_position_embeddings": 32768,
        "rope_theta": 1000000.0,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-5,
        "tie_word_embeddings": False,
        "hidden_act": "silu",
        "norm_module": "RMSNorm",
        "tokenizer": "mistral",
    },
    "llama_13b": {
        "vocab_size": 32000,
        "hidden_size": 5120,
        "intermediate_size": 13824,
        "num_hidden_layers": 40,
        "num_attention_heads": 40,
        "num_key_value_heads": 40,
        "max_sequence_length": 2048,
        "max_position_embeddings": 8192,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-5,
        "tie_word_embeddings": False,
        "hidden_act": "silu",
        "norm_module": "RMSNorm",
        "rope_theta": 10000.0,
        "tokenizer": "llama",
    },
    "llama_70b": {
        "vocab_size": 32000,
        "hidden_size": 8192,
        "intermediate_size": 28672,
        "num_hidden_layers": 80,
        "num_attention_heads": 64,
        "num_key_value_heads": 8,
        "max_sequence_length": 8192,
        "max_position_embeddings": 8192,
        "rope_theta": 10000.0,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-5,
        "tie_word_embeddings": False,
        "hidden_act": "silu",
        "tokenizer": "llama",
    },
    "llama_70bflash": {
        "vocab_size": 32000,
        "hidden_size": 8192,
        "intermediate_size": 28672,
        "num_hidden_layers": 80,
        "num_attention_heads": 64,
        "num_key_value_heads": 8,
        "max_sequence_length": 8192,
        "max_position_embeddings": 8192,
        "rope_theta": 10000.0,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-5,
        "tie_word_embeddings": False,
        "scan_attention": True,
        "scan_mlp": True,
        "hidden_act": "silu",
        "tokenizer": "llama",
    },
    "llama3_8b": {
        "vocab_size": 128256,
        "hidden_size": 4096,
        "intermediate_size": 14336,
        "num_hidden_layers": 32,
        "num_attention_heads": 32,
        "num_key_value_heads": 8,
        "max_sequence_length": 8192,
        "max_position_embeddings": 8192,
        "rope_theta": 500000.0,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-5,
        "tie_word_embeddings": False,
        "hidden_act": "silu",
        "norm_module": "RMSNorm",
        "tokenizer": "hf-meta-llama/Meta-Llama-3-8B",
    },
    "llama3_70b": {
        "vocab_size": 128256,
        "hidden_size": 8192,
        "intermediate_size": 28672,
        "num_hidden_layers": 80,
        "num_attention_heads": 64,
        "num_key_value_heads": 8,
        "max_sequence_length": 8192,
        "max_position_embeddings": 8192,
        "rope_theta": 500000.0,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-5,
        "tie_word_embeddings": False,
        "hidden_act": "silu",
        "norm_module": "RMSNorm",
        "tokenizer": "hf-meta-llama/Meta-Llama-3-70B",
    },
    "open_llama_3b": {
        "vocab_size": 32000,
        "hidden_size": 3200,
        "intermediate_size": 8640,
        "num_hidden_layers": 26,
        "num_attention_heads": 32,
        "max_sequence_length": 2048,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-6,
        "max_position_embeddings": 2048,
        "num_key_value_heads": 32,
        "rope_theta": 10000.0,
        "tie_word_embeddings": False,
        "hidden_act": "silu",
        "norm_module": "RMSNorm",
        "tokenizer": "llama",
    },
    "gemma_2b": {
        "vocab_size": 256000,
        "hidden_size": 2048,
        "intermediate_size": 16384,
        "num_hidden_layers": 18,
        "num_attention_heads": 8,
        "max_sequence_length": 8192,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-6,
        "max_position_embeddings": 8192,
        "num_key_value_heads": 1,
        "rope_theta": 10000.0,
        "tie_word_embeddings": True,
        "normalize_input_embeds": True,
        "norm_module": "GemmaRMSNorm",
        "hidden_act": "gelu",
        "tokenizer": "gemma",
    },
    "gemma_7b": {
        "vocab_size": 256000,
        "hidden_size": 3072,
        "intermediate_size": 24576,
        "num_hidden_layers": 28,
        "num_attention_heads": 16,
        "max_sequence_length": 8192,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-6,
        "max_position_embeddings": 8192,
        "num_key_value_heads": 16,
        "rope_theta": 10000.0,
        "tie_word_embeddings": True,
        "normalize_input_embeds": True,
        "norm_module": "GemmaRMSNorm",
        "hidden_act": "gelu",
        "tokenizer": "gemma",
    },
    "tiny_llama_1b": {
        "vocab_size": 32000,
        "hidden_size": 2048,
        "intermediate_size": 5632,
        "num_hidden_layers": 22,
        "num_attention_heads": 32,
        "max_sequence_length": 2048,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-5,
        "max_position_embeddings": 2048,
        "num_key_value_heads": 4,
        "rope_theta": 10000.0,
        "tie_word_embeddings": False,
        "hidden_act": "silu",
        "norm_module": "RMSNorm",
        "tokenizer": "llama",
    },
    "debug": {
        "vocab_size": 32000,
        "hidden_size": 512,
        "intermediate_size": 512,
        "num_hidden_layers": 1,
        "num_attention_heads": 8,
        "max_sequence_length": 4096,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-5,
        "max_position_embeddings": 4096,
        "num_key_value_heads": 8,
        "rope_theta": 10000.0,
        "tie_word_embeddings": False,
        "hidden_act": "silu",
        "norm_module": "RMSNorm",
        "tokenizer": "llama",
    },
    "gemma2_9b": {
        "vocab_size": 256000,
        "hidden_size": 3584,
        "head_dim": 256,
        "intermediate_size": 14336,
        "num_hidden_layers": 42,
        "num_attention_heads": 16,
        "max_sequence_length": 8192,
        "query_pre_attn_scalar": 224,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-6,
        "max_position_embeddings": 8192,
        "num_key_value_heads": 8,
        "rope_theta": 10000.0,
        "tie_word_embeddings": False,
        "normalize_input_embeds": True,
        "norm_module": "GemmaRMSNorm",
        "hidden_act": "gelu_tanh",
        "tokenizer": "hf-google/gemma-2-9b",
        "attn_logit_softcapping": 50.0,
        "final_logit_softcapping": 30.0,
    },
    "gemma2_27b": {
        "vocab_size": 256000,
        "hidden_size": 4608,
        "head_dim": 128,
        "intermediate_size": 36864,
        "num_hidden_layers": 46,
        "num_attention_heads": 32,
        "max_sequence_length": 8192,
        "query_pre_attn_scalar": 144,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-6,
        "max_position_embeddings": 8192,
        "num_key_value_heads": 16,
        "rope_theta": 10000.0,
        "tie_word_embeddings": False,
        "normalize_input_embeds": True,
        "norm_module": "GemmaRMSNorm",
        "hidden_act": "gelu_tanh",
        "tokenizer": "hf-google/gemma-2-27b",
        "attn_logit_softcapping": 50.0,
        "final_logit_softcapping": 30.0,
    },
}
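
# A rough parameter-count estimate derived from these fields; an illustrative
# sketch only (it assumes a gated three-matrix MLP, derives head dims from
# hidden_size, and ignores biases, norms, and any head_dim override):
def _approx_param_count(cfg: dict) -> int:
    h = cfg["hidden_size"]
    kv_ratio = cfg["num_key_value_heads"] / cfg["num_attention_heads"]
    attn = h * h * (2 + 2 * kv_ratio)       # query/output plus reduced key/value
    mlp = 3 * h * cfg["intermediate_size"]  # gate, up, and down projections
    embeds = cfg["vocab_size"] * h * (1 if cfg["tie_word_embeddings"] else 2)
    return int(cfg["num_hidden_layers"] * (attn + mlp) + embeds)

# For example, _approx_param_count(OPEN_LLM_STANDARD_CONFIGS["llama_7b"]) comes
# out near 6.7e9, consistent with the model's "7B" label.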