{ "architectures": [ "LlamaForCausalLMWithGNN" ], "attention_bias": false, "attention_dropout": 0.0, "bos_token_id": 128000, "eos_token_id": [ 128001, 128008, 128009 ], "gnn_config": { "GIN_after_attention": true, "GIN_after_attention_pre_GIN_norm": true, "GIN_after_attention_skip": true, "GIN_edge_weight_scaling": true, "GIN_hidden_dim_multiplier": 1, "GIN_use_MLP": true, "GIN_use_norm": false, "LlamaAttentionHierarchicalPerceiverAR_use_rope": true, "LlamaAttentionHierarchicalVariant_2_PerceiverAR_use_skip": true, "MLP_type": "standard_MLP", "N_GNN_from_attention_layers": 3, "activation": "prelu", "add_rope": false, "adj_construction_method": "sum", "adj_transform_hidden_dim": 128, "attention_GIN_MLP_GIN_MLP_mode": "shared", "attention_GIN_MLP_GIN_MLP_pre_aggregate": true, "attention_GIN_MLP_GIN_binary_scale": 1.0, "attention_GIN_MLP_GIN_fuse_mode": "epsilon", "attention_GIN_MLP_GIN_learnable_threshold": false, "attention_GIN_MLP_GIN_mode": "default", "attention_GIN_MLP_GIN_sharp_softplus_beta": 10.0, "attention_GIN_MLP_GIN_softmax_temperature": 1.0, "attention_GIN_MLP_GIN_threshold_mode": "none", "attention_GIN_MLP_GIN_threshold_value": 0.2, "attention_GIN_MLP_GIN_top_k_fraction_of_sequence_length": 0.1, "attention_GIN_MLP_GIN_use_ReLU_instead_of_softmax": true, "attention_GIN_MLP_GIN_use_softmax": false, "attention_GIN_MLP_attention_mix_mode": "A", "attention_GIN_MLP_multiplier": 2, "attention_GIN_MLP_o_proj_at_end": false, "attention_GIN_MLP_scoring_hidden_dim": 512, "attention_GIN_MLP_second_order_factor": 0.1, "attention_GIN_MLP_separate_attention": false, "attention_GIN_MLP_use_scoring_fnct": true, "attention_GIN_MLP_use_second_order": false, "attention_epsilon_strategy": "default", "attention_epsilon_uniform_value": 0.5, "combined_norm": false, "continuous_transform_alpha": 10.0, "distance_scaling_method": "power", "distance_weight_strength": 1.0, "dropout": 0.1, "enforce_causality": true, "epsilon_threshold": 0.6, "gnn_logic": "before_MLP", "gnn_mode": "single", "gnn_residual": false, "gnn_type": "causal_gin", "group_tokens_for_coarse_graining": false, "hidden_dim": 155, "hierarchical_enc_dec_type": "PerceiverAR", "initial_sharpening_value": 1.0, "lambda_GNN": 0.5, "lambda_GNN_initial": 0.0, "learnable_aggregate_activation": "softmax", "max_position_embeddings": 2048, "mix_weights_initial": 0.5, "model_type": "", "norm_to_hidden_states": false, "num_latent_layers": 4, "num_latents": 32, "num_latents_list": [ 64, 32, 8 ], "num_layers": 1, "per_head_ff": false, "plot_for_debugging": false, "remove_self_connections": false, "residual_epsilon_strategy": "default", "residual_epsilon_uniform_value": 0.1, "rms_norm_eps": 1e-05, "sharpening_value_init": "value", "soft_masking_initial_threshold": 0.01, "soft_masking_k": 10.0, "threshold": 0.1, "threshold_any_tau": 0.1, "tokenizer": null, "top_k": 8, "use_GNN_from_attention": "none", "use_GNN_from_attention_add_RoPE_at_every_layer": false, "use_differential_attention": false, "use_differential_attention_group_norm": false, "use_distance_scaling": false, "use_fixed_number_of_tokens_per_latent": false, "use_graph_property_modulation": false, "use_graph_property_modulation_with_norm": false, "use_graph_property_modulation_with_norm_use_causal_clustering": true, "use_hierarchical_attention": false, "use_layer_norm": true, "use_layer_norm_in_GIN_MLP": false, "use_no_norm_in_GIN_MLP": false, "use_original_hidden_states": false, "use_original_hidden_states_add_attention": false, "use_projection": true, "use_sharpening": false, "use_soft_masking": 
false, "zero_below_epsilon_threshold": true }, "head_dim": 128, "hidden_act": "silu", "hidden_size": 3072, "initializer_range": 0.02, "intermediate_size": 8192, "max_position_embeddings": 131072, "mlp_bias": false, "model_type": "llama", "num_attention_heads": 24, "num_hidden_layers": 28, "num_key_value_heads": 8, "pretraining_tp": 1, "rms_norm_eps": 1e-05, "rope_scaling": { "factor": 32.0, "high_freq_factor": 4.0, "low_freq_factor": 1.0, "original_max_position_embeddings": 8192, "rope_type": "llama3" }, "rope_theta": 500000.0, "tie_word_embeddings": true, "torch_dtype": "float32", "transformers_version": "4.46.1", "use_cache": false, "vocab_size": 128256 }