{ "trainer": { "trainer_class": "TrainerTopK", "dict_class": "AutoEncoderTopK", "lr": 0.00016329931618554522, "steps": 30000, "seed": 42, "activation_dim": 18432, "dict_size": 24576, "k": 64, "device": "cuda:0", "wandb_name": "all_modules_topk64_dict24576_seed42_batch8196", "submodule_list": "[GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): 
Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n)]" }, "buffer": { "d_submodule": 768, "io": "out", "n_ctxs": 128, "ctx_len": 128, "refresh_batch_size": 256, "out_batch_size": 8192, "device": "cuda:0" } }
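
For context: this config describes training a top-k sparse autoencoder (k = 64, dictionary of 24576 latents) on activations gathered from all 24 GPT-2 submodules listed in submodule_list (12 MLP outputs and 12 attention outputs, each of width d_submodule = 768, hence activation_dim = 24 × 768 = 18432). Below is a minimal, self-contained sketch of such a top-k autoencoder; it illustrates the mechanism only, and the library's actual AutoEncoderTopK / TrainerTopK implementation may differ (initialization, pre-encoder bias, auxiliary losses, dead-latent handling). The class name, config filename, and usage here are illustrative assumptions, not the library's API.

```python
import json

import torch
import torch.nn as nn


class TopKAutoEncoder(nn.Module):
    """Sketch of a top-k sparse autoencoder (names are illustrative)."""

    def __init__(self, activation_dim: int, dict_size: int, k: int):
        super().__init__()
        self.k = k
        self.encoder = nn.Linear(activation_dim, dict_size)
        self.decoder = nn.Linear(dict_size, activation_dim)

    def encode(self, x: torch.Tensor) -> torch.Tensor:
        # Dense pre-activations, then keep only the k largest per sample;
        # everything else is zeroed, giving exactly k active latents.
        pre = self.encoder(x)
        values, indices = torch.topk(pre, self.k, dim=-1)
        sparse = torch.zeros_like(pre)
        sparse.scatter_(-1, indices, torch.relu(values))
        return sparse

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.decoder(self.encode(x))


# Hypothetical usage: load the config shown above and reconstruct a batch.
with open("config.json") as f:  # assumed filename for the JSON above
    cfg = json.load(f)["trainer"]

# Sanity check: 24 submodules (12 MLP + 12 attention) x 768 dims = 18432.
assert cfg["activation_dim"] == 24 * 768

sae = TopKAutoEncoder(cfg["activation_dim"], cfg["dict_size"], cfg["k"])
x = torch.randn(4, cfg["activation_dim"])  # stand-in for buffered activations
loss = (sae(x) - x).pow(2).mean()          # plain MSE reconstruction loss
```

Two observations on the hyperparameters: the learning rate 0.00016329931618554522 equals 2e-4 · sqrt(2^14 / 24576), consistent with a heuristic that scales the learning rate with 1/sqrt(dict_size); and the buffer refreshes from 128 contexts of 128 tokens each (16384 token positions) while emitting training batches of out_batch_size = 8192 activation vectors.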