masani committed on
Commit
e94968b
·
verified ·
1 Parent(s): 11df375

End of training

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ wandb/run-20250402_145246-e1n3xkh6/run-e1n3xkh6.wandb filter=lfs diff=lfs merge=lfs -text
.hydra/config.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_dir: ${oc.env:PROJECT_ROOT}/logs/sft_pretrain_and_pushtohub
2
+ generate_prompt: src.utils.return_generate_prompt
3
+ wandb_config:
4
+ name: ${model.model_name}
5
+ project: sft_on_${task.task_name}
6
+ dir: ${log_dir}/${task.task_name}-${model.model_name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
7
+ notes: null
8
+ trainer_args:
9
+ _target_: trl.SFTConfig
10
+ per_device_train_batch_size: 8
11
+ per_device_eval_batch_size: 16
12
+ num_train_epochs: 10
13
+ logging_dir: ${log_dir}/${task.task_name}-${model.model_name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
14
+ push_to_hub: false
15
+ save_strategy: epoch
16
+ evaluation_strategy: epoch
17
+ batch_eval_metrics: true
18
+ do_train: true
19
+ do_eval: true
20
+ output_dir: ${.logging_dir}
21
+ report_to: wandb
22
+ logging_steps: 10
23
+ task:
24
+ dataset:
25
+ _target_: datasets.load_dataset
26
+ path: openai/gsm8k
27
+ name: main
28
+ prompt_key: question
29
+ target_key: answer
30
+ default_prompt: "1. Always present the final answer on the last line of your response\
31
+ \ in the format: #### <answer> Ensure that the answer is a single number. \n 2.\
32
+ \ End each sentence with a newline character ('\\n'). \n 3. Perform any calculations\
33
+ \ within a <<...>> block before outputing the result of this calculation."
34
+ extract_answer_from_dataset: src.task.gsm8k.ExtractAnswerFromDataset
35
+ task_name: gsm8k
36
+ reward_class:
37
+ _target_: src.task.gsm8k.GSM8KReward
38
+ LOG_FILE: ${trainer.args.output_dir}/completions.json
39
+ format_reward_function: src.task.gsm8k.FormatRewardFunction
40
+ model:
41
+ model_name_or_path: openai-community/gpt2-xl
42
+ model_name: gpt2-xl
43
+ model_config:
44
+ _target_: trl.ModelConfig
45
+ use_peft: false
46
+ dataset_wrapper:
47
+ _target_: src.utils.CurriculumDatasetWrapper
.hydra/hydra.yaml ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ${log_dir}/${task.task_name}-${model.model_name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
4
+ sweep:
5
+ dir: ${log_dir}/${task.task_name}-${model.model_name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - model=gpt2xl_1.5b
116
+ - task=gsm8k
117
+ job:
118
+ name: sft_pretrain_and_pushtohub
119
+ chdir: null
120
+ override_dirname: model=gpt2xl_1.5b,task=gsm8k
121
+ id: ???
122
+ num: ???
123
+ config_name: sft_train_and_pushtohub.yaml
124
+ env_set: {}
125
+ env_copy: []
126
+ config:
127
+ override_dirname:
128
+ kv_sep: '='
129
+ item_sep: ','
130
+ exclude_keys: []
131
+ runtime:
132
+ version: 1.3.2
133
+ version_base: '1.3'
134
+ cwd: /mnt/dlabscratch1/amani/LLM-RL
135
+ config_sources:
136
+ - path: hydra.conf
137
+ schema: pkg
138
+ provider: hydra
139
+ - path: /mnt/dlabscratch1/amani/LLM-RL/config
140
+ schema: file
141
+ provider: main
142
+ - path: ''
143
+ schema: structured
144
+ provider: schema
145
+ output_dir: /mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39
146
+ choices:
147
+ dataset_wrapper: default
148
+ model: gpt2xl_1.5b
149
+ task: gsm8k
150
+ hydra/env: default
151
+ hydra/callbacks: null
152
+ hydra/job_logging: default
153
+ hydra/hydra_logging: default
154
+ hydra/hydra_help: default
155
+ hydra/help: default
156
+ hydra/sweeper: basic
157
+ hydra/launcher: basic
158
+ hydra/output: default
159
+ verbose: false
.hydra/overrides.yaml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ - model=gpt2xl_1.5b
2
+ - task=gsm8k
README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: openai-community/gpt2-xl
3
+ library_name: transformers
4
+ model_name: 'gpt2-xl-gsm8k-epoch1-acc0-1. Always '
5
+ tags:
6
+ - generated_from_trainer
7
+ - trl
8
+ - sft
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for gpt2-xl-gsm8k-epoch1-acc0-1. Always
13
+
14
+ This model is a fine-tuned version of [openai-community/gpt2-xl](https://huggingface.co/openai-community/gpt2-xl).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="masani/2025-04-02_14-52-39", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/epfl-dlab/sft_on_gsm8k/runs/e1n3xkh6)
31
+
32
+
33
+ This model was trained with SFT.
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.15.2
38
+ - Transformers: 4.49.0
39
+ - Pytorch: 2.5.1
40
+ - Datasets: 3.1.0
41
+ - Tokenizers: 0.21.1
42
+
43
+ ## Citations
44
+
45
+
46
+
47
+ Cite TRL as:
48
+
49
+ ```bibtex
50
+ @misc{vonwerra2022trl,
51
+ title = {{TRL: Transformer Reinforcement Learning}},
52
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
53
+ year = 2020,
54
+ journal = {GitHub repository},
55
+ publisher = {GitHub},
56
+ howpublished = {\url{https://github.com/huggingface/trl}}
57
+ }
58
+ ```
config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "openai-community/gpt2-xl",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 1600,
16
+ "n_head": 25,
17
+ "n_inner": null,
18
+ "n_layer": 48,
19
+ "n_positions": 1024,
20
+ "output_past": true,
21
+ "reorder_and_upcast_attn": false,
22
+ "resid_pdrop": 0.1,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
+ "scale_attn_weights": true,
25
+ "summary_activation": null,
26
+ "summary_first_dropout": 0.1,
27
+ "summary_proj_to_labels": true,
28
+ "summary_type": "cls_index",
29
+ "summary_use_proj": true,
30
+ "task_specific_params": {
31
+ "text-generation": {
32
+ "do_sample": true,
33
+ "max_length": 50
34
+ }
35
+ },
36
+ "torch_dtype": "float32",
37
+ "transformers_version": "4.49.0",
38
+ "use_cache": true,
39
+ "vocab_size": 50257
40
+ }
epoch1/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "openai-community/gpt2-xl",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 1600,
16
+ "n_head": 25,
17
+ "n_inner": null,
18
+ "n_layer": 48,
19
+ "n_positions": 1024,
20
+ "output_past": true,
21
+ "reorder_and_upcast_attn": false,
22
+ "resid_pdrop": 0.1,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
+ "scale_attn_weights": true,
25
+ "summary_activation": null,
26
+ "summary_first_dropout": 0.1,
27
+ "summary_proj_to_labels": true,
28
+ "summary_type": "cls_index",
29
+ "summary_use_proj": true,
30
+ "task_specific_params": {
31
+ "text-generation": {
32
+ "do_sample": true,
33
+ "max_length": 50
34
+ }
35
+ },
36
+ "torch_dtype": "float32",
37
+ "transformers_version": "4.49.0",
38
+ "use_cache": true,
39
+ "vocab_size": 50257
40
+ }
epoch1/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.49.0"
6
+ }
epoch1/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
epoch1/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45eaf1bc5dcfd0b4839330d6f467f69caab9ee368654c5c6222c59406e7cd79a
3
+ size 4959881464
epoch1/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0918a2b866bcf0560ffd7519aed3b03d19320f777c8b7792408b6e5d65c5da2b
3
+ size 1270624096
epoch1/model.safetensors.index.json ADDED
@@ -0,0 +1,587 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 6230444800
4
+ },
5
+ "weight_map": {
6
+ "transformer.h.0.attn.c_attn.bias": "model-00001-of-00002.safetensors",
7
+ "transformer.h.0.attn.c_attn.weight": "model-00001-of-00002.safetensors",
8
+ "transformer.h.0.attn.c_proj.bias": "model-00001-of-00002.safetensors",
9
+ "transformer.h.0.attn.c_proj.weight": "model-00001-of-00002.safetensors",
10
+ "transformer.h.0.ln_1.bias": "model-00001-of-00002.safetensors",
11
+ "transformer.h.0.ln_1.weight": "model-00001-of-00002.safetensors",
12
+ "transformer.h.0.ln_2.bias": "model-00001-of-00002.safetensors",
13
+ "transformer.h.0.ln_2.weight": "model-00001-of-00002.safetensors",
14
+ "transformer.h.0.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
15
+ "transformer.h.0.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
16
+ "transformer.h.0.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
17
+ "transformer.h.0.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
18
+ "transformer.h.1.attn.c_attn.bias": "model-00001-of-00002.safetensors",
19
+ "transformer.h.1.attn.c_attn.weight": "model-00001-of-00002.safetensors",
20
+ "transformer.h.1.attn.c_proj.bias": "model-00001-of-00002.safetensors",
21
+ "transformer.h.1.attn.c_proj.weight": "model-00001-of-00002.safetensors",
22
+ "transformer.h.1.ln_1.bias": "model-00001-of-00002.safetensors",
23
+ "transformer.h.1.ln_1.weight": "model-00001-of-00002.safetensors",
24
+ "transformer.h.1.ln_2.bias": "model-00001-of-00002.safetensors",
25
+ "transformer.h.1.ln_2.weight": "model-00001-of-00002.safetensors",
26
+ "transformer.h.1.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
27
+ "transformer.h.1.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
28
+ "transformer.h.1.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
29
+ "transformer.h.1.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
30
+ "transformer.h.10.attn.c_attn.bias": "model-00001-of-00002.safetensors",
31
+ "transformer.h.10.attn.c_attn.weight": "model-00001-of-00002.safetensors",
32
+ "transformer.h.10.attn.c_proj.bias": "model-00001-of-00002.safetensors",
33
+ "transformer.h.10.attn.c_proj.weight": "model-00001-of-00002.safetensors",
34
+ "transformer.h.10.ln_1.bias": "model-00001-of-00002.safetensors",
35
+ "transformer.h.10.ln_1.weight": "model-00001-of-00002.safetensors",
36
+ "transformer.h.10.ln_2.bias": "model-00001-of-00002.safetensors",
37
+ "transformer.h.10.ln_2.weight": "model-00001-of-00002.safetensors",
38
+ "transformer.h.10.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
39
+ "transformer.h.10.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
40
+ "transformer.h.10.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
41
+ "transformer.h.10.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
42
+ "transformer.h.11.attn.c_attn.bias": "model-00001-of-00002.safetensors",
43
+ "transformer.h.11.attn.c_attn.weight": "model-00001-of-00002.safetensors",
44
+ "transformer.h.11.attn.c_proj.bias": "model-00001-of-00002.safetensors",
45
+ "transformer.h.11.attn.c_proj.weight": "model-00001-of-00002.safetensors",
46
+ "transformer.h.11.ln_1.bias": "model-00001-of-00002.safetensors",
47
+ "transformer.h.11.ln_1.weight": "model-00001-of-00002.safetensors",
48
+ "transformer.h.11.ln_2.bias": "model-00001-of-00002.safetensors",
49
+ "transformer.h.11.ln_2.weight": "model-00001-of-00002.safetensors",
50
+ "transformer.h.11.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
51
+ "transformer.h.11.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
52
+ "transformer.h.11.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
53
+ "transformer.h.11.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
54
+ "transformer.h.12.attn.c_attn.bias": "model-00001-of-00002.safetensors",
55
+ "transformer.h.12.attn.c_attn.weight": "model-00001-of-00002.safetensors",
56
+ "transformer.h.12.attn.c_proj.bias": "model-00001-of-00002.safetensors",
57
+ "transformer.h.12.attn.c_proj.weight": "model-00001-of-00002.safetensors",
58
+ "transformer.h.12.ln_1.bias": "model-00001-of-00002.safetensors",
59
+ "transformer.h.12.ln_1.weight": "model-00001-of-00002.safetensors",
60
+ "transformer.h.12.ln_2.bias": "model-00001-of-00002.safetensors",
61
+ "transformer.h.12.ln_2.weight": "model-00001-of-00002.safetensors",
62
+ "transformer.h.12.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
63
+ "transformer.h.12.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
64
+ "transformer.h.12.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
65
+ "transformer.h.12.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
66
+ "transformer.h.13.attn.c_attn.bias": "model-00001-of-00002.safetensors",
67
+ "transformer.h.13.attn.c_attn.weight": "model-00001-of-00002.safetensors",
68
+ "transformer.h.13.attn.c_proj.bias": "model-00001-of-00002.safetensors",
69
+ "transformer.h.13.attn.c_proj.weight": "model-00001-of-00002.safetensors",
70
+ "transformer.h.13.ln_1.bias": "model-00001-of-00002.safetensors",
71
+ "transformer.h.13.ln_1.weight": "model-00001-of-00002.safetensors",
72
+ "transformer.h.13.ln_2.bias": "model-00001-of-00002.safetensors",
73
+ "transformer.h.13.ln_2.weight": "model-00001-of-00002.safetensors",
74
+ "transformer.h.13.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
75
+ "transformer.h.13.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
76
+ "transformer.h.13.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
77
+ "transformer.h.13.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
78
+ "transformer.h.14.attn.c_attn.bias": "model-00001-of-00002.safetensors",
79
+ "transformer.h.14.attn.c_attn.weight": "model-00001-of-00002.safetensors",
80
+ "transformer.h.14.attn.c_proj.bias": "model-00001-of-00002.safetensors",
81
+ "transformer.h.14.attn.c_proj.weight": "model-00001-of-00002.safetensors",
82
+ "transformer.h.14.ln_1.bias": "model-00001-of-00002.safetensors",
83
+ "transformer.h.14.ln_1.weight": "model-00001-of-00002.safetensors",
84
+ "transformer.h.14.ln_2.bias": "model-00001-of-00002.safetensors",
85
+ "transformer.h.14.ln_2.weight": "model-00001-of-00002.safetensors",
86
+ "transformer.h.14.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
87
+ "transformer.h.14.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
88
+ "transformer.h.14.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
89
+ "transformer.h.14.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
90
+ "transformer.h.15.attn.c_attn.bias": "model-00001-of-00002.safetensors",
91
+ "transformer.h.15.attn.c_attn.weight": "model-00001-of-00002.safetensors",
92
+ "transformer.h.15.attn.c_proj.bias": "model-00001-of-00002.safetensors",
93
+ "transformer.h.15.attn.c_proj.weight": "model-00001-of-00002.safetensors",
94
+ "transformer.h.15.ln_1.bias": "model-00001-of-00002.safetensors",
95
+ "transformer.h.15.ln_1.weight": "model-00001-of-00002.safetensors",
96
+ "transformer.h.15.ln_2.bias": "model-00001-of-00002.safetensors",
97
+ "transformer.h.15.ln_2.weight": "model-00001-of-00002.safetensors",
98
+ "transformer.h.15.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
99
+ "transformer.h.15.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
100
+ "transformer.h.15.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
101
+ "transformer.h.15.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
102
+ "transformer.h.16.attn.c_attn.bias": "model-00001-of-00002.safetensors",
103
+ "transformer.h.16.attn.c_attn.weight": "model-00001-of-00002.safetensors",
104
+ "transformer.h.16.attn.c_proj.bias": "model-00001-of-00002.safetensors",
105
+ "transformer.h.16.attn.c_proj.weight": "model-00001-of-00002.safetensors",
106
+ "transformer.h.16.ln_1.bias": "model-00001-of-00002.safetensors",
107
+ "transformer.h.16.ln_1.weight": "model-00001-of-00002.safetensors",
108
+ "transformer.h.16.ln_2.bias": "model-00001-of-00002.safetensors",
109
+ "transformer.h.16.ln_2.weight": "model-00001-of-00002.safetensors",
110
+ "transformer.h.16.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
111
+ "transformer.h.16.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
112
+ "transformer.h.16.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
113
+ "transformer.h.16.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
114
+ "transformer.h.17.attn.c_attn.bias": "model-00001-of-00002.safetensors",
115
+ "transformer.h.17.attn.c_attn.weight": "model-00001-of-00002.safetensors",
116
+ "transformer.h.17.attn.c_proj.bias": "model-00001-of-00002.safetensors",
117
+ "transformer.h.17.attn.c_proj.weight": "model-00001-of-00002.safetensors",
118
+ "transformer.h.17.ln_1.bias": "model-00001-of-00002.safetensors",
119
+ "transformer.h.17.ln_1.weight": "model-00001-of-00002.safetensors",
120
+ "transformer.h.17.ln_2.bias": "model-00001-of-00002.safetensors",
121
+ "transformer.h.17.ln_2.weight": "model-00001-of-00002.safetensors",
122
+ "transformer.h.17.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
123
+ "transformer.h.17.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
124
+ "transformer.h.17.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
125
+ "transformer.h.17.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
126
+ "transformer.h.18.attn.c_attn.bias": "model-00001-of-00002.safetensors",
127
+ "transformer.h.18.attn.c_attn.weight": "model-00001-of-00002.safetensors",
128
+ "transformer.h.18.attn.c_proj.bias": "model-00001-of-00002.safetensors",
129
+ "transformer.h.18.attn.c_proj.weight": "model-00001-of-00002.safetensors",
130
+ "transformer.h.18.ln_1.bias": "model-00001-of-00002.safetensors",
131
+ "transformer.h.18.ln_1.weight": "model-00001-of-00002.safetensors",
132
+ "transformer.h.18.ln_2.bias": "model-00001-of-00002.safetensors",
133
+ "transformer.h.18.ln_2.weight": "model-00001-of-00002.safetensors",
134
+ "transformer.h.18.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
135
+ "transformer.h.18.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
136
+ "transformer.h.18.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
137
+ "transformer.h.18.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
138
+ "transformer.h.19.attn.c_attn.bias": "model-00001-of-00002.safetensors",
139
+ "transformer.h.19.attn.c_attn.weight": "model-00001-of-00002.safetensors",
140
+ "transformer.h.19.attn.c_proj.bias": "model-00001-of-00002.safetensors",
141
+ "transformer.h.19.attn.c_proj.weight": "model-00001-of-00002.safetensors",
142
+ "transformer.h.19.ln_1.bias": "model-00001-of-00002.safetensors",
143
+ "transformer.h.19.ln_1.weight": "model-00001-of-00002.safetensors",
144
+ "transformer.h.19.ln_2.bias": "model-00001-of-00002.safetensors",
145
+ "transformer.h.19.ln_2.weight": "model-00001-of-00002.safetensors",
146
+ "transformer.h.19.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
147
+ "transformer.h.19.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
148
+ "transformer.h.19.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
149
+ "transformer.h.19.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
150
+ "transformer.h.2.attn.c_attn.bias": "model-00001-of-00002.safetensors",
151
+ "transformer.h.2.attn.c_attn.weight": "model-00001-of-00002.safetensors",
152
+ "transformer.h.2.attn.c_proj.bias": "model-00001-of-00002.safetensors",
153
+ "transformer.h.2.attn.c_proj.weight": "model-00001-of-00002.safetensors",
154
+ "transformer.h.2.ln_1.bias": "model-00001-of-00002.safetensors",
155
+ "transformer.h.2.ln_1.weight": "model-00001-of-00002.safetensors",
156
+ "transformer.h.2.ln_2.bias": "model-00001-of-00002.safetensors",
157
+ "transformer.h.2.ln_2.weight": "model-00001-of-00002.safetensors",
158
+ "transformer.h.2.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
159
+ "transformer.h.2.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
160
+ "transformer.h.2.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
161
+ "transformer.h.2.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
162
+ "transformer.h.20.attn.c_attn.bias": "model-00001-of-00002.safetensors",
163
+ "transformer.h.20.attn.c_attn.weight": "model-00001-of-00002.safetensors",
164
+ "transformer.h.20.attn.c_proj.bias": "model-00001-of-00002.safetensors",
165
+ "transformer.h.20.attn.c_proj.weight": "model-00001-of-00002.safetensors",
166
+ "transformer.h.20.ln_1.bias": "model-00001-of-00002.safetensors",
167
+ "transformer.h.20.ln_1.weight": "model-00001-of-00002.safetensors",
168
+ "transformer.h.20.ln_2.bias": "model-00001-of-00002.safetensors",
169
+ "transformer.h.20.ln_2.weight": "model-00001-of-00002.safetensors",
170
+ "transformer.h.20.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
171
+ "transformer.h.20.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
172
+ "transformer.h.20.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
173
+ "transformer.h.20.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
174
+ "transformer.h.21.attn.c_attn.bias": "model-00001-of-00002.safetensors",
175
+ "transformer.h.21.attn.c_attn.weight": "model-00001-of-00002.safetensors",
176
+ "transformer.h.21.attn.c_proj.bias": "model-00001-of-00002.safetensors",
177
+ "transformer.h.21.attn.c_proj.weight": "model-00001-of-00002.safetensors",
178
+ "transformer.h.21.ln_1.bias": "model-00001-of-00002.safetensors",
179
+ "transformer.h.21.ln_1.weight": "model-00001-of-00002.safetensors",
180
+ "transformer.h.21.ln_2.bias": "model-00001-of-00002.safetensors",
181
+ "transformer.h.21.ln_2.weight": "model-00001-of-00002.safetensors",
182
+ "transformer.h.21.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
183
+ "transformer.h.21.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
184
+ "transformer.h.21.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
185
+ "transformer.h.21.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
186
+ "transformer.h.22.attn.c_attn.bias": "model-00001-of-00002.safetensors",
187
+ "transformer.h.22.attn.c_attn.weight": "model-00001-of-00002.safetensors",
188
+ "transformer.h.22.attn.c_proj.bias": "model-00001-of-00002.safetensors",
189
+ "transformer.h.22.attn.c_proj.weight": "model-00001-of-00002.safetensors",
190
+ "transformer.h.22.ln_1.bias": "model-00001-of-00002.safetensors",
191
+ "transformer.h.22.ln_1.weight": "model-00001-of-00002.safetensors",
192
+ "transformer.h.22.ln_2.bias": "model-00001-of-00002.safetensors",
193
+ "transformer.h.22.ln_2.weight": "model-00001-of-00002.safetensors",
194
+ "transformer.h.22.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
195
+ "transformer.h.22.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
196
+ "transformer.h.22.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
197
+ "transformer.h.22.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
198
+ "transformer.h.23.attn.c_attn.bias": "model-00001-of-00002.safetensors",
199
+ "transformer.h.23.attn.c_attn.weight": "model-00001-of-00002.safetensors",
200
+ "transformer.h.23.attn.c_proj.bias": "model-00001-of-00002.safetensors",
201
+ "transformer.h.23.attn.c_proj.weight": "model-00001-of-00002.safetensors",
202
+ "transformer.h.23.ln_1.bias": "model-00001-of-00002.safetensors",
203
+ "transformer.h.23.ln_1.weight": "model-00001-of-00002.safetensors",
204
+ "transformer.h.23.ln_2.bias": "model-00001-of-00002.safetensors",
205
+ "transformer.h.23.ln_2.weight": "model-00001-of-00002.safetensors",
206
+ "transformer.h.23.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
207
+ "transformer.h.23.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
208
+ "transformer.h.23.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
209
+ "transformer.h.23.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
210
+ "transformer.h.24.attn.c_attn.bias": "model-00001-of-00002.safetensors",
211
+ "transformer.h.24.attn.c_attn.weight": "model-00001-of-00002.safetensors",
212
+ "transformer.h.24.attn.c_proj.bias": "model-00001-of-00002.safetensors",
213
+ "transformer.h.24.attn.c_proj.weight": "model-00001-of-00002.safetensors",
214
+ "transformer.h.24.ln_1.bias": "model-00001-of-00002.safetensors",
215
+ "transformer.h.24.ln_1.weight": "model-00001-of-00002.safetensors",
216
+ "transformer.h.24.ln_2.bias": "model-00001-of-00002.safetensors",
217
+ "transformer.h.24.ln_2.weight": "model-00001-of-00002.safetensors",
218
+ "transformer.h.24.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
219
+ "transformer.h.24.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
220
+ "transformer.h.24.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
221
+ "transformer.h.24.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
222
+ "transformer.h.25.attn.c_attn.bias": "model-00001-of-00002.safetensors",
223
+ "transformer.h.25.attn.c_attn.weight": "model-00001-of-00002.safetensors",
224
+ "transformer.h.25.attn.c_proj.bias": "model-00001-of-00002.safetensors",
225
+ "transformer.h.25.attn.c_proj.weight": "model-00001-of-00002.safetensors",
226
+ "transformer.h.25.ln_1.bias": "model-00001-of-00002.safetensors",
227
+ "transformer.h.25.ln_1.weight": "model-00001-of-00002.safetensors",
228
+ "transformer.h.25.ln_2.bias": "model-00001-of-00002.safetensors",
229
+ "transformer.h.25.ln_2.weight": "model-00001-of-00002.safetensors",
230
+ "transformer.h.25.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
231
+ "transformer.h.25.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
232
+ "transformer.h.25.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
233
+ "transformer.h.25.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
234
+ "transformer.h.26.attn.c_attn.bias": "model-00001-of-00002.safetensors",
235
+ "transformer.h.26.attn.c_attn.weight": "model-00001-of-00002.safetensors",
236
+ "transformer.h.26.attn.c_proj.bias": "model-00001-of-00002.safetensors",
237
+ "transformer.h.26.attn.c_proj.weight": "model-00001-of-00002.safetensors",
238
+ "transformer.h.26.ln_1.bias": "model-00001-of-00002.safetensors",
239
+ "transformer.h.26.ln_1.weight": "model-00001-of-00002.safetensors",
240
+ "transformer.h.26.ln_2.bias": "model-00001-of-00002.safetensors",
241
+ "transformer.h.26.ln_2.weight": "model-00001-of-00002.safetensors",
242
+ "transformer.h.26.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
243
+ "transformer.h.26.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
244
+ "transformer.h.26.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
245
+ "transformer.h.26.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
246
+ "transformer.h.27.attn.c_attn.bias": "model-00001-of-00002.safetensors",
247
+ "transformer.h.27.attn.c_attn.weight": "model-00001-of-00002.safetensors",
248
+ "transformer.h.27.attn.c_proj.bias": "model-00001-of-00002.safetensors",
249
+ "transformer.h.27.attn.c_proj.weight": "model-00001-of-00002.safetensors",
250
+ "transformer.h.27.ln_1.bias": "model-00001-of-00002.safetensors",
251
+ "transformer.h.27.ln_1.weight": "model-00001-of-00002.safetensors",
252
+ "transformer.h.27.ln_2.bias": "model-00001-of-00002.safetensors",
253
+ "transformer.h.27.ln_2.weight": "model-00001-of-00002.safetensors",
254
+ "transformer.h.27.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
255
+ "transformer.h.27.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
256
+ "transformer.h.27.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
257
+ "transformer.h.27.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
258
+ "transformer.h.28.attn.c_attn.bias": "model-00001-of-00002.safetensors",
259
+ "transformer.h.28.attn.c_attn.weight": "model-00001-of-00002.safetensors",
260
+ "transformer.h.28.attn.c_proj.bias": "model-00001-of-00002.safetensors",
261
+ "transformer.h.28.attn.c_proj.weight": "model-00001-of-00002.safetensors",
262
+ "transformer.h.28.ln_1.bias": "model-00001-of-00002.safetensors",
263
+ "transformer.h.28.ln_1.weight": "model-00001-of-00002.safetensors",
264
+ "transformer.h.28.ln_2.bias": "model-00001-of-00002.safetensors",
265
+ "transformer.h.28.ln_2.weight": "model-00001-of-00002.safetensors",
266
+ "transformer.h.28.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
267
+ "transformer.h.28.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
268
+ "transformer.h.28.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
269
+ "transformer.h.28.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
270
+ "transformer.h.29.attn.c_attn.bias": "model-00001-of-00002.safetensors",
271
+ "transformer.h.29.attn.c_attn.weight": "model-00001-of-00002.safetensors",
272
+ "transformer.h.29.attn.c_proj.bias": "model-00001-of-00002.safetensors",
273
+ "transformer.h.29.attn.c_proj.weight": "model-00001-of-00002.safetensors",
274
+ "transformer.h.29.ln_1.bias": "model-00001-of-00002.safetensors",
275
+ "transformer.h.29.ln_1.weight": "model-00001-of-00002.safetensors",
276
+ "transformer.h.29.ln_2.bias": "model-00001-of-00002.safetensors",
277
+ "transformer.h.29.ln_2.weight": "model-00001-of-00002.safetensors",
278
+ "transformer.h.29.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
279
+ "transformer.h.29.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
280
+ "transformer.h.29.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
281
+ "transformer.h.29.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
282
+ "transformer.h.3.attn.c_attn.bias": "model-00001-of-00002.safetensors",
283
+ "transformer.h.3.attn.c_attn.weight": "model-00001-of-00002.safetensors",
284
+ "transformer.h.3.attn.c_proj.bias": "model-00001-of-00002.safetensors",
285
+ "transformer.h.3.attn.c_proj.weight": "model-00001-of-00002.safetensors",
286
+ "transformer.h.3.ln_1.bias": "model-00001-of-00002.safetensors",
287
+ "transformer.h.3.ln_1.weight": "model-00001-of-00002.safetensors",
288
+ "transformer.h.3.ln_2.bias": "model-00001-of-00002.safetensors",
289
+ "transformer.h.3.ln_2.weight": "model-00001-of-00002.safetensors",
290
+ "transformer.h.3.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
291
+ "transformer.h.3.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
292
+ "transformer.h.3.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
293
+ "transformer.h.3.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
294
+ "transformer.h.30.attn.c_attn.bias": "model-00001-of-00002.safetensors",
295
+ "transformer.h.30.attn.c_attn.weight": "model-00001-of-00002.safetensors",
296
+ "transformer.h.30.attn.c_proj.bias": "model-00001-of-00002.safetensors",
297
+ "transformer.h.30.attn.c_proj.weight": "model-00001-of-00002.safetensors",
298
+ "transformer.h.30.ln_1.bias": "model-00001-of-00002.safetensors",
299
+ "transformer.h.30.ln_1.weight": "model-00001-of-00002.safetensors",
300
+ "transformer.h.30.ln_2.bias": "model-00001-of-00002.safetensors",
301
+ "transformer.h.30.ln_2.weight": "model-00001-of-00002.safetensors",
302
+ "transformer.h.30.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
303
+ "transformer.h.30.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
304
+ "transformer.h.30.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
305
+ "transformer.h.30.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
306
+ "transformer.h.31.attn.c_attn.bias": "model-00001-of-00002.safetensors",
307
+ "transformer.h.31.attn.c_attn.weight": "model-00001-of-00002.safetensors",
308
+ "transformer.h.31.attn.c_proj.bias": "model-00001-of-00002.safetensors",
309
+ "transformer.h.31.attn.c_proj.weight": "model-00001-of-00002.safetensors",
310
+ "transformer.h.31.ln_1.bias": "model-00001-of-00002.safetensors",
311
+ "transformer.h.31.ln_1.weight": "model-00001-of-00002.safetensors",
312
+ "transformer.h.31.ln_2.bias": "model-00001-of-00002.safetensors",
313
+ "transformer.h.31.ln_2.weight": "model-00001-of-00002.safetensors",
314
+ "transformer.h.31.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
315
+ "transformer.h.31.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
316
+ "transformer.h.31.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
317
+ "transformer.h.31.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
318
+ "transformer.h.32.attn.c_attn.bias": "model-00001-of-00002.safetensors",
319
+ "transformer.h.32.attn.c_attn.weight": "model-00001-of-00002.safetensors",
320
+ "transformer.h.32.attn.c_proj.bias": "model-00001-of-00002.safetensors",
321
+ "transformer.h.32.attn.c_proj.weight": "model-00001-of-00002.safetensors",
322
+ "transformer.h.32.ln_1.bias": "model-00001-of-00002.safetensors",
323
+ "transformer.h.32.ln_1.weight": "model-00001-of-00002.safetensors",
324
+ "transformer.h.32.ln_2.bias": "model-00001-of-00002.safetensors",
325
+ "transformer.h.32.ln_2.weight": "model-00001-of-00002.safetensors",
326
+ "transformer.h.32.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
327
+ "transformer.h.32.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
328
+ "transformer.h.32.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
329
+ "transformer.h.32.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
330
+ "transformer.h.33.attn.c_attn.bias": "model-00001-of-00002.safetensors",
331
+ "transformer.h.33.attn.c_attn.weight": "model-00001-of-00002.safetensors",
332
+ "transformer.h.33.attn.c_proj.bias": "model-00001-of-00002.safetensors",
333
+ "transformer.h.33.attn.c_proj.weight": "model-00001-of-00002.safetensors",
334
+ "transformer.h.33.ln_1.bias": "model-00001-of-00002.safetensors",
335
+ "transformer.h.33.ln_1.weight": "model-00001-of-00002.safetensors",
336
+ "transformer.h.33.ln_2.bias": "model-00001-of-00002.safetensors",
337
+ "transformer.h.33.ln_2.weight": "model-00001-of-00002.safetensors",
338
+ "transformer.h.33.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
339
+ "transformer.h.33.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
340
+ "transformer.h.33.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
341
+ "transformer.h.33.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
342
+ "transformer.h.34.attn.c_attn.bias": "model-00001-of-00002.safetensors",
343
+ "transformer.h.34.attn.c_attn.weight": "model-00001-of-00002.safetensors",
344
+ "transformer.h.34.attn.c_proj.bias": "model-00001-of-00002.safetensors",
345
+ "transformer.h.34.attn.c_proj.weight": "model-00001-of-00002.safetensors",
346
+ "transformer.h.34.ln_1.bias": "model-00001-of-00002.safetensors",
347
+ "transformer.h.34.ln_1.weight": "model-00001-of-00002.safetensors",
348
+ "transformer.h.34.ln_2.bias": "model-00001-of-00002.safetensors",
349
+ "transformer.h.34.ln_2.weight": "model-00001-of-00002.safetensors",
350
+ "transformer.h.34.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
351
+ "transformer.h.34.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
352
+ "transformer.h.34.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
353
+ "transformer.h.34.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
354
+ "transformer.h.35.attn.c_attn.bias": "model-00001-of-00002.safetensors",
355
+ "transformer.h.35.attn.c_attn.weight": "model-00001-of-00002.safetensors",
356
+ "transformer.h.35.attn.c_proj.bias": "model-00001-of-00002.safetensors",
357
+ "transformer.h.35.attn.c_proj.weight": "model-00001-of-00002.safetensors",
358
+ "transformer.h.35.ln_1.bias": "model-00001-of-00002.safetensors",
359
+ "transformer.h.35.ln_1.weight": "model-00001-of-00002.safetensors",
360
+ "transformer.h.35.ln_2.bias": "model-00001-of-00002.safetensors",
361
+ "transformer.h.35.ln_2.weight": "model-00001-of-00002.safetensors",
362
+ "transformer.h.35.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
363
+ "transformer.h.35.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
364
+ "transformer.h.35.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
365
+ "transformer.h.35.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
366
+ "transformer.h.36.attn.c_attn.bias": "model-00001-of-00002.safetensors",
367
+ "transformer.h.36.attn.c_attn.weight": "model-00001-of-00002.safetensors",
368
+ "transformer.h.36.attn.c_proj.bias": "model-00001-of-00002.safetensors",
369
+ "transformer.h.36.attn.c_proj.weight": "model-00001-of-00002.safetensors",
370
+ "transformer.h.36.ln_1.bias": "model-00001-of-00002.safetensors",
371
+ "transformer.h.36.ln_1.weight": "model-00001-of-00002.safetensors",
372
+ "transformer.h.36.ln_2.bias": "model-00001-of-00002.safetensors",
373
+ "transformer.h.36.ln_2.weight": "model-00001-of-00002.safetensors",
374
+ "transformer.h.36.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
375
+ "transformer.h.36.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
376
+ "transformer.h.36.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
377
+ "transformer.h.36.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
378
+ "transformer.h.37.attn.c_attn.bias": "model-00001-of-00002.safetensors",
379
+ "transformer.h.37.attn.c_attn.weight": "model-00001-of-00002.safetensors",
380
+ "transformer.h.37.attn.c_proj.bias": "model-00001-of-00002.safetensors",
381
+ "transformer.h.37.attn.c_proj.weight": "model-00001-of-00002.safetensors",
382
+ "transformer.h.37.ln_1.bias": "model-00001-of-00002.safetensors",
383
+ "transformer.h.37.ln_1.weight": "model-00001-of-00002.safetensors",
384
+ "transformer.h.37.ln_2.bias": "model-00001-of-00002.safetensors",
385
+ "transformer.h.37.ln_2.weight": "model-00001-of-00002.safetensors",
386
+ "transformer.h.37.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
387
+ "transformer.h.37.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
388
+ "transformer.h.37.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
389
+ "transformer.h.37.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
390
+ "transformer.h.38.attn.c_attn.bias": "model-00002-of-00002.safetensors",
391
+ "transformer.h.38.attn.c_attn.weight": "model-00002-of-00002.safetensors",
392
+ "transformer.h.38.attn.c_proj.bias": "model-00002-of-00002.safetensors",
393
+ "transformer.h.38.attn.c_proj.weight": "model-00002-of-00002.safetensors",
394
+ "transformer.h.38.ln_1.bias": "model-00002-of-00002.safetensors",
395
+ "transformer.h.38.ln_1.weight": "model-00002-of-00002.safetensors",
396
+ "transformer.h.38.ln_2.bias": "model-00002-of-00002.safetensors",
397
+ "transformer.h.38.ln_2.weight": "model-00002-of-00002.safetensors",
398
+ "transformer.h.38.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
399
+ "transformer.h.38.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
400
+ "transformer.h.38.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
401
+ "transformer.h.38.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
402
+ "transformer.h.39.attn.c_attn.bias": "model-00002-of-00002.safetensors",
403
+ "transformer.h.39.attn.c_attn.weight": "model-00002-of-00002.safetensors",
404
+ "transformer.h.39.attn.c_proj.bias": "model-00002-of-00002.safetensors",
405
+ "transformer.h.39.attn.c_proj.weight": "model-00002-of-00002.safetensors",
406
+ "transformer.h.39.ln_1.bias": "model-00002-of-00002.safetensors",
407
+ "transformer.h.39.ln_1.weight": "model-00002-of-00002.safetensors",
408
+ "transformer.h.39.ln_2.bias": "model-00002-of-00002.safetensors",
409
+ "transformer.h.39.ln_2.weight": "model-00002-of-00002.safetensors",
410
+ "transformer.h.39.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
411
+ "transformer.h.39.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
412
+ "transformer.h.39.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
413
+ "transformer.h.39.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
414
+ "transformer.h.4.attn.c_attn.bias": "model-00001-of-00002.safetensors",
415
+ "transformer.h.4.attn.c_attn.weight": "model-00001-of-00002.safetensors",
416
+ "transformer.h.4.attn.c_proj.bias": "model-00001-of-00002.safetensors",
417
+ "transformer.h.4.attn.c_proj.weight": "model-00001-of-00002.safetensors",
418
+ "transformer.h.4.ln_1.bias": "model-00001-of-00002.safetensors",
419
+ "transformer.h.4.ln_1.weight": "model-00001-of-00002.safetensors",
420
+ "transformer.h.4.ln_2.bias": "model-00001-of-00002.safetensors",
421
+ "transformer.h.4.ln_2.weight": "model-00001-of-00002.safetensors",
422
+ "transformer.h.4.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
423
+ "transformer.h.4.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
424
+ "transformer.h.4.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
425
+ "transformer.h.4.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
426
+ "transformer.h.40.attn.c_attn.bias": "model-00002-of-00002.safetensors",
427
+ "transformer.h.40.attn.c_attn.weight": "model-00002-of-00002.safetensors",
428
+ "transformer.h.40.attn.c_proj.bias": "model-00002-of-00002.safetensors",
429
+ "transformer.h.40.attn.c_proj.weight": "model-00002-of-00002.safetensors",
430
+ "transformer.h.40.ln_1.bias": "model-00002-of-00002.safetensors",
431
+ "transformer.h.40.ln_1.weight": "model-00002-of-00002.safetensors",
432
+ "transformer.h.40.ln_2.bias": "model-00002-of-00002.safetensors",
433
+ "transformer.h.40.ln_2.weight": "model-00002-of-00002.safetensors",
434
+ "transformer.h.40.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
435
+ "transformer.h.40.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
436
+ "transformer.h.40.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
437
+ "transformer.h.40.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
438
+ "transformer.h.41.attn.c_attn.bias": "model-00002-of-00002.safetensors",
439
+ "transformer.h.41.attn.c_attn.weight": "model-00002-of-00002.safetensors",
440
+ "transformer.h.41.attn.c_proj.bias": "model-00002-of-00002.safetensors",
441
+ "transformer.h.41.attn.c_proj.weight": "model-00002-of-00002.safetensors",
442
+ "transformer.h.41.ln_1.bias": "model-00002-of-00002.safetensors",
443
+ "transformer.h.41.ln_1.weight": "model-00002-of-00002.safetensors",
444
+ "transformer.h.41.ln_2.bias": "model-00002-of-00002.safetensors",
445
+ "transformer.h.41.ln_2.weight": "model-00002-of-00002.safetensors",
446
+ "transformer.h.41.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
447
+ "transformer.h.41.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
448
+ "transformer.h.41.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
449
+ "transformer.h.41.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
450
+ "transformer.h.42.attn.c_attn.bias": "model-00002-of-00002.safetensors",
451
+ "transformer.h.42.attn.c_attn.weight": "model-00002-of-00002.safetensors",
452
+ "transformer.h.42.attn.c_proj.bias": "model-00002-of-00002.safetensors",
453
+ "transformer.h.42.attn.c_proj.weight": "model-00002-of-00002.safetensors",
454
+ "transformer.h.42.ln_1.bias": "model-00002-of-00002.safetensors",
455
+ "transformer.h.42.ln_1.weight": "model-00002-of-00002.safetensors",
456
+ "transformer.h.42.ln_2.bias": "model-00002-of-00002.safetensors",
457
+ "transformer.h.42.ln_2.weight": "model-00002-of-00002.safetensors",
458
+ "transformer.h.42.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
459
+ "transformer.h.42.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
460
+ "transformer.h.42.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
461
+ "transformer.h.42.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
462
+ "transformer.h.43.attn.c_attn.bias": "model-00002-of-00002.safetensors",
463
+ "transformer.h.43.attn.c_attn.weight": "model-00002-of-00002.safetensors",
464
+ "transformer.h.43.attn.c_proj.bias": "model-00002-of-00002.safetensors",
465
+ "transformer.h.43.attn.c_proj.weight": "model-00002-of-00002.safetensors",
466
+ "transformer.h.43.ln_1.bias": "model-00002-of-00002.safetensors",
467
+ "transformer.h.43.ln_1.weight": "model-00002-of-00002.safetensors",
468
+ "transformer.h.43.ln_2.bias": "model-00002-of-00002.safetensors",
469
+ "transformer.h.43.ln_2.weight": "model-00002-of-00002.safetensors",
470
+ "transformer.h.43.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
471
+ "transformer.h.43.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
472
+ "transformer.h.43.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
473
+ "transformer.h.43.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
474
+ "transformer.h.44.attn.c_attn.bias": "model-00002-of-00002.safetensors",
475
+ "transformer.h.44.attn.c_attn.weight": "model-00002-of-00002.safetensors",
476
+ "transformer.h.44.attn.c_proj.bias": "model-00002-of-00002.safetensors",
477
+ "transformer.h.44.attn.c_proj.weight": "model-00002-of-00002.safetensors",
478
+ "transformer.h.44.ln_1.bias": "model-00002-of-00002.safetensors",
479
+ "transformer.h.44.ln_1.weight": "model-00002-of-00002.safetensors",
480
+ "transformer.h.44.ln_2.bias": "model-00002-of-00002.safetensors",
481
+ "transformer.h.44.ln_2.weight": "model-00002-of-00002.safetensors",
482
+ "transformer.h.44.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
483
+ "transformer.h.44.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
484
+ "transformer.h.44.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
485
+ "transformer.h.44.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
486
+ "transformer.h.45.attn.c_attn.bias": "model-00002-of-00002.safetensors",
487
+ "transformer.h.45.attn.c_attn.weight": "model-00002-of-00002.safetensors",
488
+ "transformer.h.45.attn.c_proj.bias": "model-00002-of-00002.safetensors",
489
+ "transformer.h.45.attn.c_proj.weight": "model-00002-of-00002.safetensors",
490
+ "transformer.h.45.ln_1.bias": "model-00002-of-00002.safetensors",
491
+ "transformer.h.45.ln_1.weight": "model-00002-of-00002.safetensors",
492
+ "transformer.h.45.ln_2.bias": "model-00002-of-00002.safetensors",
493
+ "transformer.h.45.ln_2.weight": "model-00002-of-00002.safetensors",
494
+ "transformer.h.45.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
495
+ "transformer.h.45.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
496
+ "transformer.h.45.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
497
+ "transformer.h.45.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
498
+ "transformer.h.46.attn.c_attn.bias": "model-00002-of-00002.safetensors",
499
+ "transformer.h.46.attn.c_attn.weight": "model-00002-of-00002.safetensors",
500
+ "transformer.h.46.attn.c_proj.bias": "model-00002-of-00002.safetensors",
501
+ "transformer.h.46.attn.c_proj.weight": "model-00002-of-00002.safetensors",
502
+ "transformer.h.46.ln_1.bias": "model-00002-of-00002.safetensors",
503
+ "transformer.h.46.ln_1.weight": "model-00002-of-00002.safetensors",
504
+ "transformer.h.46.ln_2.bias": "model-00002-of-00002.safetensors",
505
+ "transformer.h.46.ln_2.weight": "model-00002-of-00002.safetensors",
506
+ "transformer.h.46.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
507
+ "transformer.h.46.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
508
+ "transformer.h.46.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
509
+ "transformer.h.46.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
510
+ "transformer.h.47.attn.c_attn.bias": "model-00002-of-00002.safetensors",
511
+ "transformer.h.47.attn.c_attn.weight": "model-00002-of-00002.safetensors",
512
+ "transformer.h.47.attn.c_proj.bias": "model-00002-of-00002.safetensors",
513
+ "transformer.h.47.attn.c_proj.weight": "model-00002-of-00002.safetensors",
514
+ "transformer.h.47.ln_1.bias": "model-00002-of-00002.safetensors",
515
+ "transformer.h.47.ln_1.weight": "model-00002-of-00002.safetensors",
516
+ "transformer.h.47.ln_2.bias": "model-00002-of-00002.safetensors",
517
+ "transformer.h.47.ln_2.weight": "model-00002-of-00002.safetensors",
518
+ "transformer.h.47.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
519
+ "transformer.h.47.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
520
+ "transformer.h.47.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
521
+ "transformer.h.47.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
522
+ "transformer.h.5.attn.c_attn.bias": "model-00001-of-00002.safetensors",
523
+ "transformer.h.5.attn.c_attn.weight": "model-00001-of-00002.safetensors",
524
+ "transformer.h.5.attn.c_proj.bias": "model-00001-of-00002.safetensors",
525
+ "transformer.h.5.attn.c_proj.weight": "model-00001-of-00002.safetensors",
526
+ "transformer.h.5.ln_1.bias": "model-00001-of-00002.safetensors",
527
+ "transformer.h.5.ln_1.weight": "model-00001-of-00002.safetensors",
528
+ "transformer.h.5.ln_2.bias": "model-00001-of-00002.safetensors",
529
+ "transformer.h.5.ln_2.weight": "model-00001-of-00002.safetensors",
530
+ "transformer.h.5.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
531
+ "transformer.h.5.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
532
+ "transformer.h.5.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
533
+ "transformer.h.5.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
534
+ "transformer.h.6.attn.c_attn.bias": "model-00001-of-00002.safetensors",
535
+ "transformer.h.6.attn.c_attn.weight": "model-00001-of-00002.safetensors",
536
+ "transformer.h.6.attn.c_proj.bias": "model-00001-of-00002.safetensors",
537
+ "transformer.h.6.attn.c_proj.weight": "model-00001-of-00002.safetensors",
538
+ "transformer.h.6.ln_1.bias": "model-00001-of-00002.safetensors",
539
+ "transformer.h.6.ln_1.weight": "model-00001-of-00002.safetensors",
540
+ "transformer.h.6.ln_2.bias": "model-00001-of-00002.safetensors",
541
+ "transformer.h.6.ln_2.weight": "model-00001-of-00002.safetensors",
542
+ "transformer.h.6.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
543
+ "transformer.h.6.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
544
+ "transformer.h.6.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
545
+ "transformer.h.6.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
546
+ "transformer.h.7.attn.c_attn.bias": "model-00001-of-00002.safetensors",
547
+ "transformer.h.7.attn.c_attn.weight": "model-00001-of-00002.safetensors",
548
+ "transformer.h.7.attn.c_proj.bias": "model-00001-of-00002.safetensors",
549
+ "transformer.h.7.attn.c_proj.weight": "model-00001-of-00002.safetensors",
550
+ "transformer.h.7.ln_1.bias": "model-00001-of-00002.safetensors",
551
+ "transformer.h.7.ln_1.weight": "model-00001-of-00002.safetensors",
552
+ "transformer.h.7.ln_2.bias": "model-00001-of-00002.safetensors",
553
+ "transformer.h.7.ln_2.weight": "model-00001-of-00002.safetensors",
554
+ "transformer.h.7.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
555
+ "transformer.h.7.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
556
+ "transformer.h.7.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
557
+ "transformer.h.7.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
558
+ "transformer.h.8.attn.c_attn.bias": "model-00001-of-00002.safetensors",
559
+ "transformer.h.8.attn.c_attn.weight": "model-00001-of-00002.safetensors",
560
+ "transformer.h.8.attn.c_proj.bias": "model-00001-of-00002.safetensors",
561
+ "transformer.h.8.attn.c_proj.weight": "model-00001-of-00002.safetensors",
562
+ "transformer.h.8.ln_1.bias": "model-00001-of-00002.safetensors",
563
+ "transformer.h.8.ln_1.weight": "model-00001-of-00002.safetensors",
564
+ "transformer.h.8.ln_2.bias": "model-00001-of-00002.safetensors",
565
+ "transformer.h.8.ln_2.weight": "model-00001-of-00002.safetensors",
566
+ "transformer.h.8.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
567
+ "transformer.h.8.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
568
+ "transformer.h.8.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
569
+ "transformer.h.8.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
570
+ "transformer.h.9.attn.c_attn.bias": "model-00001-of-00002.safetensors",
571
+ "transformer.h.9.attn.c_attn.weight": "model-00001-of-00002.safetensors",
572
+ "transformer.h.9.attn.c_proj.bias": "model-00001-of-00002.safetensors",
573
+ "transformer.h.9.attn.c_proj.weight": "model-00001-of-00002.safetensors",
574
+ "transformer.h.9.ln_1.bias": "model-00001-of-00002.safetensors",
575
+ "transformer.h.9.ln_1.weight": "model-00001-of-00002.safetensors",
576
+ "transformer.h.9.ln_2.bias": "model-00001-of-00002.safetensors",
577
+ "transformer.h.9.ln_2.weight": "model-00001-of-00002.safetensors",
578
+ "transformer.h.9.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
579
+ "transformer.h.9.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
580
+ "transformer.h.9.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
581
+ "transformer.h.9.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
582
+ "transformer.ln_f.bias": "model-00002-of-00002.safetensors",
583
+ "transformer.ln_f.weight": "model-00002-of-00002.safetensors",
584
+ "transformer.wpe.weight": "model-00001-of-00002.safetensors",
585
+ "transformer.wte.weight": "model-00001-of-00002.safetensors"
586
+ }
587
+ }
epoch1/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>",
5
+ "unk_token": "<|endoftext|>"
6
+ }
epoch1/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
epoch1/tokenizer_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ }
12
+ },
13
+ "bos_token": "<|endoftext|>",
14
+ "clean_up_tokenization_spaces": false,
15
+ "eos_token": "<|endoftext|>",
16
+ "extra_special_tokens": {},
17
+ "model_max_length": 1024,
18
+ "pad_token": "<|endoftext|>",
19
+ "padding_side": "left",
20
+ "tokenizer_class": "GPT2Tokenizer",
21
+ "unk_token": "<|endoftext|>"
22
+ }
epoch1/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f14f3c6d299bfb369a2106aab54c59a032e03c366e1d1fecdcf02f954b66a25b
3
+ size 5624
epoch1/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.49.0"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45eaf1bc5dcfd0b4839330d6f467f69caab9ee368654c5c6222c59406e7cd79a
3
+ size 4959881464
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0918a2b866bcf0560ffd7519aed3b03d19320f777c8b7792408b6e5d65c5da2b
3
+ size 1270624096
model.safetensors.index.json ADDED
@@ -0,0 +1,587 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 6230444800
4
+ },
5
+ "weight_map": {
6
+ "transformer.h.0.attn.c_attn.bias": "model-00001-of-00002.safetensors",
7
+ "transformer.h.0.attn.c_attn.weight": "model-00001-of-00002.safetensors",
8
+ "transformer.h.0.attn.c_proj.bias": "model-00001-of-00002.safetensors",
9
+ "transformer.h.0.attn.c_proj.weight": "model-00001-of-00002.safetensors",
10
+ "transformer.h.0.ln_1.bias": "model-00001-of-00002.safetensors",
11
+ "transformer.h.0.ln_1.weight": "model-00001-of-00002.safetensors",
12
+ "transformer.h.0.ln_2.bias": "model-00001-of-00002.safetensors",
13
+ "transformer.h.0.ln_2.weight": "model-00001-of-00002.safetensors",
14
+ "transformer.h.0.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
15
+ "transformer.h.0.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
16
+ "transformer.h.0.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
17
+ "transformer.h.0.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
18
+ "transformer.h.1.attn.c_attn.bias": "model-00001-of-00002.safetensors",
19
+ "transformer.h.1.attn.c_attn.weight": "model-00001-of-00002.safetensors",
20
+ "transformer.h.1.attn.c_proj.bias": "model-00001-of-00002.safetensors",
21
+ "transformer.h.1.attn.c_proj.weight": "model-00001-of-00002.safetensors",
22
+ "transformer.h.1.ln_1.bias": "model-00001-of-00002.safetensors",
23
+ "transformer.h.1.ln_1.weight": "model-00001-of-00002.safetensors",
24
+ "transformer.h.1.ln_2.bias": "model-00001-of-00002.safetensors",
25
+ "transformer.h.1.ln_2.weight": "model-00001-of-00002.safetensors",
26
+ "transformer.h.1.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
27
+ "transformer.h.1.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
28
+ "transformer.h.1.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
29
+ "transformer.h.1.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
30
+ "transformer.h.10.attn.c_attn.bias": "model-00001-of-00002.safetensors",
31
+ "transformer.h.10.attn.c_attn.weight": "model-00001-of-00002.safetensors",
32
+ "transformer.h.10.attn.c_proj.bias": "model-00001-of-00002.safetensors",
33
+ "transformer.h.10.attn.c_proj.weight": "model-00001-of-00002.safetensors",
34
+ "transformer.h.10.ln_1.bias": "model-00001-of-00002.safetensors",
35
+ "transformer.h.10.ln_1.weight": "model-00001-of-00002.safetensors",
36
+ "transformer.h.10.ln_2.bias": "model-00001-of-00002.safetensors",
37
+ "transformer.h.10.ln_2.weight": "model-00001-of-00002.safetensors",
38
+ "transformer.h.10.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
39
+ "transformer.h.10.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
40
+ "transformer.h.10.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
41
+ "transformer.h.10.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
42
+ "transformer.h.11.attn.c_attn.bias": "model-00001-of-00002.safetensors",
43
+ "transformer.h.11.attn.c_attn.weight": "model-00001-of-00002.safetensors",
44
+ "transformer.h.11.attn.c_proj.bias": "model-00001-of-00002.safetensors",
45
+ "transformer.h.11.attn.c_proj.weight": "model-00001-of-00002.safetensors",
46
+ "transformer.h.11.ln_1.bias": "model-00001-of-00002.safetensors",
47
+ "transformer.h.11.ln_1.weight": "model-00001-of-00002.safetensors",
48
+ "transformer.h.11.ln_2.bias": "model-00001-of-00002.safetensors",
49
+ "transformer.h.11.ln_2.weight": "model-00001-of-00002.safetensors",
50
+ "transformer.h.11.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
51
+ "transformer.h.11.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
52
+ "transformer.h.11.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
53
+ "transformer.h.11.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
54
+ "transformer.h.12.attn.c_attn.bias": "model-00001-of-00002.safetensors",
55
+ "transformer.h.12.attn.c_attn.weight": "model-00001-of-00002.safetensors",
56
+ "transformer.h.12.attn.c_proj.bias": "model-00001-of-00002.safetensors",
57
+ "transformer.h.12.attn.c_proj.weight": "model-00001-of-00002.safetensors",
58
+ "transformer.h.12.ln_1.bias": "model-00001-of-00002.safetensors",
59
+ "transformer.h.12.ln_1.weight": "model-00001-of-00002.safetensors",
60
+ "transformer.h.12.ln_2.bias": "model-00001-of-00002.safetensors",
61
+ "transformer.h.12.ln_2.weight": "model-00001-of-00002.safetensors",
62
+ "transformer.h.12.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
63
+ "transformer.h.12.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
64
+ "transformer.h.12.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
65
+ "transformer.h.12.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
66
+ "transformer.h.13.attn.c_attn.bias": "model-00001-of-00002.safetensors",
67
+ "transformer.h.13.attn.c_attn.weight": "model-00001-of-00002.safetensors",
68
+ "transformer.h.13.attn.c_proj.bias": "model-00001-of-00002.safetensors",
69
+ "transformer.h.13.attn.c_proj.weight": "model-00001-of-00002.safetensors",
70
+ "transformer.h.13.ln_1.bias": "model-00001-of-00002.safetensors",
71
+ "transformer.h.13.ln_1.weight": "model-00001-of-00002.safetensors",
72
+ "transformer.h.13.ln_2.bias": "model-00001-of-00002.safetensors",
73
+ "transformer.h.13.ln_2.weight": "model-00001-of-00002.safetensors",
74
+ "transformer.h.13.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
75
+ "transformer.h.13.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
76
+ "transformer.h.13.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
77
+ "transformer.h.13.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
78
+ "transformer.h.14.attn.c_attn.bias": "model-00001-of-00002.safetensors",
79
+ "transformer.h.14.attn.c_attn.weight": "model-00001-of-00002.safetensors",
80
+ "transformer.h.14.attn.c_proj.bias": "model-00001-of-00002.safetensors",
81
+ "transformer.h.14.attn.c_proj.weight": "model-00001-of-00002.safetensors",
82
+ "transformer.h.14.ln_1.bias": "model-00001-of-00002.safetensors",
83
+ "transformer.h.14.ln_1.weight": "model-00001-of-00002.safetensors",
84
+ "transformer.h.14.ln_2.bias": "model-00001-of-00002.safetensors",
85
+ "transformer.h.14.ln_2.weight": "model-00001-of-00002.safetensors",
86
+ "transformer.h.14.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
87
+ "transformer.h.14.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
88
+ "transformer.h.14.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
89
+ "transformer.h.14.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
90
+ "transformer.h.15.attn.c_attn.bias": "model-00001-of-00002.safetensors",
91
+ "transformer.h.15.attn.c_attn.weight": "model-00001-of-00002.safetensors",
92
+ "transformer.h.15.attn.c_proj.bias": "model-00001-of-00002.safetensors",
93
+ "transformer.h.15.attn.c_proj.weight": "model-00001-of-00002.safetensors",
94
+ "transformer.h.15.ln_1.bias": "model-00001-of-00002.safetensors",
95
+ "transformer.h.15.ln_1.weight": "model-00001-of-00002.safetensors",
96
+ "transformer.h.15.ln_2.bias": "model-00001-of-00002.safetensors",
97
+ "transformer.h.15.ln_2.weight": "model-00001-of-00002.safetensors",
98
+ "transformer.h.15.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
99
+ "transformer.h.15.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
100
+ "transformer.h.15.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
101
+ "transformer.h.15.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
102
+ "transformer.h.16.attn.c_attn.bias": "model-00001-of-00002.safetensors",
103
+ "transformer.h.16.attn.c_attn.weight": "model-00001-of-00002.safetensors",
104
+ "transformer.h.16.attn.c_proj.bias": "model-00001-of-00002.safetensors",
105
+ "transformer.h.16.attn.c_proj.weight": "model-00001-of-00002.safetensors",
106
+ "transformer.h.16.ln_1.bias": "model-00001-of-00002.safetensors",
107
+ "transformer.h.16.ln_1.weight": "model-00001-of-00002.safetensors",
108
+ "transformer.h.16.ln_2.bias": "model-00001-of-00002.safetensors",
109
+ "transformer.h.16.ln_2.weight": "model-00001-of-00002.safetensors",
110
+ "transformer.h.16.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
111
+ "transformer.h.16.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
112
+ "transformer.h.16.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
113
+ "transformer.h.16.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
114
+ "transformer.h.17.attn.c_attn.bias": "model-00001-of-00002.safetensors",
115
+ "transformer.h.17.attn.c_attn.weight": "model-00001-of-00002.safetensors",
116
+ "transformer.h.17.attn.c_proj.bias": "model-00001-of-00002.safetensors",
117
+ "transformer.h.17.attn.c_proj.weight": "model-00001-of-00002.safetensors",
118
+ "transformer.h.17.ln_1.bias": "model-00001-of-00002.safetensors",
119
+ "transformer.h.17.ln_1.weight": "model-00001-of-00002.safetensors",
120
+ "transformer.h.17.ln_2.bias": "model-00001-of-00002.safetensors",
121
+ "transformer.h.17.ln_2.weight": "model-00001-of-00002.safetensors",
122
+ "transformer.h.17.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
123
+ "transformer.h.17.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
124
+ "transformer.h.17.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
125
+ "transformer.h.17.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
126
+ "transformer.h.18.attn.c_attn.bias": "model-00001-of-00002.safetensors",
127
+ "transformer.h.18.attn.c_attn.weight": "model-00001-of-00002.safetensors",
128
+ "transformer.h.18.attn.c_proj.bias": "model-00001-of-00002.safetensors",
129
+ "transformer.h.18.attn.c_proj.weight": "model-00001-of-00002.safetensors",
130
+ "transformer.h.18.ln_1.bias": "model-00001-of-00002.safetensors",
131
+ "transformer.h.18.ln_1.weight": "model-00001-of-00002.safetensors",
132
+ "transformer.h.18.ln_2.bias": "model-00001-of-00002.safetensors",
133
+ "transformer.h.18.ln_2.weight": "model-00001-of-00002.safetensors",
134
+ "transformer.h.18.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
135
+ "transformer.h.18.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
136
+ "transformer.h.18.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
137
+ "transformer.h.18.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
138
+ "transformer.h.19.attn.c_attn.bias": "model-00001-of-00002.safetensors",
139
+ "transformer.h.19.attn.c_attn.weight": "model-00001-of-00002.safetensors",
140
+ "transformer.h.19.attn.c_proj.bias": "model-00001-of-00002.safetensors",
141
+ "transformer.h.19.attn.c_proj.weight": "model-00001-of-00002.safetensors",
142
+ "transformer.h.19.ln_1.bias": "model-00001-of-00002.safetensors",
143
+ "transformer.h.19.ln_1.weight": "model-00001-of-00002.safetensors",
144
+ "transformer.h.19.ln_2.bias": "model-00001-of-00002.safetensors",
145
+ "transformer.h.19.ln_2.weight": "model-00001-of-00002.safetensors",
146
+ "transformer.h.19.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
147
+ "transformer.h.19.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
148
+ "transformer.h.19.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
149
+ "transformer.h.19.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
150
+ "transformer.h.2.attn.c_attn.bias": "model-00001-of-00002.safetensors",
151
+ "transformer.h.2.attn.c_attn.weight": "model-00001-of-00002.safetensors",
152
+ "transformer.h.2.attn.c_proj.bias": "model-00001-of-00002.safetensors",
153
+ "transformer.h.2.attn.c_proj.weight": "model-00001-of-00002.safetensors",
154
+ "transformer.h.2.ln_1.bias": "model-00001-of-00002.safetensors",
155
+ "transformer.h.2.ln_1.weight": "model-00001-of-00002.safetensors",
156
+ "transformer.h.2.ln_2.bias": "model-00001-of-00002.safetensors",
157
+ "transformer.h.2.ln_2.weight": "model-00001-of-00002.safetensors",
158
+ "transformer.h.2.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
159
+ "transformer.h.2.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
160
+ "transformer.h.2.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
161
+ "transformer.h.2.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
162
+ "transformer.h.20.attn.c_attn.bias": "model-00001-of-00002.safetensors",
163
+ "transformer.h.20.attn.c_attn.weight": "model-00001-of-00002.safetensors",
164
+ "transformer.h.20.attn.c_proj.bias": "model-00001-of-00002.safetensors",
165
+ "transformer.h.20.attn.c_proj.weight": "model-00001-of-00002.safetensors",
166
+ "transformer.h.20.ln_1.bias": "model-00001-of-00002.safetensors",
167
+ "transformer.h.20.ln_1.weight": "model-00001-of-00002.safetensors",
168
+ "transformer.h.20.ln_2.bias": "model-00001-of-00002.safetensors",
169
+ "transformer.h.20.ln_2.weight": "model-00001-of-00002.safetensors",
170
+ "transformer.h.20.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
171
+ "transformer.h.20.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
172
+ "transformer.h.20.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
173
+ "transformer.h.20.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
174
+ "transformer.h.21.attn.c_attn.bias": "model-00001-of-00002.safetensors",
175
+ "transformer.h.21.attn.c_attn.weight": "model-00001-of-00002.safetensors",
176
+ "transformer.h.21.attn.c_proj.bias": "model-00001-of-00002.safetensors",
177
+ "transformer.h.21.attn.c_proj.weight": "model-00001-of-00002.safetensors",
178
+ "transformer.h.21.ln_1.bias": "model-00001-of-00002.safetensors",
179
+ "transformer.h.21.ln_1.weight": "model-00001-of-00002.safetensors",
180
+ "transformer.h.21.ln_2.bias": "model-00001-of-00002.safetensors",
181
+ "transformer.h.21.ln_2.weight": "model-00001-of-00002.safetensors",
182
+ "transformer.h.21.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
183
+ "transformer.h.21.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
184
+ "transformer.h.21.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
185
+ "transformer.h.21.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
186
+ "transformer.h.22.attn.c_attn.bias": "model-00001-of-00002.safetensors",
187
+ "transformer.h.22.attn.c_attn.weight": "model-00001-of-00002.safetensors",
188
+ "transformer.h.22.attn.c_proj.bias": "model-00001-of-00002.safetensors",
189
+ "transformer.h.22.attn.c_proj.weight": "model-00001-of-00002.safetensors",
190
+ "transformer.h.22.ln_1.bias": "model-00001-of-00002.safetensors",
191
+ "transformer.h.22.ln_1.weight": "model-00001-of-00002.safetensors",
192
+ "transformer.h.22.ln_2.bias": "model-00001-of-00002.safetensors",
193
+ "transformer.h.22.ln_2.weight": "model-00001-of-00002.safetensors",
194
+ "transformer.h.22.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
195
+ "transformer.h.22.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
196
+ "transformer.h.22.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
197
+ "transformer.h.22.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
198
+ "transformer.h.23.attn.c_attn.bias": "model-00001-of-00002.safetensors",
199
+ "transformer.h.23.attn.c_attn.weight": "model-00001-of-00002.safetensors",
200
+ "transformer.h.23.attn.c_proj.bias": "model-00001-of-00002.safetensors",
201
+ "transformer.h.23.attn.c_proj.weight": "model-00001-of-00002.safetensors",
202
+ "transformer.h.23.ln_1.bias": "model-00001-of-00002.safetensors",
203
+ "transformer.h.23.ln_1.weight": "model-00001-of-00002.safetensors",
204
+ "transformer.h.23.ln_2.bias": "model-00001-of-00002.safetensors",
205
+ "transformer.h.23.ln_2.weight": "model-00001-of-00002.safetensors",
206
+ "transformer.h.23.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
207
+ "transformer.h.23.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
208
+ "transformer.h.23.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
209
+ "transformer.h.23.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
210
+ "transformer.h.24.attn.c_attn.bias": "model-00001-of-00002.safetensors",
211
+ "transformer.h.24.attn.c_attn.weight": "model-00001-of-00002.safetensors",
212
+ "transformer.h.24.attn.c_proj.bias": "model-00001-of-00002.safetensors",
213
+ "transformer.h.24.attn.c_proj.weight": "model-00001-of-00002.safetensors",
214
+ "transformer.h.24.ln_1.bias": "model-00001-of-00002.safetensors",
215
+ "transformer.h.24.ln_1.weight": "model-00001-of-00002.safetensors",
216
+ "transformer.h.24.ln_2.bias": "model-00001-of-00002.safetensors",
217
+ "transformer.h.24.ln_2.weight": "model-00001-of-00002.safetensors",
218
+ "transformer.h.24.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
219
+ "transformer.h.24.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
220
+ "transformer.h.24.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
221
+ "transformer.h.24.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
222
+ "transformer.h.25.attn.c_attn.bias": "model-00001-of-00002.safetensors",
223
+ "transformer.h.25.attn.c_attn.weight": "model-00001-of-00002.safetensors",
224
+ "transformer.h.25.attn.c_proj.bias": "model-00001-of-00002.safetensors",
225
+ "transformer.h.25.attn.c_proj.weight": "model-00001-of-00002.safetensors",
226
+ "transformer.h.25.ln_1.bias": "model-00001-of-00002.safetensors",
227
+ "transformer.h.25.ln_1.weight": "model-00001-of-00002.safetensors",
228
+ "transformer.h.25.ln_2.bias": "model-00001-of-00002.safetensors",
229
+ "transformer.h.25.ln_2.weight": "model-00001-of-00002.safetensors",
230
+ "transformer.h.25.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
231
+ "transformer.h.25.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
232
+ "transformer.h.25.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
233
+ "transformer.h.25.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
234
+ "transformer.h.26.attn.c_attn.bias": "model-00001-of-00002.safetensors",
235
+ "transformer.h.26.attn.c_attn.weight": "model-00001-of-00002.safetensors",
236
+ "transformer.h.26.attn.c_proj.bias": "model-00001-of-00002.safetensors",
237
+ "transformer.h.26.attn.c_proj.weight": "model-00001-of-00002.safetensors",
238
+ "transformer.h.26.ln_1.bias": "model-00001-of-00002.safetensors",
239
+ "transformer.h.26.ln_1.weight": "model-00001-of-00002.safetensors",
240
+ "transformer.h.26.ln_2.bias": "model-00001-of-00002.safetensors",
241
+ "transformer.h.26.ln_2.weight": "model-00001-of-00002.safetensors",
242
+ "transformer.h.26.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
243
+ "transformer.h.26.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
244
+ "transformer.h.26.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
245
+ "transformer.h.26.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
246
+ "transformer.h.27.attn.c_attn.bias": "model-00001-of-00002.safetensors",
247
+ "transformer.h.27.attn.c_attn.weight": "model-00001-of-00002.safetensors",
248
+ "transformer.h.27.attn.c_proj.bias": "model-00001-of-00002.safetensors",
249
+ "transformer.h.27.attn.c_proj.weight": "model-00001-of-00002.safetensors",
250
+ "transformer.h.27.ln_1.bias": "model-00001-of-00002.safetensors",
251
+ "transformer.h.27.ln_1.weight": "model-00001-of-00002.safetensors",
252
+ "transformer.h.27.ln_2.bias": "model-00001-of-00002.safetensors",
253
+ "transformer.h.27.ln_2.weight": "model-00001-of-00002.safetensors",
254
+ "transformer.h.27.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
255
+ "transformer.h.27.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
256
+ "transformer.h.27.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
257
+ "transformer.h.27.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
258
+ "transformer.h.28.attn.c_attn.bias": "model-00001-of-00002.safetensors",
259
+ "transformer.h.28.attn.c_attn.weight": "model-00001-of-00002.safetensors",
260
+ "transformer.h.28.attn.c_proj.bias": "model-00001-of-00002.safetensors",
261
+ "transformer.h.28.attn.c_proj.weight": "model-00001-of-00002.safetensors",
262
+ "transformer.h.28.ln_1.bias": "model-00001-of-00002.safetensors",
263
+ "transformer.h.28.ln_1.weight": "model-00001-of-00002.safetensors",
264
+ "transformer.h.28.ln_2.bias": "model-00001-of-00002.safetensors",
265
+ "transformer.h.28.ln_2.weight": "model-00001-of-00002.safetensors",
266
+ "transformer.h.28.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
267
+ "transformer.h.28.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
268
+ "transformer.h.28.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
269
+ "transformer.h.28.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
270
+ "transformer.h.29.attn.c_attn.bias": "model-00001-of-00002.safetensors",
271
+ "transformer.h.29.attn.c_attn.weight": "model-00001-of-00002.safetensors",
272
+ "transformer.h.29.attn.c_proj.bias": "model-00001-of-00002.safetensors",
273
+ "transformer.h.29.attn.c_proj.weight": "model-00001-of-00002.safetensors",
274
+ "transformer.h.29.ln_1.bias": "model-00001-of-00002.safetensors",
275
+ "transformer.h.29.ln_1.weight": "model-00001-of-00002.safetensors",
276
+ "transformer.h.29.ln_2.bias": "model-00001-of-00002.safetensors",
277
+ "transformer.h.29.ln_2.weight": "model-00001-of-00002.safetensors",
278
+ "transformer.h.29.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
279
+ "transformer.h.29.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
280
+ "transformer.h.29.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
281
+ "transformer.h.29.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
282
+ "transformer.h.3.attn.c_attn.bias": "model-00001-of-00002.safetensors",
283
+ "transformer.h.3.attn.c_attn.weight": "model-00001-of-00002.safetensors",
284
+ "transformer.h.3.attn.c_proj.bias": "model-00001-of-00002.safetensors",
285
+ "transformer.h.3.attn.c_proj.weight": "model-00001-of-00002.safetensors",
286
+ "transformer.h.3.ln_1.bias": "model-00001-of-00002.safetensors",
287
+ "transformer.h.3.ln_1.weight": "model-00001-of-00002.safetensors",
288
+ "transformer.h.3.ln_2.bias": "model-00001-of-00002.safetensors",
289
+ "transformer.h.3.ln_2.weight": "model-00001-of-00002.safetensors",
290
+ "transformer.h.3.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
291
+ "transformer.h.3.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
292
+ "transformer.h.3.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
293
+ "transformer.h.3.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
294
+ "transformer.h.30.attn.c_attn.bias": "model-00001-of-00002.safetensors",
295
+ "transformer.h.30.attn.c_attn.weight": "model-00001-of-00002.safetensors",
296
+ "transformer.h.30.attn.c_proj.bias": "model-00001-of-00002.safetensors",
297
+ "transformer.h.30.attn.c_proj.weight": "model-00001-of-00002.safetensors",
298
+ "transformer.h.30.ln_1.bias": "model-00001-of-00002.safetensors",
299
+ "transformer.h.30.ln_1.weight": "model-00001-of-00002.safetensors",
300
+ "transformer.h.30.ln_2.bias": "model-00001-of-00002.safetensors",
301
+ "transformer.h.30.ln_2.weight": "model-00001-of-00002.safetensors",
302
+ "transformer.h.30.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
303
+ "transformer.h.30.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
304
+ "transformer.h.30.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
305
+ "transformer.h.30.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
306
+ "transformer.h.31.attn.c_attn.bias": "model-00001-of-00002.safetensors",
307
+ "transformer.h.31.attn.c_attn.weight": "model-00001-of-00002.safetensors",
308
+ "transformer.h.31.attn.c_proj.bias": "model-00001-of-00002.safetensors",
309
+ "transformer.h.31.attn.c_proj.weight": "model-00001-of-00002.safetensors",
310
+ "transformer.h.31.ln_1.bias": "model-00001-of-00002.safetensors",
311
+ "transformer.h.31.ln_1.weight": "model-00001-of-00002.safetensors",
312
+ "transformer.h.31.ln_2.bias": "model-00001-of-00002.safetensors",
313
+ "transformer.h.31.ln_2.weight": "model-00001-of-00002.safetensors",
314
+ "transformer.h.31.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
315
+ "transformer.h.31.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
316
+ "transformer.h.31.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
317
+ "transformer.h.31.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
318
+ "transformer.h.32.attn.c_attn.bias": "model-00001-of-00002.safetensors",
319
+ "transformer.h.32.attn.c_attn.weight": "model-00001-of-00002.safetensors",
320
+ "transformer.h.32.attn.c_proj.bias": "model-00001-of-00002.safetensors",
321
+ "transformer.h.32.attn.c_proj.weight": "model-00001-of-00002.safetensors",
322
+ "transformer.h.32.ln_1.bias": "model-00001-of-00002.safetensors",
323
+ "transformer.h.32.ln_1.weight": "model-00001-of-00002.safetensors",
324
+ "transformer.h.32.ln_2.bias": "model-00001-of-00002.safetensors",
325
+ "transformer.h.32.ln_2.weight": "model-00001-of-00002.safetensors",
326
+ "transformer.h.32.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
327
+ "transformer.h.32.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
328
+ "transformer.h.32.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
329
+ "transformer.h.32.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
330
+ "transformer.h.33.attn.c_attn.bias": "model-00001-of-00002.safetensors",
331
+ "transformer.h.33.attn.c_attn.weight": "model-00001-of-00002.safetensors",
332
+ "transformer.h.33.attn.c_proj.bias": "model-00001-of-00002.safetensors",
333
+ "transformer.h.33.attn.c_proj.weight": "model-00001-of-00002.safetensors",
334
+ "transformer.h.33.ln_1.bias": "model-00001-of-00002.safetensors",
335
+ "transformer.h.33.ln_1.weight": "model-00001-of-00002.safetensors",
336
+ "transformer.h.33.ln_2.bias": "model-00001-of-00002.safetensors",
337
+ "transformer.h.33.ln_2.weight": "model-00001-of-00002.safetensors",
338
+ "transformer.h.33.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
339
+ "transformer.h.33.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
340
+ "transformer.h.33.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
341
+ "transformer.h.33.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
342
+ "transformer.h.34.attn.c_attn.bias": "model-00001-of-00002.safetensors",
343
+ "transformer.h.34.attn.c_attn.weight": "model-00001-of-00002.safetensors",
344
+ "transformer.h.34.attn.c_proj.bias": "model-00001-of-00002.safetensors",
345
+ "transformer.h.34.attn.c_proj.weight": "model-00001-of-00002.safetensors",
346
+ "transformer.h.34.ln_1.bias": "model-00001-of-00002.safetensors",
347
+ "transformer.h.34.ln_1.weight": "model-00001-of-00002.safetensors",
348
+ "transformer.h.34.ln_2.bias": "model-00001-of-00002.safetensors",
349
+ "transformer.h.34.ln_2.weight": "model-00001-of-00002.safetensors",
350
+ "transformer.h.34.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
351
+ "transformer.h.34.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
352
+ "transformer.h.34.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
353
+ "transformer.h.34.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
354
+ "transformer.h.35.attn.c_attn.bias": "model-00001-of-00002.safetensors",
355
+ "transformer.h.35.attn.c_attn.weight": "model-00001-of-00002.safetensors",
356
+ "transformer.h.35.attn.c_proj.bias": "model-00001-of-00002.safetensors",
357
+ "transformer.h.35.attn.c_proj.weight": "model-00001-of-00002.safetensors",
358
+ "transformer.h.35.ln_1.bias": "model-00001-of-00002.safetensors",
359
+ "transformer.h.35.ln_1.weight": "model-00001-of-00002.safetensors",
360
+ "transformer.h.35.ln_2.bias": "model-00001-of-00002.safetensors",
361
+ "transformer.h.35.ln_2.weight": "model-00001-of-00002.safetensors",
362
+ "transformer.h.35.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
363
+ "transformer.h.35.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
364
+ "transformer.h.35.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
365
+ "transformer.h.35.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
366
+ "transformer.h.36.attn.c_attn.bias": "model-00001-of-00002.safetensors",
367
+ "transformer.h.36.attn.c_attn.weight": "model-00001-of-00002.safetensors",
368
+ "transformer.h.36.attn.c_proj.bias": "model-00001-of-00002.safetensors",
369
+ "transformer.h.36.attn.c_proj.weight": "model-00001-of-00002.safetensors",
370
+ "transformer.h.36.ln_1.bias": "model-00001-of-00002.safetensors",
371
+ "transformer.h.36.ln_1.weight": "model-00001-of-00002.safetensors",
372
+ "transformer.h.36.ln_2.bias": "model-00001-of-00002.safetensors",
373
+ "transformer.h.36.ln_2.weight": "model-00001-of-00002.safetensors",
374
+ "transformer.h.36.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
375
+ "transformer.h.36.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
376
+ "transformer.h.36.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
377
+ "transformer.h.36.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
378
+ "transformer.h.37.attn.c_attn.bias": "model-00001-of-00002.safetensors",
379
+ "transformer.h.37.attn.c_attn.weight": "model-00001-of-00002.safetensors",
380
+ "transformer.h.37.attn.c_proj.bias": "model-00001-of-00002.safetensors",
381
+ "transformer.h.37.attn.c_proj.weight": "model-00001-of-00002.safetensors",
382
+ "transformer.h.37.ln_1.bias": "model-00001-of-00002.safetensors",
383
+ "transformer.h.37.ln_1.weight": "model-00001-of-00002.safetensors",
384
+ "transformer.h.37.ln_2.bias": "model-00001-of-00002.safetensors",
385
+ "transformer.h.37.ln_2.weight": "model-00001-of-00002.safetensors",
386
+ "transformer.h.37.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
387
+ "transformer.h.37.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
388
+ "transformer.h.37.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
389
+ "transformer.h.37.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
390
+ "transformer.h.38.attn.c_attn.bias": "model-00002-of-00002.safetensors",
391
+ "transformer.h.38.attn.c_attn.weight": "model-00002-of-00002.safetensors",
392
+ "transformer.h.38.attn.c_proj.bias": "model-00002-of-00002.safetensors",
393
+ "transformer.h.38.attn.c_proj.weight": "model-00002-of-00002.safetensors",
394
+ "transformer.h.38.ln_1.bias": "model-00002-of-00002.safetensors",
395
+ "transformer.h.38.ln_1.weight": "model-00002-of-00002.safetensors",
396
+ "transformer.h.38.ln_2.bias": "model-00002-of-00002.safetensors",
397
+ "transformer.h.38.ln_2.weight": "model-00002-of-00002.safetensors",
398
+ "transformer.h.38.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
399
+ "transformer.h.38.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
400
+ "transformer.h.38.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
401
+ "transformer.h.38.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
402
+ "transformer.h.39.attn.c_attn.bias": "model-00002-of-00002.safetensors",
403
+ "transformer.h.39.attn.c_attn.weight": "model-00002-of-00002.safetensors",
404
+ "transformer.h.39.attn.c_proj.bias": "model-00002-of-00002.safetensors",
405
+ "transformer.h.39.attn.c_proj.weight": "model-00002-of-00002.safetensors",
406
+ "transformer.h.39.ln_1.bias": "model-00002-of-00002.safetensors",
407
+ "transformer.h.39.ln_1.weight": "model-00002-of-00002.safetensors",
408
+ "transformer.h.39.ln_2.bias": "model-00002-of-00002.safetensors",
409
+ "transformer.h.39.ln_2.weight": "model-00002-of-00002.safetensors",
410
+ "transformer.h.39.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
411
+ "transformer.h.39.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
412
+ "transformer.h.39.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
413
+ "transformer.h.39.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
414
+ "transformer.h.4.attn.c_attn.bias": "model-00001-of-00002.safetensors",
415
+ "transformer.h.4.attn.c_attn.weight": "model-00001-of-00002.safetensors",
416
+ "transformer.h.4.attn.c_proj.bias": "model-00001-of-00002.safetensors",
417
+ "transformer.h.4.attn.c_proj.weight": "model-00001-of-00002.safetensors",
418
+ "transformer.h.4.ln_1.bias": "model-00001-of-00002.safetensors",
419
+ "transformer.h.4.ln_1.weight": "model-00001-of-00002.safetensors",
420
+ "transformer.h.4.ln_2.bias": "model-00001-of-00002.safetensors",
421
+ "transformer.h.4.ln_2.weight": "model-00001-of-00002.safetensors",
422
+ "transformer.h.4.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
423
+ "transformer.h.4.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
424
+ "transformer.h.4.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
425
+ "transformer.h.4.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
426
+ "transformer.h.40.attn.c_attn.bias": "model-00002-of-00002.safetensors",
427
+ "transformer.h.40.attn.c_attn.weight": "model-00002-of-00002.safetensors",
428
+ "transformer.h.40.attn.c_proj.bias": "model-00002-of-00002.safetensors",
429
+ "transformer.h.40.attn.c_proj.weight": "model-00002-of-00002.safetensors",
430
+ "transformer.h.40.ln_1.bias": "model-00002-of-00002.safetensors",
431
+ "transformer.h.40.ln_1.weight": "model-00002-of-00002.safetensors",
432
+ "transformer.h.40.ln_2.bias": "model-00002-of-00002.safetensors",
433
+ "transformer.h.40.ln_2.weight": "model-00002-of-00002.safetensors",
434
+ "transformer.h.40.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
435
+ "transformer.h.40.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
436
+ "transformer.h.40.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
437
+ "transformer.h.40.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
438
+ "transformer.h.41.attn.c_attn.bias": "model-00002-of-00002.safetensors",
439
+ "transformer.h.41.attn.c_attn.weight": "model-00002-of-00002.safetensors",
440
+ "transformer.h.41.attn.c_proj.bias": "model-00002-of-00002.safetensors",
441
+ "transformer.h.41.attn.c_proj.weight": "model-00002-of-00002.safetensors",
442
+ "transformer.h.41.ln_1.bias": "model-00002-of-00002.safetensors",
443
+ "transformer.h.41.ln_1.weight": "model-00002-of-00002.safetensors",
444
+ "transformer.h.41.ln_2.bias": "model-00002-of-00002.safetensors",
445
+ "transformer.h.41.ln_2.weight": "model-00002-of-00002.safetensors",
446
+ "transformer.h.41.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
447
+ "transformer.h.41.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
448
+ "transformer.h.41.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
449
+ "transformer.h.41.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
450
+ "transformer.h.42.attn.c_attn.bias": "model-00002-of-00002.safetensors",
451
+ "transformer.h.42.attn.c_attn.weight": "model-00002-of-00002.safetensors",
452
+ "transformer.h.42.attn.c_proj.bias": "model-00002-of-00002.safetensors",
453
+ "transformer.h.42.attn.c_proj.weight": "model-00002-of-00002.safetensors",
454
+ "transformer.h.42.ln_1.bias": "model-00002-of-00002.safetensors",
455
+ "transformer.h.42.ln_1.weight": "model-00002-of-00002.safetensors",
456
+ "transformer.h.42.ln_2.bias": "model-00002-of-00002.safetensors",
457
+ "transformer.h.42.ln_2.weight": "model-00002-of-00002.safetensors",
458
+ "transformer.h.42.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
459
+ "transformer.h.42.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
460
+ "transformer.h.42.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
461
+ "transformer.h.42.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
462
+ "transformer.h.43.attn.c_attn.bias": "model-00002-of-00002.safetensors",
463
+ "transformer.h.43.attn.c_attn.weight": "model-00002-of-00002.safetensors",
464
+ "transformer.h.43.attn.c_proj.bias": "model-00002-of-00002.safetensors",
465
+ "transformer.h.43.attn.c_proj.weight": "model-00002-of-00002.safetensors",
466
+ "transformer.h.43.ln_1.bias": "model-00002-of-00002.safetensors",
467
+ "transformer.h.43.ln_1.weight": "model-00002-of-00002.safetensors",
468
+ "transformer.h.43.ln_2.bias": "model-00002-of-00002.safetensors",
469
+ "transformer.h.43.ln_2.weight": "model-00002-of-00002.safetensors",
470
+ "transformer.h.43.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
471
+ "transformer.h.43.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
472
+ "transformer.h.43.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
473
+ "transformer.h.43.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
474
+ "transformer.h.44.attn.c_attn.bias": "model-00002-of-00002.safetensors",
475
+ "transformer.h.44.attn.c_attn.weight": "model-00002-of-00002.safetensors",
476
+ "transformer.h.44.attn.c_proj.bias": "model-00002-of-00002.safetensors",
477
+ "transformer.h.44.attn.c_proj.weight": "model-00002-of-00002.safetensors",
478
+ "transformer.h.44.ln_1.bias": "model-00002-of-00002.safetensors",
479
+ "transformer.h.44.ln_1.weight": "model-00002-of-00002.safetensors",
480
+ "transformer.h.44.ln_2.bias": "model-00002-of-00002.safetensors",
481
+ "transformer.h.44.ln_2.weight": "model-00002-of-00002.safetensors",
482
+ "transformer.h.44.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
483
+ "transformer.h.44.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
484
+ "transformer.h.44.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
485
+ "transformer.h.44.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
486
+ "transformer.h.45.attn.c_attn.bias": "model-00002-of-00002.safetensors",
487
+ "transformer.h.45.attn.c_attn.weight": "model-00002-of-00002.safetensors",
488
+ "transformer.h.45.attn.c_proj.bias": "model-00002-of-00002.safetensors",
489
+ "transformer.h.45.attn.c_proj.weight": "model-00002-of-00002.safetensors",
490
+ "transformer.h.45.ln_1.bias": "model-00002-of-00002.safetensors",
491
+ "transformer.h.45.ln_1.weight": "model-00002-of-00002.safetensors",
492
+ "transformer.h.45.ln_2.bias": "model-00002-of-00002.safetensors",
493
+ "transformer.h.45.ln_2.weight": "model-00002-of-00002.safetensors",
494
+ "transformer.h.45.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
495
+ "transformer.h.45.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
496
+ "transformer.h.45.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
497
+ "transformer.h.45.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
498
+ "transformer.h.46.attn.c_attn.bias": "model-00002-of-00002.safetensors",
499
+ "transformer.h.46.attn.c_attn.weight": "model-00002-of-00002.safetensors",
500
+ "transformer.h.46.attn.c_proj.bias": "model-00002-of-00002.safetensors",
501
+ "transformer.h.46.attn.c_proj.weight": "model-00002-of-00002.safetensors",
502
+ "transformer.h.46.ln_1.bias": "model-00002-of-00002.safetensors",
503
+ "transformer.h.46.ln_1.weight": "model-00002-of-00002.safetensors",
504
+ "transformer.h.46.ln_2.bias": "model-00002-of-00002.safetensors",
505
+ "transformer.h.46.ln_2.weight": "model-00002-of-00002.safetensors",
506
+ "transformer.h.46.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
507
+ "transformer.h.46.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
508
+ "transformer.h.46.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
509
+ "transformer.h.46.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
510
+ "transformer.h.47.attn.c_attn.bias": "model-00002-of-00002.safetensors",
511
+ "transformer.h.47.attn.c_attn.weight": "model-00002-of-00002.safetensors",
512
+ "transformer.h.47.attn.c_proj.bias": "model-00002-of-00002.safetensors",
513
+ "transformer.h.47.attn.c_proj.weight": "model-00002-of-00002.safetensors",
514
+ "transformer.h.47.ln_1.bias": "model-00002-of-00002.safetensors",
515
+ "transformer.h.47.ln_1.weight": "model-00002-of-00002.safetensors",
516
+ "transformer.h.47.ln_2.bias": "model-00002-of-00002.safetensors",
517
+ "transformer.h.47.ln_2.weight": "model-00002-of-00002.safetensors",
518
+ "transformer.h.47.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
519
+ "transformer.h.47.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
520
+ "transformer.h.47.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
521
+ "transformer.h.47.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
522
+ "transformer.h.5.attn.c_attn.bias": "model-00001-of-00002.safetensors",
523
+ "transformer.h.5.attn.c_attn.weight": "model-00001-of-00002.safetensors",
524
+ "transformer.h.5.attn.c_proj.bias": "model-00001-of-00002.safetensors",
525
+ "transformer.h.5.attn.c_proj.weight": "model-00001-of-00002.safetensors",
526
+ "transformer.h.5.ln_1.bias": "model-00001-of-00002.safetensors",
527
+ "transformer.h.5.ln_1.weight": "model-00001-of-00002.safetensors",
528
+ "transformer.h.5.ln_2.bias": "model-00001-of-00002.safetensors",
529
+ "transformer.h.5.ln_2.weight": "model-00001-of-00002.safetensors",
530
+ "transformer.h.5.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
531
+ "transformer.h.5.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
532
+ "transformer.h.5.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
533
+ "transformer.h.5.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
534
+ "transformer.h.6.attn.c_attn.bias": "model-00001-of-00002.safetensors",
535
+ "transformer.h.6.attn.c_attn.weight": "model-00001-of-00002.safetensors",
536
+ "transformer.h.6.attn.c_proj.bias": "model-00001-of-00002.safetensors",
537
+ "transformer.h.6.attn.c_proj.weight": "model-00001-of-00002.safetensors",
538
+ "transformer.h.6.ln_1.bias": "model-00001-of-00002.safetensors",
539
+ "transformer.h.6.ln_1.weight": "model-00001-of-00002.safetensors",
540
+ "transformer.h.6.ln_2.bias": "model-00001-of-00002.safetensors",
541
+ "transformer.h.6.ln_2.weight": "model-00001-of-00002.safetensors",
542
+ "transformer.h.6.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
543
+ "transformer.h.6.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
544
+ "transformer.h.6.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
545
+ "transformer.h.6.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
546
+ "transformer.h.7.attn.c_attn.bias": "model-00001-of-00002.safetensors",
547
+ "transformer.h.7.attn.c_attn.weight": "model-00001-of-00002.safetensors",
548
+ "transformer.h.7.attn.c_proj.bias": "model-00001-of-00002.safetensors",
549
+ "transformer.h.7.attn.c_proj.weight": "model-00001-of-00002.safetensors",
550
+ "transformer.h.7.ln_1.bias": "model-00001-of-00002.safetensors",
551
+ "transformer.h.7.ln_1.weight": "model-00001-of-00002.safetensors",
552
+ "transformer.h.7.ln_2.bias": "model-00001-of-00002.safetensors",
553
+ "transformer.h.7.ln_2.weight": "model-00001-of-00002.safetensors",
554
+ "transformer.h.7.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
555
+ "transformer.h.7.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
556
+ "transformer.h.7.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
557
+ "transformer.h.7.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
558
+ "transformer.h.8.attn.c_attn.bias": "model-00001-of-00002.safetensors",
559
+ "transformer.h.8.attn.c_attn.weight": "model-00001-of-00002.safetensors",
560
+ "transformer.h.8.attn.c_proj.bias": "model-00001-of-00002.safetensors",
561
+ "transformer.h.8.attn.c_proj.weight": "model-00001-of-00002.safetensors",
562
+ "transformer.h.8.ln_1.bias": "model-00001-of-00002.safetensors",
563
+ "transformer.h.8.ln_1.weight": "model-00001-of-00002.safetensors",
564
+ "transformer.h.8.ln_2.bias": "model-00001-of-00002.safetensors",
565
+ "transformer.h.8.ln_2.weight": "model-00001-of-00002.safetensors",
566
+ "transformer.h.8.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
567
+ "transformer.h.8.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
568
+ "transformer.h.8.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
569
+ "transformer.h.8.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
570
+ "transformer.h.9.attn.c_attn.bias": "model-00001-of-00002.safetensors",
571
+ "transformer.h.9.attn.c_attn.weight": "model-00001-of-00002.safetensors",
572
+ "transformer.h.9.attn.c_proj.bias": "model-00001-of-00002.safetensors",
573
+ "transformer.h.9.attn.c_proj.weight": "model-00001-of-00002.safetensors",
574
+ "transformer.h.9.ln_1.bias": "model-00001-of-00002.safetensors",
575
+ "transformer.h.9.ln_1.weight": "model-00001-of-00002.safetensors",
576
+ "transformer.h.9.ln_2.bias": "model-00001-of-00002.safetensors",
577
+ "transformer.h.9.ln_2.weight": "model-00001-of-00002.safetensors",
578
+ "transformer.h.9.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
579
+ "transformer.h.9.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
580
+ "transformer.h.9.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
581
+ "transformer.h.9.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
582
+ "transformer.ln_f.bias": "model-00002-of-00002.safetensors",
583
+ "transformer.ln_f.weight": "model-00002-of-00002.safetensors",
584
+ "transformer.wpe.weight": "model-00001-of-00002.safetensors",
585
+ "transformer.wte.weight": "model-00001-of-00002.safetensors"
586
+ }
587
+ }
sft_pretrain_and_pushtohub.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ [2025-04-02 14:52:59,306][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -c /tmp/tmpwrka7koq/test.c -o /tmp/tmpwrka7koq/test.o
2
+ [2025-04-02 14:52:59,327][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat /tmp/tmpwrka7koq/test.o -laio -o /tmp/tmpwrka7koq/a.out
3
+ [2025-04-02 14:52:59,802][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -c /tmp/tmpbm8f7_mn/test.c -o /tmp/tmpbm8f7_mn/test.o
4
+ [2025-04-02 14:52:59,820][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat /tmp/tmpbm8f7_mn/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpbm8f7_mn/a.out
5
+ [2025-04-02 14:52:59,881][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -c /tmp/tmphrk9_0u6/test.c -o /tmp/tmphrk9_0u6/test.o
6
+ [2025-04-02 14:52:59,896][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat /tmp/tmphrk9_0u6/test.o -laio -o /tmp/tmphrk9_0u6/a.out
7
+ [2025-04-02 14:53:00,791][__main__][INFO] - *** Starting SFT training at 2025-04-02 14:53:00 ***
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>",
5
+ "unk_token": "<|endoftext|>"
6
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ }
12
+ },
13
+ "bos_token": "<|endoftext|>",
14
+ "clean_up_tokenization_spaces": false,
15
+ "eos_token": "<|endoftext|>",
16
+ "extra_special_tokens": {},
17
+ "model_max_length": 1024,
18
+ "pad_token": "<|endoftext|>",
19
+ "padding_side": "left",
20
+ "tokenizer_class": "GPT2Tokenizer",
21
+ "unk_token": "<|endoftext|>"
22
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f14f3c6d299bfb369a2106aab54c59a032e03c366e1d1fecdcf02f954b66a25b
3
+ size 5624
vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
wandb/debug-internal.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2025-04-02T14:52:46.998966444Z","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"/mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39/wandb/run-20250402_145246-e1n3xkh6/logs/debug-core.log"}
2
+ {"time":"2025-04-02T14:52:47.119833744Z","level":"INFO","msg":"created new stream","id":"e1n3xkh6"}
3
+ {"time":"2025-04-02T14:52:47.119882315Z","level":"INFO","msg":"stream: started","id":"e1n3xkh6"}
4
+ {"time":"2025-04-02T14:52:47.119921969Z","level":"INFO","msg":"handler: started","stream_id":"e1n3xkh6"}
5
+ {"time":"2025-04-02T14:52:47.119936867Z","level":"INFO","msg":"writer: Do: started","stream_id":"e1n3xkh6"}
6
+ {"time":"2025-04-02T14:52:47.120603401Z","level":"INFO","msg":"sender: started","stream_id":"e1n3xkh6"}
7
+ {"time":"2025-04-02T14:52:47.425038021Z","level":"INFO","msg":"Starting system monitor"}
wandb/debug.log ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_setup.py:_flush():67] Configure stats pid to 738
3
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_setup.py:_flush():67] Loading settings from /dlabscratch1/amani/.config/wandb/settings
4
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_setup.py:_flush():67] Loading settings from /mnt/dlabscratch1/amani/LLM-RL/wandb/settings
5
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:setup_run_log_directory():647] Logging user logs to /mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39/wandb/run-20250402_145246-e1n3xkh6/logs/debug.log
7
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to /mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39/wandb/run-20250402_145246-e1n3xkh6/logs/debug-internal.log
8
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:init():761] calling init triggers
9
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:init():784] starting backend
12
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:init():788] sending inform_init request
13
+ 2025-04-02 14:52:46,987 INFO MainThread:738 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-04-02 14:52:46,987 INFO MainThread:738 [wandb_init.py:init():798] backend started and connected
15
+ 2025-04-02 14:52:46,989 INFO MainThread:738 [wandb_init.py:init():891] updated telemetry
16
+ 2025-04-02 14:52:47,015 INFO MainThread:738 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
+ 2025-04-02 14:52:47,419 INFO MainThread:738 [wandb_init.py:init():990] starting run threads in backend
18
+ 2025-04-02 14:52:47,731 INFO MainThread:738 [wandb_run.py:_console_start():2375] atexit reg
19
+ 2025-04-02 14:52:47,731 INFO MainThread:738 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
+ 2025-04-02 14:52:47,731 INFO MainThread:738 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
+ 2025-04-02 14:52:47,731 INFO MainThread:738 [wandb_run.py:_redirect():2315] Redirects installed.
22
+ 2025-04-02 14:52:47,737 INFO MainThread:738 [wandb_init.py:init():1032] run started, returning control to user process
23
+ 2025-04-02 14:53:00,970 INFO MainThread:738 [wandb_run.py:_config_callback():1261] config_cb None None {'vocab_size': 50257, 'n_positions': 1024, 'n_embd': 1600, 'n_layer': 48, 'n_head': 25, 'n_inner': None, 'activation_function': 'gelu_new', 'resid_pdrop': 0.1, 'embd_pdrop': 0.1, 'attn_pdrop': 0.1, 'layer_norm_epsilon': 1e-05, 'initializer_range': 0.02, 'summary_type': 'cls_index', 'summary_use_proj': True, 'summary_activation': None, 'summary_first_dropout': 0.1, 'summary_proj_to_labels': True, 'scale_attn_weights': True, 'use_cache': True, 'scale_attn_by_inverse_layer_idx': False, 'reorder_and_upcast_attn': False, 'bos_token_id': 50256, 'eos_token_id': 50256, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float32', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['GPT2LMHeadModel'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'pad_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': {'text-generation': 
{'do_sample': True, 'max_length': 50}}, 'problem_type': None, '_name_or_path': 'openai-community/gpt2-xl', '_attn_implementation_autoset': True, 'transformers_version': '4.49.0', 'model_type': 'gpt2', 'n_ctx': 1024, 'output_past': True, 'output_dir': '/mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39', 'overwrite_output_dir': False, 'do_train': 'true,', 'do_eval': 'true,', 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 16, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 10, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 10, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 
'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '/mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 
'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': True, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False, 'model_init_kwargs': None, 'use_liger': False, 'dataset_text_field': 'text', 'dataset_kwargs': None, 'dataset_num_proc': None, 'max_seq_length': 1024, 'packing': False, 'eval_packing': None, 'dataset_batch_size': None, 'num_of_sequences': None, 'chars_per_token': '<CHARS_PER_TOKEN>'}
24
+ 2025-04-02 14:53:00,973 INFO MainThread:738 [wandb_config.py:__setitem__():154] config set model/num_parameters = 1557611200 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x734030d35250>>
25
+ 2025-04-02 14:53:00,973 INFO MainThread:738 [wandb_run.py:_config_callback():1261] config_cb model/num_parameters 1557611200 None
wandb/run-20250402_145246-e1n3xkh6/files/output.log ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /dlabscratch1/amani/.conda/envs/LLM-RL/lib/python3.11/site-packages/transformers/training_args.py:1594: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
2
+ warnings.warn(
3
+ /mnt/dlabscratch1/amani/LLM-RL/src/sft_pretrain_and_pushtohub.py:138: FutureWarning: `tokenizer` is deprecated and removed starting from version 0.16.0 for `SFTTrainer.__init__`. Use `processing_class` instead.
4
+ trainer = SFTTrainer(
5
+ [2025-04-02 14:52:58,883] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
6
+ Warning: The cache directory for DeepSpeed Triton autotune, /dlabscratch1/amani/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
7
+ [2025-04-02 14:52:59,306][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -c /tmp/tmpwrka7koq/test.c -o /tmp/tmpwrka7koq/test.o
8
+ [2025-04-02 14:52:59,327][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat /tmp/tmpwrka7koq/test.o -laio -o /tmp/tmpwrka7koq/a.out
9
+ [2025-04-02 14:52:59,802][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -c /tmp/tmpbm8f7_mn/test.c -o /tmp/tmpbm8f7_mn/test.o
10
+ [2025-04-02 14:52:59,820][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat /tmp/tmpbm8f7_mn/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpbm8f7_mn/a.out
11
+ [2025-04-02 14:52:59,881][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -c /tmp/tmphrk9_0u6/test.c -o /tmp/tmphrk9_0u6/test.o
12
+ [2025-04-02 14:52:59,896][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat /tmp/tmphrk9_0u6/test.o -laio -o /tmp/tmphrk9_0u6/a.out
13
+ 2025-04-02 14:53:00,791 - __main__ - INFO - *** Starting SFT training at 2025-04-02 14:53:00 ***
14
+ [2025-04-02 14:53:00,791][__main__][INFO] - *** Starting SFT training at 2025-04-02 14:53:00 ***
15
+ wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.
16
+ 0%| | 0/9350 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
17
+
18
+ {'loss': 2.8736, 'grad_norm': 4.561432361602783, 'learning_rate': 1.997860962566845e-05, 'mean_token_accuracy': 0.4675643026828766, 'epoch': 0.01}
19
+ {'loss': 1.4077, 'grad_norm': 3.1172070503234863, 'learning_rate': 1.99572192513369e-05, 'mean_token_accuracy': 0.6618991315364837, 'epoch': 0.02}
20
+ {'loss': 1.2961, 'grad_norm': 2.483769655227661, 'learning_rate': 1.993582887700535e-05, 'mean_token_accuracy': 0.6864472806453705, 'epoch': 0.03}
21
+ {'loss': 1.2227, 'grad_norm': 2.7745237350463867, 'learning_rate': 1.9914438502673797e-05, 'mean_token_accuracy': 0.6966653347015381, 'epoch': 0.04}
22
+ {'loss': 1.0693, 'grad_norm': 2.1610703468322754, 'learning_rate': 1.9893048128342248e-05, 'mean_token_accuracy': 0.7266861200332642, 'epoch': 0.05}
23
+ {'loss': 1.1503, 'grad_norm': 2.143181324005127, 'learning_rate': 1.9871657754010695e-05, 'mean_token_accuracy': 0.7105863869190217, 'epoch': 0.06}
24
+ {'loss': 1.065, 'grad_norm': 2.452174425125122, 'learning_rate': 1.9850267379679146e-05, 'mean_token_accuracy': 0.7262609004974365, 'epoch': 0.07}
25
+ {'loss': 1.011, 'grad_norm': 1.8720413446426392, 'learning_rate': 1.9828877005347596e-05, 'mean_token_accuracy': 0.7376364350318909, 'epoch': 0.09}
26
+ {'loss': 0.9914, 'grad_norm': 1.9087146520614624, 'learning_rate': 1.9807486631016044e-05, 'mean_token_accuracy': 0.7410379707813263, 'epoch': 0.1}
27
+ {'loss': 0.9976, 'grad_norm': 2.295886993408203, 'learning_rate': 1.9786096256684494e-05, 'mean_token_accuracy': 0.7375340044498444, 'epoch': 0.11}
28
+ {'loss': 1.0109, 'grad_norm': 2.035055160522461, 'learning_rate': 1.9764705882352945e-05, 'mean_token_accuracy': 0.7421261131763458, 'epoch': 0.12}
29
+ {'loss': 1.0253, 'grad_norm': 2.1886730194091797, 'learning_rate': 1.9743315508021392e-05, 'mean_token_accuracy': 0.7308926224708557, 'epoch': 0.13}
30
+ {'loss': 0.9605, 'grad_norm': 2.1175997257232666, 'learning_rate': 1.972192513368984e-05, 'mean_token_accuracy': 0.7511978864669799, 'epoch': 0.14}
31
+ {'loss': 0.9502, 'grad_norm': 1.9803975820541382, 'learning_rate': 1.970053475935829e-05, 'mean_token_accuracy': 0.7526978373527526, 'epoch': 0.15}
32
+ {'loss': 0.9787, 'grad_norm': 2.274048089981079, 'learning_rate': 1.967914438502674e-05, 'mean_token_accuracy': 0.7469079375267029, 'epoch': 0.16}
33
+ {'loss': 0.9892, 'grad_norm': 2.0520341396331787, 'learning_rate': 1.9657754010695188e-05, 'mean_token_accuracy': 0.7408109784126282, 'epoch': 0.17}
34
+ {'loss': 0.9155, 'grad_norm': 2.1267733573913574, 'learning_rate': 1.963636363636364e-05, 'mean_token_accuracy': 0.764633697271347, 'epoch': 0.18}
35
+ {'loss': 0.9464, 'grad_norm': 2.1772565841674805, 'learning_rate': 1.9614973262032086e-05, 'mean_token_accuracy': 0.75131676197052, 'epoch': 0.19}
36
+ {'loss': 1.0017, 'grad_norm': 2.230823040008545, 'learning_rate': 1.9593582887700536e-05, 'mean_token_accuracy': 0.7450973689556122, 'epoch': 0.2}
37
+ {'loss': 0.9788, 'grad_norm': 1.783121943473816, 'learning_rate': 1.9572192513368987e-05, 'mean_token_accuracy': 0.747833377122879, 'epoch': 0.21}
38
+ {'loss': 0.9839, 'grad_norm': 2.2042770385742188, 'learning_rate': 1.9550802139037434e-05, 'mean_token_accuracy': 0.7454769611358643, 'epoch': 0.22}
39
+ {'loss': 0.9629, 'grad_norm': 1.90845787525177, 'learning_rate': 1.9529411764705885e-05, 'mean_token_accuracy': 0.7495079040527344, 'epoch': 0.24}
40
+ {'loss': 0.9103, 'grad_norm': 1.7904044389724731, 'learning_rate': 1.9508021390374332e-05, 'mean_token_accuracy': 0.7587033331394195, 'epoch': 0.25}
41
+ {'loss': 0.9865, 'grad_norm': 2.190483331680298, 'learning_rate': 1.9486631016042783e-05, 'mean_token_accuracy': 0.7439156830310821, 'epoch': 0.26}
42
+ {'loss': 0.8963, 'grad_norm': 2.020122528076172, 'learning_rate': 1.9465240641711233e-05, 'mean_token_accuracy': 0.7711025416851044, 'epoch': 0.27}
43
+ {'loss': 0.9623, 'grad_norm': 1.914089560508728, 'learning_rate': 1.944385026737968e-05, 'mean_token_accuracy': 0.748535567522049, 'epoch': 0.28}
44
+ {'loss': 0.8793, 'grad_norm': 3.000681161880493, 'learning_rate': 1.9422459893048128e-05, 'mean_token_accuracy': 0.7698011755943298, 'epoch': 0.29}
45
+ {'loss': 0.9989, 'grad_norm': 2.015597105026245, 'learning_rate': 1.9401069518716578e-05, 'mean_token_accuracy': 0.7419908523559571, 'epoch': 0.3}
46
+ {'loss': 0.9115, 'grad_norm': 1.6983799934387207, 'learning_rate': 1.937967914438503e-05, 'mean_token_accuracy': 0.7678780138492585, 'epoch': 0.31}
47
+ {'loss': 0.9361, 'grad_norm': 3.0123836994171143, 'learning_rate': 1.9358288770053476e-05, 'mean_token_accuracy': 0.7575121581554413, 'epoch': 0.32}
48
+ {'loss': 0.9126, 'grad_norm': 2.2199974060058594, 'learning_rate': 1.9336898395721927e-05, 'mean_token_accuracy': 0.763150978088379, 'epoch': 0.33}
49
+ {'loss': 0.9165, 'grad_norm': 1.953675389289856, 'learning_rate': 1.9315508021390377e-05, 'mean_token_accuracy': 0.7600376307964325, 'epoch': 0.34}
50
+ {'loss': 0.9716, 'grad_norm': 2.2523326873779297, 'learning_rate': 1.9294117647058825e-05, 'mean_token_accuracy': 0.7498639464378357, 'epoch': 0.35}
51
+ {'loss': 0.9178, 'grad_norm': 1.7307066917419434, 'learning_rate': 1.9272727272727275e-05, 'mean_token_accuracy': 0.7602478981018066, 'epoch': 0.36}
52
+ {'loss': 0.9156, 'grad_norm': 1.8145519495010376, 'learning_rate': 1.9251336898395722e-05, 'mean_token_accuracy': 0.7568887352943421, 'epoch': 0.37}
53
+ {'loss': 0.8619, 'grad_norm': 1.8205516338348389, 'learning_rate': 1.9229946524064173e-05, 'mean_token_accuracy': 0.7767329752445221, 'epoch': 0.39}
54
+ {'loss': 0.8601, 'grad_norm': 1.7622675895690918, 'learning_rate': 1.9208556149732624e-05, 'mean_token_accuracy': 0.7761210620403289, 'epoch': 0.4}
55
+ {'loss': 0.8928, 'grad_norm': 2.1755974292755127, 'learning_rate': 1.918716577540107e-05, 'mean_token_accuracy': 0.7695048809051513, 'epoch': 0.41}
56
+ {'loss': 0.8501, 'grad_norm': 1.590783953666687, 'learning_rate': 1.9165775401069518e-05, 'mean_token_accuracy': 0.7792985320091248, 'epoch': 0.42}
57
+ {'loss': 0.926, 'grad_norm': 1.8423107862472534, 'learning_rate': 1.9144385026737972e-05, 'mean_token_accuracy': 0.7609580457210541, 'epoch': 0.43}
58
+ {'loss': 0.9162, 'grad_norm': 1.4484622478485107, 'learning_rate': 1.912299465240642e-05, 'mean_token_accuracy': 0.7642469525337219, 'epoch': 0.44}
59
+ {'loss': 0.9032, 'grad_norm': 1.7720240354537964, 'learning_rate': 1.9101604278074867e-05, 'mean_token_accuracy': 0.7663418650627136, 'epoch': 0.45}
60
+ {'loss': 0.8943, 'grad_norm': 1.9300682544708252, 'learning_rate': 1.9080213903743317e-05, 'mean_token_accuracy': 0.7683996140956879, 'epoch': 0.46}
61
+ {'loss': 0.8782, 'grad_norm': 2.139838218688965, 'learning_rate': 1.9058823529411764e-05, 'mean_token_accuracy': 0.7707085013389587, 'epoch': 0.47}
62
+ {'loss': 0.865, 'grad_norm': 1.7625609636306763, 'learning_rate': 1.9037433155080215e-05, 'mean_token_accuracy': 0.7711307823657989, 'epoch': 0.48}
63
+ {'loss': 0.9055, 'grad_norm': 1.9418359994888306, 'learning_rate': 1.9016042780748666e-05, 'mean_token_accuracy': 0.7585177183151245, 'epoch': 0.49}
64
+ {'loss': 0.8507, 'grad_norm': 2.0598695278167725, 'learning_rate': 1.8994652406417113e-05, 'mean_token_accuracy': 0.7708562433719635, 'epoch': 0.5}
65
+ {'loss': 0.8692, 'grad_norm': 1.5901210308074951, 'learning_rate': 1.8973262032085563e-05, 'mean_token_accuracy': 0.7684442400932312, 'epoch': 0.51}
66
+ {'loss': 0.8778, 'grad_norm': 1.8863409757614136, 'learning_rate': 1.8951871657754014e-05, 'mean_token_accuracy': 0.7689958155155182, 'epoch': 0.52}
67
+ {'loss': 0.8592, 'grad_norm': 1.6788251399993896, 'learning_rate': 1.893048128342246e-05, 'mean_token_accuracy': 0.7720810234546661, 'epoch': 0.53}
68
+ {'loss': 0.8527, 'grad_norm': 1.7482951879501343, 'learning_rate': 1.8909090909090912e-05, 'mean_token_accuracy': 0.7757346272468567, 'epoch': 0.55}
69
+ {'loss': 0.8633, 'grad_norm': 1.570914626121521, 'learning_rate': 1.888770053475936e-05, 'mean_token_accuracy': 0.7743308961391449, 'epoch': 0.56}
70
+ {'loss': 0.8701, 'grad_norm': 1.7534855604171753, 'learning_rate': 1.886631016042781e-05, 'mean_token_accuracy': 0.7677632629871368, 'epoch': 0.57}
71
+ {'loss': 0.8179, 'grad_norm': 2.0430619716644287, 'learning_rate': 1.8844919786096257e-05, 'mean_token_accuracy': 0.7785910904407501, 'epoch': 0.58}
72
+ {'loss': 0.8518, 'grad_norm': 1.8052802085876465, 'learning_rate': 1.8823529411764708e-05, 'mean_token_accuracy': 0.7752408146858215, 'epoch': 0.59}
73
+ {'loss': 0.8174, 'grad_norm': 1.7275725603103638, 'learning_rate': 1.8802139037433155e-05, 'mean_token_accuracy': 0.7837619543075561, 'epoch': 0.6}
74
+ {'loss': 0.8424, 'grad_norm': 2.0164926052093506, 'learning_rate': 1.8780748663101605e-05, 'mean_token_accuracy': 0.7713942766189575, 'epoch': 0.61}
75
+ {'loss': 0.7643, 'grad_norm': 1.6844583749771118, 'learning_rate': 1.8759358288770056e-05, 'mean_token_accuracy': 0.7957529544830322, 'epoch': 0.62}
76
+ {'loss': 0.869, 'grad_norm': 1.9102866649627686, 'learning_rate': 1.8737967914438503e-05, 'mean_token_accuracy': 0.7759682476520539, 'epoch': 0.63}
77
+ {'loss': 0.7768, 'grad_norm': 1.379757285118103, 'learning_rate': 1.8716577540106954e-05, 'mean_token_accuracy': 0.7940125286579132, 'epoch': 0.64}
78
+ {'loss': 0.8424, 'grad_norm': 1.7400151491165161, 'learning_rate': 1.8695187165775405e-05, 'mean_token_accuracy': 0.7720341801643371, 'epoch': 0.65}
79
+ {'loss': 0.8636, 'grad_norm': 2.172954559326172, 'learning_rate': 1.8673796791443852e-05, 'mean_token_accuracy': 0.775877845287323, 'epoch': 0.66}
80
+ {'loss': 0.8008, 'grad_norm': 1.9168498516082764, 'learning_rate': 1.8652406417112302e-05, 'mean_token_accuracy': 0.7895217001438141, 'epoch': 0.67}
81
+ {'loss': 0.7755, 'grad_norm': 1.5433951616287231, 'learning_rate': 1.863101604278075e-05, 'mean_token_accuracy': 0.7920072257518769, 'epoch': 0.68}
82
+ {'loss': 0.8503, 'grad_norm': 2.0785927772521973, 'learning_rate': 1.86096256684492e-05, 'mean_token_accuracy': 0.7770399391651154, 'epoch': 0.7}
83
+ {'loss': 0.7462, 'grad_norm': 1.9140806198120117, 'learning_rate': 1.8588235294117647e-05, 'mean_token_accuracy': 0.8053540170192719, 'epoch': 0.71}
84
+ {'loss': 0.7462, 'grad_norm': 1.7646123170852661, 'learning_rate': 1.8566844919786098e-05, 'mean_token_accuracy': 0.7958488464355469, 'epoch': 0.72}
85
+ {'loss': 0.8468, 'grad_norm': 1.6575416326522827, 'learning_rate': 1.8545454545454545e-05, 'mean_token_accuracy': 0.7770686745643616, 'epoch': 0.73}
86
+ {'loss': 0.8494, 'grad_norm': 1.7693356275558472, 'learning_rate': 1.8524064171122996e-05, 'mean_token_accuracy': 0.7740375459194183, 'epoch': 0.74}
87
+ {'loss': 0.7965, 'grad_norm': 1.6074458360671997, 'learning_rate': 1.8502673796791447e-05, 'mean_token_accuracy': 0.7881363987922668, 'epoch': 0.75}
88
+ {'loss': 0.8626, 'grad_norm': 1.7979710102081299, 'learning_rate': 1.8481283422459894e-05, 'mean_token_accuracy': 0.7758695363998414, 'epoch': 0.76}
89
+ {'loss': 0.7883, 'grad_norm': 1.6999515295028687, 'learning_rate': 1.8459893048128344e-05, 'mean_token_accuracy': 0.7871046781539917, 'epoch': 0.77}
90
+ {'loss': 0.8218, 'grad_norm': 1.8012199401855469, 'learning_rate': 1.843850267379679e-05, 'mean_token_accuracy': 0.7894359171390534, 'epoch': 0.78}
91
+ {'loss': 0.8249, 'grad_norm': 1.8291058540344238, 'learning_rate': 1.8417112299465242e-05, 'mean_token_accuracy': 0.7786098062992096, 'epoch': 0.79}
92
+ {'loss': 0.849, 'grad_norm': 1.459100604057312, 'learning_rate': 1.8395721925133693e-05, 'mean_token_accuracy': 0.7739375293254852, 'epoch': 0.8}
93
+ {'loss': 0.8105, 'grad_norm': 1.6709809303283691, 'learning_rate': 1.837433155080214e-05, 'mean_token_accuracy': 0.7830919861793518, 'epoch': 0.81}
94
+ {'loss': 0.8016, 'grad_norm': 1.8294044733047485, 'learning_rate': 1.8352941176470587e-05, 'mean_token_accuracy': 0.7827704012393951, 'epoch': 0.82}
95
+ {'loss': 0.7922, 'grad_norm': 1.9289802312850952, 'learning_rate': 1.833155080213904e-05, 'mean_token_accuracy': 0.7901956796646118, 'epoch': 0.83}
96
+ {'loss': 0.7928, 'grad_norm': 1.6650038957595825, 'learning_rate': 1.831016042780749e-05, 'mean_token_accuracy': 0.7919930636882782, 'epoch': 0.84}
97
+ {'loss': 0.7958, 'grad_norm': 1.7201720476150513, 'learning_rate': 1.8288770053475936e-05, 'mean_token_accuracy': 0.7823857426643371, 'epoch': 0.86}
98
+ {'loss': 0.8059, 'grad_norm': 1.5870884656906128, 'learning_rate': 1.8267379679144386e-05, 'mean_token_accuracy': 0.7836263060569764, 'epoch': 0.87}
99
+ {'loss': 0.75, 'grad_norm': 1.7115275859832764, 'learning_rate': 1.8245989304812837e-05, 'mean_token_accuracy': 0.8022487759590149, 'epoch': 0.88}
100
+ {'loss': 0.8185, 'grad_norm': 2.0211620330810547, 'learning_rate': 1.8224598930481284e-05, 'mean_token_accuracy': 0.7780875384807586, 'epoch': 0.89}
101
+ {'loss': 0.7877, 'grad_norm': 1.6509720087051392, 'learning_rate': 1.8203208556149735e-05, 'mean_token_accuracy': 0.7959480166435242, 'epoch': 0.9}
102
+ {'loss': 0.8499, 'grad_norm': 1.8145709037780762, 'learning_rate': 1.8181818181818182e-05, 'mean_token_accuracy': 0.7782252907752991, 'epoch': 0.91}
103
+ {'loss': 0.7717, 'grad_norm': 1.8439884185791016, 'learning_rate': 1.8160427807486633e-05, 'mean_token_accuracy': 0.7995142638683319, 'epoch': 0.92}
104
+ {'loss': 0.8012, 'grad_norm': 1.5468418598175049, 'learning_rate': 1.8139037433155083e-05, 'mean_token_accuracy': 0.7799870669841766, 'epoch': 0.93}
105
+ {'loss': 0.9315, 'grad_norm': 1.8522837162017822, 'learning_rate': 1.811764705882353e-05, 'mean_token_accuracy': 0.7644379436969757, 'epoch': 0.94}
106
+ {'loss': 0.8311, 'grad_norm': 1.6274827718734741, 'learning_rate': 1.809625668449198e-05, 'mean_token_accuracy': 0.7880046725273132, 'epoch': 0.95}
107
+ {'loss': 0.8642, 'grad_norm': 1.9474292993545532, 'learning_rate': 1.807486631016043e-05, 'mean_token_accuracy': 0.7756262719631195, 'epoch': 0.96}
108
+ {'loss': 0.7447, 'grad_norm': 1.9583537578582764, 'learning_rate': 1.805347593582888e-05, 'mean_token_accuracy': 0.8041338086128235, 'epoch': 0.97}
109
+ {'loss': 0.7824, 'grad_norm': 1.5204691886901855, 'learning_rate': 1.8032085561497326e-05, 'mean_token_accuracy': 0.7936776518821717, 'epoch': 0.98}
110
+ {'loss': 0.8073, 'grad_norm': 1.6397240161895752, 'learning_rate': 1.8010695187165777e-05, 'mean_token_accuracy': 0.7917199492454529, 'epoch': 0.99}
111
+ training_args.bin: 100%|██████████| 5.62k/5.62k [00:00<00:00, 38.0kB/s] ?B/s]
112
+ {'eval_loss': 0.7610637545585632, 'eval_runtime': 97.5543, 'eval_samples_per_second': 13.521, 'eval_steps_per_second': 0.851, 'eval_mean_token_accuracy': 0.7973010553555056, 'epoch': 1.0}
113
+ run-e1n3xkh6.wandb: 100%|██████████| 360k/360k [00:00<00:00, 604kB/s]<07:44, 10.7MB/s]
114
+ model-00002-of-00002.safetensors: 100%|██████████| 1.27G/1.27G [00:36<00:00, 35.2MB/s]
115
+ model-00002-of-00002.safetensors: 100%|██████████| 1.27G/1.27G [00:44<00:00, 28.7MB/s]
116
+ model-00001-of-00002.safetensors: 100%|██████████| 4.96G/4.96G [02:15<00:00, 36.5MB/s]
117
+ model-00001-of-00002.safetensors: 100%|██████████| 4.96G/4.96G [02:16<00:00, 36.3MB/s]
118
+ Upload 7 LFS files: 100%|██████████| 7/7 [02:17<00:00, 19.59s/it]0:35<01:32, 41.1MB/s]
119
+ model-00001-of-00002.safetensors: 30%|███ | 1.49G/4.96G [00:44<01:24, 40.9MB/s]
120
+ model-00001-of-00002.safetensors: 99%|█████████▉| 4.93G/4.96G [02:15<00:00, 47.1MB/s]
121
+ model-00001-of-00002.safetensors: 100%|█████████▉| 4.96G/4.96G [02:16<00:00, 49.1MB/s]
122
+ Upload 7 LFS files: 14%|█▍ | 1/7 [02:16<13:37, 136.19s/it]
123
+ Upload 7 LFS files: 57%|█████▋ | 4/7 [02:17<01:18, 26.02s/it]
wandb/run-20250402_145246-e1n3xkh6/files/requirements.txt ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wheel==0.45.1
2
+ pip==25.0.1
3
+ tensorboard-data-server==0.7.2
4
+ six==1.17.0
5
+ setuptools==70.3.0
6
+ packaging==24.2
7
+ MarkupSafe==3.0.2
8
+ Markdown==3.7
9
+ grpcio==1.71.0
10
+ absl-py==2.1.0
11
+ Werkzeug==3.1.3
12
+ tensorboard==2.19.0
13
+ pytz==2025.1
14
+ py-cpuinfo==9.0.0
15
+ nvidia-cusparselt-cu12==0.6.2
16
+ mpmath==1.3.0
17
+ hjson==3.1.0
18
+ xxhash==3.5.0
19
+ urllib3==2.3.0
20
+ tzdata==2025.1
21
+ typing_extensions==4.12.2
22
+ tqdm==4.67.1
23
+ sympy==1.13.1
24
+ safetensors==0.5.3
25
+ regex==2024.11.6
26
+ PyYAML==6.0.2
27
+ python-dateutil==2.9.0.post0
28
+ Pygments==2.19.1
29
+ pyarrow==19.0.1
30
+ psutil==7.0.0
31
+ propcache==0.3.0
32
+ nvidia-nvtx-cu12==12.4.127
33
+ nvidia-nvjitlink-cu12==12.4.127
34
+ nvidia-nccl-cu12==2.21.5
35
+ nvidia-curand-cu12==10.3.5.147
36
+ nvidia-cufft-cu12==11.2.1.3
37
+ nvidia-cuda-runtime-cu12==12.4.127
38
+ nvidia-cuda-nvrtc-cu12==12.4.127
39
+ nvidia-cuda-cupti-cu12==12.4.127
40
+ nvidia-cublas-cu12==12.4.5.8
41
+ ninja==1.11.1.3
42
+ networkx==3.4.2
43
+ multidict==6.1.0
44
+ msgpack==1.1.0
45
+ mdurl==0.1.2
46
+ Jinja2==3.1.6
47
+ idna==3.10
48
+ hf_transfer==0.1.9
49
+ fsspec==2024.9.0
50
+ frozenlist==1.5.0
51
+ filelock==3.18.0
52
+ dill==0.3.8
53
+ charset-normalizer==3.4.1
54
+ certifi==2025.1.31
55
+ attrs==25.3.0
56
+ annotated-types==0.7.0
57
+ aiohappyeyeballs==2.6.1
58
+ yarl==1.18.3
59
+ requests==2.32.3
60
+ pydantic_core==2.27.2
61
+ pandas==2.2.3
62
+ nvidia-cusparse-cu12==12.3.1.170
63
+ nvidia-cudnn-cu12==9.1.0.70
64
+ multiprocess==0.70.16
65
+ markdown-it-py==3.0.0
66
+ aiosignal==1.3.2
67
+ rich==13.9.4
68
+ pydantic==2.10.6
69
+ nvidia-cusolver-cu12==11.6.1.9
70
+ huggingface-hub==0.29.3
71
+ aiohttp==3.11.13
72
+ tokenizers==0.21.1
73
+ deepspeed==0.15.4
74
+ datasets==3.1.0
75
+ accelerate==1.3.0
76
+ trl==0.15.2
77
+ nvidia-ml-py==12.570.86
78
+ smmap==5.0.2
79
+ setproctitle==1.3.5
80
+ sentry-sdk==2.22.0
81
+ protobuf==5.29.3
82
+ platformdirs==4.3.6
83
+ docker-pycreds==0.4.0
84
+ click==8.1.8
85
+ gitdb==4.0.12
86
+ GitPython==3.1.44
87
+ wandb==0.19.8
88
+ sentencepiece==0.2.0
89
+ fastrlock==0.8.3
90
+ blake3==1.0.4
91
+ zipp==3.21.0
92
+ websockets==15.0.1
93
+ uvloop==0.21.0
94
+ triton==3.1.0
95
+ sniffio==1.3.1
96
+ shellingham==1.5.4
97
+ rpds-py==0.23.1
98
+ pyzmq==26.3.0
99
+ python-multipart==0.0.20
100
+ python-dotenv==1.0.1
101
+ pycountry==24.6.1
102
+ pybind11==2.13.6
103
+ prometheus_client==0.21.1
104
+ pluggy==1.5.0
105
+ pillow==11.1.0
106
+ partial-json-parser==0.2.1.1.post5
107
+ numpy==1.26.4
108
+ nest-asyncio==1.6.0
109
+ msgspec==0.19.0
110
+ llvmlite==0.43.0
111
+ lark==1.2.2
112
+ jiter==0.9.0
113
+ interegular==0.3.3
114
+ iniconfig==2.0.0
115
+ httptools==0.6.4
116
+ h11==0.14.0
117
+ einops==0.8.1
118
+ dnspython==2.7.0
119
+ distro==1.9.0
120
+ diskcache==5.6.3
121
+ cloudpickle==3.1.1
122
+ astor==0.8.1
123
+ airportsdata==20250224
124
+ uvicorn==0.34.0
125
+ tiktoken==0.9.0
126
+ referencing==0.36.2
127
+ pytest==8.3.5
128
+ opencv-python-headless==4.11.0.86
129
+ numba==0.60.0
130
+ importlib_metadata==8.6.1
131
+ httpcore==1.0.7
132
+ gguf==0.10.0
133
+ email_validator==2.2.0
134
+ depyf==0.18.0
135
+ cupy-cuda12x==13.4.0
136
+ anyio==4.8.0
137
+ watchfiles==1.0.4
138
+ typer==0.15.2
139
+ torch==2.5.1
140
+ starlette==0.46.1
141
+ rich-toolkit==0.13.2
142
+ lm-format-enforcer==0.10.11
143
+ jsonschema-specifications==2024.10.1
144
+ httpx==0.28.1
145
+ xformers==0.0.28.post3
146
+ transformers==4.49.0
147
+ torchvision==0.20.1
148
+ torchaudio==2.5.1
149
+ prometheus-fastapi-instrumentator==7.0.2
150
+ openai==1.66.3
151
+ jsonschema==4.23.0
152
+ fastapi==0.115.11
153
+ xgrammar==0.1.11
154
+ ray==2.40.0
155
+ outlines_core==0.1.26
156
+ mistral_common==1.5.3
157
+ fastapi-cli==0.0.7
158
+ compressed-tensors==0.9.1
159
+ outlines==0.1.11
160
+ vllm==0.7.3
161
+ antlr4-python3-runtime==4.9.3
162
+ omegaconf==2.3.0
163
+ hydra-core==1.3.2
164
+ rootutils==1.0.7
wandb/run-20250402_145246-e1n3xkh6/files/wandb-metadata.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.5.0-45-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.11",
4
+ "startedAt": "2025-04-02T14:52:46.988352Z",
5
+ "args": [
6
+ "model=gpt2xl_1.5b",
7
+ "task=gsm8k"
8
+ ],
9
+ "program": "/mnt/dlabscratch1/amani/LLM-RL/src/sft_pretrain_and_pushtohub.py",
10
+ "codePath": "src/sft_pretrain_and_pushtohub.py",
11
+ "git": {
12
+ "remote": "https://github.com/aryol/LLM-RL.git",
13
+ "commit": "af916ff96a9a9f7ba10303eca8d36be0bbd89fc8"
14
+ },
15
+ "email": "[email protected]",
16
+ "root": "/mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39",
17
+ "host": "job-60b06e05eaef-0-0",
18
+ "executable": "/dlabscratch1/amani/.conda/envs/LLM-RL/bin/python",
19
+ "codePathLocal": "src/sft_pretrain_and_pushtohub.py",
20
+ "cpu_count": 64,
21
+ "cpu_count_logical": 128,
22
+ "gpu": "NVIDIA A100-SXM4-80GB",
23
+ "gpu_count": 1,
24
+ "disk": {
25
+ "/": {
26
+ "total": "7679362727936",
27
+ "used": "4235631878144"
28
+ }
29
+ },
30
+ "memory": {
31
+ "total": "1081887248384"
32
+ },
33
+ "cpu": {
34
+ "count": 64,
35
+ "countLogical": 128
36
+ },
37
+ "gpu_nvidia": [
38
+ {
39
+ "name": "NVIDIA A100-SXM4-80GB",
40
+ "memoryTotal": "85899345920",
41
+ "cudaCores": 6912,
42
+ "architecture": "Ampere"
43
+ }
44
+ ],
45
+ "cudaVersion": "12.4"
46
+ }
wandb/run-20250402_145246-e1n3xkh6/logs/debug-core.log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {"time":"2025-04-02T14:52:46.462612938Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpw6oemm4w/port-738.txt","pid":738,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
2
+ {"time":"2025-04-02T14:52:46.466433118Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":738}
3
+ {"time":"2025-04-02T14:52:46.466813128Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":38271,"Zone":""}}
4
+ {"time":"2025-04-02T14:52:46.55291915Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:57336"}
5
+ {"time":"2025-04-02T14:52:46.99572319Z","level":"INFO","msg":"handleInformInit: received","streamId":"e1n3xkh6","id":"127.0.0.1:57336"}
6
+ {"time":"2025-04-02T14:52:47.119891001Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"e1n3xkh6","id":"127.0.0.1:57336"}
wandb/run-20250402_145246-e1n3xkh6/logs/debug-internal.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2025-04-02T14:52:46.998966444Z","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"/mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39/wandb/run-20250402_145246-e1n3xkh6/logs/debug-core.log"}
2
+ {"time":"2025-04-02T14:52:47.119833744Z","level":"INFO","msg":"created new stream","id":"e1n3xkh6"}
3
+ {"time":"2025-04-02T14:52:47.119882315Z","level":"INFO","msg":"stream: started","id":"e1n3xkh6"}
4
+ {"time":"2025-04-02T14:52:47.119921969Z","level":"INFO","msg":"handler: started","stream_id":"e1n3xkh6"}
5
+ {"time":"2025-04-02T14:52:47.119936867Z","level":"INFO","msg":"writer: Do: started","stream_id":"e1n3xkh6"}
6
+ {"time":"2025-04-02T14:52:47.120603401Z","level":"INFO","msg":"sender: started","stream_id":"e1n3xkh6"}
7
+ {"time":"2025-04-02T14:52:47.425038021Z","level":"INFO","msg":"Starting system monitor"}
wandb/run-20250402_145246-e1n3xkh6/logs/debug.log ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_setup.py:_flush():67] Configure stats pid to 738
3
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_setup.py:_flush():67] Loading settings from /dlabscratch1/amani/.config/wandb/settings
4
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_setup.py:_flush():67] Loading settings from /mnt/dlabscratch1/amani/LLM-RL/wandb/settings
5
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:setup_run_log_directory():647] Logging user logs to /mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39/wandb/run-20250402_145246-e1n3xkh6/logs/debug.log
7
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to /mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39/wandb/run-20250402_145246-e1n3xkh6/logs/debug-internal.log
8
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:init():761] calling init triggers
9
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:init():784] starting backend
12
+ 2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:init():788] sending inform_init request
13
+ 2025-04-02 14:52:46,987 INFO MainThread:738 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-04-02 14:52:46,987 INFO MainThread:738 [wandb_init.py:init():798] backend started and connected
15
+ 2025-04-02 14:52:46,989 INFO MainThread:738 [wandb_init.py:init():891] updated telemetry
16
+ 2025-04-02 14:52:47,015 INFO MainThread:738 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
+ 2025-04-02 14:52:47,419 INFO MainThread:738 [wandb_init.py:init():990] starting run threads in backend
18
+ 2025-04-02 14:52:47,731 INFO MainThread:738 [wandb_run.py:_console_start():2375] atexit reg
19
+ 2025-04-02 14:52:47,731 INFO MainThread:738 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
+ 2025-04-02 14:52:47,731 INFO MainThread:738 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
+ 2025-04-02 14:52:47,731 INFO MainThread:738 [wandb_run.py:_redirect():2315] Redirects installed.
22
+ 2025-04-02 14:52:47,737 INFO MainThread:738 [wandb_init.py:init():1032] run started, returning control to user process
23
+ 2025-04-02 14:53:00,970 INFO MainThread:738 [wandb_run.py:_config_callback():1261] config_cb None None {'vocab_size': 50257, 'n_positions': 1024, 'n_embd': 1600, 'n_layer': 48, 'n_head': 25, 'n_inner': None, 'activation_function': 'gelu_new', 'resid_pdrop': 0.1, 'embd_pdrop': 0.1, 'attn_pdrop': 0.1, 'layer_norm_epsilon': 1e-05, 'initializer_range': 0.02, 'summary_type': 'cls_index', 'summary_use_proj': True, 'summary_activation': None, 'summary_first_dropout': 0.1, 'summary_proj_to_labels': True, 'scale_attn_weights': True, 'use_cache': True, 'scale_attn_by_inverse_layer_idx': False, 'reorder_and_upcast_attn': False, 'bos_token_id': 50256, 'eos_token_id': 50256, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float32', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['GPT2LMHeadModel'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'pad_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': {'text-generation': 
{'do_sample': True, 'max_length': 50}}, 'problem_type': None, '_name_or_path': 'openai-community/gpt2-xl', '_attn_implementation_autoset': True, 'transformers_version': '4.49.0', 'model_type': 'gpt2', 'n_ctx': 1024, 'output_past': True, 'output_dir': '/mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39', 'overwrite_output_dir': False, 'do_train': 'true,', 'do_eval': 'true,', 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 16, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 10, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 10, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 
'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '/mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 
'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': True, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False, 'model_init_kwargs': None, 'use_liger': False, 'dataset_text_field': 'text', 'dataset_kwargs': None, 'dataset_num_proc': None, 'max_seq_length': 1024, 'packing': False, 'eval_packing': None, 'dataset_batch_size': None, 'num_of_sequences': None, 'chars_per_token': '<CHARS_PER_TOKEN>'}
24
+ 2025-04-02 14:53:00,973 INFO MainThread:738 [wandb_config.py:__setitem__():154] config set model/num_parameters = 1557611200 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x734030d35250>>
25
+ 2025-04-02 14:53:00,973 INFO MainThread:738 [wandb_run.py:_config_callback():1261] config_cb model/num_parameters 1557611200 None
wandb/run-20250402_145246-e1n3xkh6/run-e1n3xkh6.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:868d387659ad2157b3ef2be17b045544e2cc5ec08c48443a8f782234db3a58e7
3
+ size 360448