End of training
Browse files- .gitattributes +1 -0
- .hydra/config.yaml +47 -0
- .hydra/hydra.yaml +159 -0
- .hydra/overrides.yaml +2 -0
- README.md +58 -0
- config.json +40 -0
- epoch1/config.json +40 -0
- epoch1/generation_config.json +6 -0
- epoch1/merges.txt +0 -0
- epoch1/model-00001-of-00002.safetensors +3 -0
- epoch1/model-00002-of-00002.safetensors +3 -0
- epoch1/model.safetensors.index.json +587 -0
- epoch1/special_tokens_map.json +6 -0
- epoch1/tokenizer.json +0 -0
- epoch1/tokenizer_config.json +22 -0
- epoch1/training_args.bin +3 -0
- epoch1/vocab.json +0 -0
- generation_config.json +6 -0
- merges.txt +0 -0
- model-00001-of-00002.safetensors +3 -0
- model-00002-of-00002.safetensors +3 -0
- model.safetensors.index.json +587 -0
- sft_pretrain_and_pushtohub.log +7 -0
- special_tokens_map.json +6 -0
- tokenizer.json +0 -0
- tokenizer_config.json +22 -0
- training_args.bin +3 -0
- vocab.json +0 -0
- wandb/debug-internal.log +7 -0
- wandb/debug.log +25 -0
- wandb/run-20250402_145246-e1n3xkh6/files/output.log +123 -0
- wandb/run-20250402_145246-e1n3xkh6/files/requirements.txt +164 -0
- wandb/run-20250402_145246-e1n3xkh6/files/wandb-metadata.json +46 -0
- wandb/run-20250402_145246-e1n3xkh6/logs/debug-core.log +6 -0
- wandb/run-20250402_145246-e1n3xkh6/logs/debug-internal.log +7 -0
- wandb/run-20250402_145246-e1n3xkh6/logs/debug.log +25 -0
- wandb/run-20250402_145246-e1n3xkh6/run-e1n3xkh6.wandb +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
wandb/run-20250402_145246-e1n3xkh6/run-e1n3xkh6.wandb filter=lfs diff=lfs merge=lfs -text
|
.hydra/config.yaml
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
log_dir: ${oc.env:PROJECT_ROOT}/logs/sft_pretrain_and_pushtohub
|
2 |
+
generate_prompt: src.utils.return_generate_prompt
|
3 |
+
wandb_config:
|
4 |
+
name: ${model.model_name}
|
5 |
+
project: sft_on_${task.task_name}
|
6 |
+
dir: ${log_dir}/${task.task_name}-${model.model_name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
|
7 |
+
notes: null
|
8 |
+
trainer_args:
|
9 |
+
_target_: trl.SFTConfig
|
10 |
+
per_device_train_batch_size: 8
|
11 |
+
per_device_eval_batch_size: 16
|
12 |
+
num_train_epochs: 10
|
13 |
+
logging_dir: ${log_dir}/${task.task_name}-${model.model_name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
|
14 |
+
push_to_hub: false
|
15 |
+
save_strategy: epoch
|
16 |
+
evaluation_strategy: epoch
|
17 |
+
batch_eval_metrics: true
|
18 |
+
do_train: true
|
19 |
+
do_eval: true
|
20 |
+
output_dir: ${.logging_dir}
|
21 |
+
report_to: wandb
|
22 |
+
logging_steps: 10
|
23 |
+
task:
|
24 |
+
dataset:
|
25 |
+
_target_: datasets.load_dataset
|
26 |
+
path: openai/gsm8k
|
27 |
+
name: main
|
28 |
+
prompt_key: question
|
29 |
+
target_key: answer
|
30 |
+
default_prompt: "1. Always present the final answer on the last line of your response\
|
31 |
+
\ in the format: #### <answer> Ensure that the answer is a single number. \n 2.\
|
32 |
+
\ End each sentence with a newline character ('\\n'). \n 3. Perform any calculations\
|
33 |
+
\ within a <<...>> block before outputing the result of this calculation."
|
34 |
+
extract_answer_from_dataset: src.task.gsm8k.ExtractAnswerFromDataset
|
35 |
+
task_name: gsm8k
|
36 |
+
reward_class:
|
37 |
+
_target_: src.task.gsm8k.GSM8KReward
|
38 |
+
LOG_FILE: ${trainer.args.output_dir}/completions.json
|
39 |
+
format_reward_function: src.task.gsm8k.FormatRewardFunction
|
40 |
+
model:
|
41 |
+
model_name_or_path: openai-community/gpt2-xl
|
42 |
+
model_name: gpt2-xl
|
43 |
+
model_config:
|
44 |
+
_target_: trl.ModelConfig
|
45 |
+
use_peft: false
|
46 |
+
dataset_wrapper:
|
47 |
+
_target_: src.utils.CurriculumDatasetWrapper
|
.hydra/hydra.yaml
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
hydra:
|
2 |
+
run:
|
3 |
+
dir: ${log_dir}/${task.task_name}-${model.model_name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
|
4 |
+
sweep:
|
5 |
+
dir: ${log_dir}/${task.task_name}-${model.model_name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
|
6 |
+
subdir: ${hydra.job.num}
|
7 |
+
launcher:
|
8 |
+
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
|
9 |
+
sweeper:
|
10 |
+
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
|
11 |
+
max_batch_size: null
|
12 |
+
params: null
|
13 |
+
help:
|
14 |
+
app_name: ${hydra.job.name}
|
15 |
+
header: '${hydra.help.app_name} is powered by Hydra.
|
16 |
+
|
17 |
+
'
|
18 |
+
footer: 'Powered by Hydra (https://hydra.cc)
|
19 |
+
|
20 |
+
Use --hydra-help to view Hydra specific help
|
21 |
+
|
22 |
+
'
|
23 |
+
template: '${hydra.help.header}
|
24 |
+
|
25 |
+
== Configuration groups ==
|
26 |
+
|
27 |
+
Compose your configuration from those groups (group=option)
|
28 |
+
|
29 |
+
|
30 |
+
$APP_CONFIG_GROUPS
|
31 |
+
|
32 |
+
|
33 |
+
== Config ==
|
34 |
+
|
35 |
+
Override anything in the config (foo.bar=value)
|
36 |
+
|
37 |
+
|
38 |
+
$CONFIG
|
39 |
+
|
40 |
+
|
41 |
+
${hydra.help.footer}
|
42 |
+
|
43 |
+
'
|
44 |
+
hydra_help:
|
45 |
+
template: 'Hydra (${hydra.runtime.version})
|
46 |
+
|
47 |
+
See https://hydra.cc for more info.
|
48 |
+
|
49 |
+
|
50 |
+
== Flags ==
|
51 |
+
|
52 |
+
$FLAGS_HELP
|
53 |
+
|
54 |
+
|
55 |
+
== Configuration groups ==
|
56 |
+
|
57 |
+
Compose your configuration from those groups (For example, append hydra/job_logging=disabled
|
58 |
+
to command line)
|
59 |
+
|
60 |
+
|
61 |
+
$HYDRA_CONFIG_GROUPS
|
62 |
+
|
63 |
+
|
64 |
+
Use ''--cfg hydra'' to Show the Hydra config.
|
65 |
+
|
66 |
+
'
|
67 |
+
hydra_help: ???
|
68 |
+
hydra_logging:
|
69 |
+
version: 1
|
70 |
+
formatters:
|
71 |
+
simple:
|
72 |
+
format: '[%(asctime)s][HYDRA] %(message)s'
|
73 |
+
handlers:
|
74 |
+
console:
|
75 |
+
class: logging.StreamHandler
|
76 |
+
formatter: simple
|
77 |
+
stream: ext://sys.stdout
|
78 |
+
root:
|
79 |
+
level: INFO
|
80 |
+
handlers:
|
81 |
+
- console
|
82 |
+
loggers:
|
83 |
+
logging_example:
|
84 |
+
level: DEBUG
|
85 |
+
disable_existing_loggers: false
|
86 |
+
job_logging:
|
87 |
+
version: 1
|
88 |
+
formatters:
|
89 |
+
simple:
|
90 |
+
format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
|
91 |
+
handlers:
|
92 |
+
console:
|
93 |
+
class: logging.StreamHandler
|
94 |
+
formatter: simple
|
95 |
+
stream: ext://sys.stdout
|
96 |
+
file:
|
97 |
+
class: logging.FileHandler
|
98 |
+
formatter: simple
|
99 |
+
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
|
100 |
+
root:
|
101 |
+
level: INFO
|
102 |
+
handlers:
|
103 |
+
- console
|
104 |
+
- file
|
105 |
+
disable_existing_loggers: false
|
106 |
+
env: {}
|
107 |
+
mode: RUN
|
108 |
+
searchpath: []
|
109 |
+
callbacks: {}
|
110 |
+
output_subdir: .hydra
|
111 |
+
overrides:
|
112 |
+
hydra:
|
113 |
+
- hydra.mode=RUN
|
114 |
+
task:
|
115 |
+
- model=gpt2xl_1.5b
|
116 |
+
- task=gsm8k
|
117 |
+
job:
|
118 |
+
name: sft_pretrain_and_pushtohub
|
119 |
+
chdir: null
|
120 |
+
override_dirname: model=gpt2xl_1.5b,task=gsm8k
|
121 |
+
id: ???
|
122 |
+
num: ???
|
123 |
+
config_name: sft_train_and_pushtohub.yaml
|
124 |
+
env_set: {}
|
125 |
+
env_copy: []
|
126 |
+
config:
|
127 |
+
override_dirname:
|
128 |
+
kv_sep: '='
|
129 |
+
item_sep: ','
|
130 |
+
exclude_keys: []
|
131 |
+
runtime:
|
132 |
+
version: 1.3.2
|
133 |
+
version_base: '1.3'
|
134 |
+
cwd: /mnt/dlabscratch1/amani/LLM-RL
|
135 |
+
config_sources:
|
136 |
+
- path: hydra.conf
|
137 |
+
schema: pkg
|
138 |
+
provider: hydra
|
139 |
+
- path: /mnt/dlabscratch1/amani/LLM-RL/config
|
140 |
+
schema: file
|
141 |
+
provider: main
|
142 |
+
- path: ''
|
143 |
+
schema: structured
|
144 |
+
provider: schema
|
145 |
+
output_dir: /mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39
|
146 |
+
choices:
|
147 |
+
dataset_wrapper: default
|
148 |
+
model: gpt2xl_1.5b
|
149 |
+
task: gsm8k
|
150 |
+
hydra/env: default
|
151 |
+
hydra/callbacks: null
|
152 |
+
hydra/job_logging: default
|
153 |
+
hydra/hydra_logging: default
|
154 |
+
hydra/hydra_help: default
|
155 |
+
hydra/help: default
|
156 |
+
hydra/sweeper: basic
|
157 |
+
hydra/launcher: basic
|
158 |
+
hydra/output: default
|
159 |
+
verbose: false
|
.hydra/overrides.yaml
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
- model=gpt2xl_1.5b
|
2 |
+
- task=gsm8k
|
README.md
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: openai-community/gpt2-xl
|
3 |
+
library_name: transformers
|
4 |
+
model_name: 'gpt2-xl-gsm8k-epoch1-acc0-1. Always '
|
5 |
+
tags:
|
6 |
+
- generated_from_trainer
|
7 |
+
- trl
|
8 |
+
- sft
|
9 |
+
licence: license
|
10 |
+
---
|
11 |
+
|
12 |
+
# Model Card for gpt2-xl-gsm8k-epoch1-acc0
|
13 |
+
|
14 |
+
This model is a fine-tuned version of [openai-community/gpt2-xl](https://huggingface.co/openai-community/gpt2-xl).
|
15 |
+
It has been trained using [TRL](https://github.com/huggingface/trl).
|
16 |
+
|
17 |
+
## Quick start
|
18 |
+
|
19 |
+
```python
|
20 |
+
from transformers import pipeline
|
21 |
+
|
22 |
+
question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
|
23 |
+
generator = pipeline("text-generation", model="masani/2025-04-02_14-52-39", device="cuda")
|
24 |
+
output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
|
25 |
+
print(output["generated_text"])
|
26 |
+
```
|
27 |
+
|
28 |
+
## Training procedure
|
29 |
+
|
30 |
+
[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/epfl-dlab/sft_on_gsm8k/runs/e1n3xkh6)
|
31 |
+
|
32 |
+
|
33 |
+
This model was trained with SFT.
|
34 |
+
|
35 |
+
### Framework versions
|
36 |
+
|
37 |
+
- TRL: 0.15.2
|
38 |
+
- Transformers: 4.49.0
|
39 |
+
- Pytorch: 2.5.1
|
40 |
+
- Datasets: 3.1.0
|
41 |
+
- Tokenizers: 0.21.1
|
42 |
+
|
43 |
+
## Citations
|
44 |
+
|
45 |
+
|
46 |
+
|
47 |
+
Cite TRL as:
|
48 |
+
|
49 |
+
```bibtex
|
50 |
+
@misc{vonwerra2022trl,
|
51 |
+
title = {{TRL: Transformer Reinforcement Learning}},
|
52 |
+
author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
|
53 |
+
year = 2020,
|
54 |
+
journal = {GitHub repository},
|
55 |
+
publisher = {GitHub},
|
56 |
+
howpublished = {\url{https://github.com/huggingface/trl}}
|
57 |
+
}
|
58 |
+
```
|
config.json
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "openai-community/gpt2-xl",
|
3 |
+
"activation_function": "gelu_new",
|
4 |
+
"architectures": [
|
5 |
+
"GPT2LMHeadModel"
|
6 |
+
],
|
7 |
+
"attn_pdrop": 0.1,
|
8 |
+
"bos_token_id": 50256,
|
9 |
+
"embd_pdrop": 0.1,
|
10 |
+
"eos_token_id": 50256,
|
11 |
+
"initializer_range": 0.02,
|
12 |
+
"layer_norm_epsilon": 1e-05,
|
13 |
+
"model_type": "gpt2",
|
14 |
+
"n_ctx": 1024,
|
15 |
+
"n_embd": 1600,
|
16 |
+
"n_head": 25,
|
17 |
+
"n_inner": null,
|
18 |
+
"n_layer": 48,
|
19 |
+
"n_positions": 1024,
|
20 |
+
"output_past": true,
|
21 |
+
"reorder_and_upcast_attn": false,
|
22 |
+
"resid_pdrop": 0.1,
|
23 |
+
"scale_attn_by_inverse_layer_idx": false,
|
24 |
+
"scale_attn_weights": true,
|
25 |
+
"summary_activation": null,
|
26 |
+
"summary_first_dropout": 0.1,
|
27 |
+
"summary_proj_to_labels": true,
|
28 |
+
"summary_type": "cls_index",
|
29 |
+
"summary_use_proj": true,
|
30 |
+
"task_specific_params": {
|
31 |
+
"text-generation": {
|
32 |
+
"do_sample": true,
|
33 |
+
"max_length": 50
|
34 |
+
}
|
35 |
+
},
|
36 |
+
"torch_dtype": "float32",
|
37 |
+
"transformers_version": "4.49.0",
|
38 |
+
"use_cache": true,
|
39 |
+
"vocab_size": 50257
|
40 |
+
}
|
epoch1/config.json
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "openai-community/gpt2-xl",
|
3 |
+
"activation_function": "gelu_new",
|
4 |
+
"architectures": [
|
5 |
+
"GPT2LMHeadModel"
|
6 |
+
],
|
7 |
+
"attn_pdrop": 0.1,
|
8 |
+
"bos_token_id": 50256,
|
9 |
+
"embd_pdrop": 0.1,
|
10 |
+
"eos_token_id": 50256,
|
11 |
+
"initializer_range": 0.02,
|
12 |
+
"layer_norm_epsilon": 1e-05,
|
13 |
+
"model_type": "gpt2",
|
14 |
+
"n_ctx": 1024,
|
15 |
+
"n_embd": 1600,
|
16 |
+
"n_head": 25,
|
17 |
+
"n_inner": null,
|
18 |
+
"n_layer": 48,
|
19 |
+
"n_positions": 1024,
|
20 |
+
"output_past": true,
|
21 |
+
"reorder_and_upcast_attn": false,
|
22 |
+
"resid_pdrop": 0.1,
|
23 |
+
"scale_attn_by_inverse_layer_idx": false,
|
24 |
+
"scale_attn_weights": true,
|
25 |
+
"summary_activation": null,
|
26 |
+
"summary_first_dropout": 0.1,
|
27 |
+
"summary_proj_to_labels": true,
|
28 |
+
"summary_type": "cls_index",
|
29 |
+
"summary_use_proj": true,
|
30 |
+
"task_specific_params": {
|
31 |
+
"text-generation": {
|
32 |
+
"do_sample": true,
|
33 |
+
"max_length": 50
|
34 |
+
}
|
35 |
+
},
|
36 |
+
"torch_dtype": "float32",
|
37 |
+
"transformers_version": "4.49.0",
|
38 |
+
"use_cache": true,
|
39 |
+
"vocab_size": 50257
|
40 |
+
}
|
epoch1/generation_config.json
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_from_model_config": true,
|
3 |
+
"bos_token_id": 50256,
|
4 |
+
"eos_token_id": 50256,
|
5 |
+
"transformers_version": "4.49.0"
|
6 |
+
}
|
epoch1/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
epoch1/model-00001-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:45eaf1bc5dcfd0b4839330d6f467f69caab9ee368654c5c6222c59406e7cd79a
|
3 |
+
size 4959881464
|
epoch1/model-00002-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0918a2b866bcf0560ffd7519aed3b03d19320f777c8b7792408b6e5d65c5da2b
|
3 |
+
size 1270624096
|
epoch1/model.safetensors.index.json
ADDED
@@ -0,0 +1,587 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"metadata": {
|
3 |
+
"total_size": 6230444800
|
4 |
+
},
|
5 |
+
"weight_map": {
|
6 |
+
"transformer.h.0.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
7 |
+
"transformer.h.0.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
8 |
+
"transformer.h.0.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
9 |
+
"transformer.h.0.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
10 |
+
"transformer.h.0.ln_1.bias": "model-00001-of-00002.safetensors",
|
11 |
+
"transformer.h.0.ln_1.weight": "model-00001-of-00002.safetensors",
|
12 |
+
"transformer.h.0.ln_2.bias": "model-00001-of-00002.safetensors",
|
13 |
+
"transformer.h.0.ln_2.weight": "model-00001-of-00002.safetensors",
|
14 |
+
"transformer.h.0.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
15 |
+
"transformer.h.0.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
16 |
+
"transformer.h.0.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
17 |
+
"transformer.h.0.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
18 |
+
"transformer.h.1.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
19 |
+
"transformer.h.1.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
20 |
+
"transformer.h.1.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
21 |
+
"transformer.h.1.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
22 |
+
"transformer.h.1.ln_1.bias": "model-00001-of-00002.safetensors",
|
23 |
+
"transformer.h.1.ln_1.weight": "model-00001-of-00002.safetensors",
|
24 |
+
"transformer.h.1.ln_2.bias": "model-00001-of-00002.safetensors",
|
25 |
+
"transformer.h.1.ln_2.weight": "model-00001-of-00002.safetensors",
|
26 |
+
"transformer.h.1.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
27 |
+
"transformer.h.1.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
28 |
+
"transformer.h.1.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
29 |
+
"transformer.h.1.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
30 |
+
"transformer.h.10.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
31 |
+
"transformer.h.10.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
32 |
+
"transformer.h.10.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
33 |
+
"transformer.h.10.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
34 |
+
"transformer.h.10.ln_1.bias": "model-00001-of-00002.safetensors",
|
35 |
+
"transformer.h.10.ln_1.weight": "model-00001-of-00002.safetensors",
|
36 |
+
"transformer.h.10.ln_2.bias": "model-00001-of-00002.safetensors",
|
37 |
+
"transformer.h.10.ln_2.weight": "model-00001-of-00002.safetensors",
|
38 |
+
"transformer.h.10.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
39 |
+
"transformer.h.10.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
40 |
+
"transformer.h.10.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
41 |
+
"transformer.h.10.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
42 |
+
"transformer.h.11.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
43 |
+
"transformer.h.11.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
44 |
+
"transformer.h.11.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
45 |
+
"transformer.h.11.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
46 |
+
"transformer.h.11.ln_1.bias": "model-00001-of-00002.safetensors",
|
47 |
+
"transformer.h.11.ln_1.weight": "model-00001-of-00002.safetensors",
|
48 |
+
"transformer.h.11.ln_2.bias": "model-00001-of-00002.safetensors",
|
49 |
+
"transformer.h.11.ln_2.weight": "model-00001-of-00002.safetensors",
|
50 |
+
"transformer.h.11.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
51 |
+
"transformer.h.11.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
52 |
+
"transformer.h.11.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
53 |
+
"transformer.h.11.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
54 |
+
"transformer.h.12.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
55 |
+
"transformer.h.12.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
56 |
+
"transformer.h.12.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
57 |
+
"transformer.h.12.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
58 |
+
"transformer.h.12.ln_1.bias": "model-00001-of-00002.safetensors",
|
59 |
+
"transformer.h.12.ln_1.weight": "model-00001-of-00002.safetensors",
|
60 |
+
"transformer.h.12.ln_2.bias": "model-00001-of-00002.safetensors",
|
61 |
+
"transformer.h.12.ln_2.weight": "model-00001-of-00002.safetensors",
|
62 |
+
"transformer.h.12.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
63 |
+
"transformer.h.12.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
64 |
+
"transformer.h.12.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
65 |
+
"transformer.h.12.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
66 |
+
"transformer.h.13.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
67 |
+
"transformer.h.13.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
68 |
+
"transformer.h.13.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
69 |
+
"transformer.h.13.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
70 |
+
"transformer.h.13.ln_1.bias": "model-00001-of-00002.safetensors",
|
71 |
+
"transformer.h.13.ln_1.weight": "model-00001-of-00002.safetensors",
|
72 |
+
"transformer.h.13.ln_2.bias": "model-00001-of-00002.safetensors",
|
73 |
+
"transformer.h.13.ln_2.weight": "model-00001-of-00002.safetensors",
|
74 |
+
"transformer.h.13.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
75 |
+
"transformer.h.13.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
76 |
+
"transformer.h.13.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
77 |
+
"transformer.h.13.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
78 |
+
"transformer.h.14.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
79 |
+
"transformer.h.14.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
80 |
+
"transformer.h.14.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
81 |
+
"transformer.h.14.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
82 |
+
"transformer.h.14.ln_1.bias": "model-00001-of-00002.safetensors",
|
83 |
+
"transformer.h.14.ln_1.weight": "model-00001-of-00002.safetensors",
|
84 |
+
"transformer.h.14.ln_2.bias": "model-00001-of-00002.safetensors",
|
85 |
+
"transformer.h.14.ln_2.weight": "model-00001-of-00002.safetensors",
|
86 |
+
"transformer.h.14.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
87 |
+
"transformer.h.14.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
88 |
+
"transformer.h.14.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
89 |
+
"transformer.h.14.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
90 |
+
"transformer.h.15.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
91 |
+
"transformer.h.15.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
92 |
+
"transformer.h.15.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
93 |
+
"transformer.h.15.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
94 |
+
"transformer.h.15.ln_1.bias": "model-00001-of-00002.safetensors",
|
95 |
+
"transformer.h.15.ln_1.weight": "model-00001-of-00002.safetensors",
|
96 |
+
"transformer.h.15.ln_2.bias": "model-00001-of-00002.safetensors",
|
97 |
+
"transformer.h.15.ln_2.weight": "model-00001-of-00002.safetensors",
|
98 |
+
"transformer.h.15.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
99 |
+
"transformer.h.15.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
100 |
+
"transformer.h.15.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
101 |
+
"transformer.h.15.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
102 |
+
"transformer.h.16.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
103 |
+
"transformer.h.16.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
104 |
+
"transformer.h.16.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
105 |
+
"transformer.h.16.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
106 |
+
"transformer.h.16.ln_1.bias": "model-00001-of-00002.safetensors",
|
107 |
+
"transformer.h.16.ln_1.weight": "model-00001-of-00002.safetensors",
|
108 |
+
"transformer.h.16.ln_2.bias": "model-00001-of-00002.safetensors",
|
109 |
+
"transformer.h.16.ln_2.weight": "model-00001-of-00002.safetensors",
|
110 |
+
"transformer.h.16.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
111 |
+
"transformer.h.16.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
112 |
+
"transformer.h.16.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
113 |
+
"transformer.h.16.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
114 |
+
"transformer.h.17.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
115 |
+
"transformer.h.17.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
116 |
+
"transformer.h.17.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
117 |
+
"transformer.h.17.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
118 |
+
"transformer.h.17.ln_1.bias": "model-00001-of-00002.safetensors",
|
119 |
+
"transformer.h.17.ln_1.weight": "model-00001-of-00002.safetensors",
|
120 |
+
"transformer.h.17.ln_2.bias": "model-00001-of-00002.safetensors",
|
121 |
+
"transformer.h.17.ln_2.weight": "model-00001-of-00002.safetensors",
|
122 |
+
"transformer.h.17.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
123 |
+
"transformer.h.17.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
124 |
+
"transformer.h.17.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
125 |
+
"transformer.h.17.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
126 |
+
"transformer.h.18.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
127 |
+
"transformer.h.18.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
128 |
+
"transformer.h.18.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
129 |
+
"transformer.h.18.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
130 |
+
"transformer.h.18.ln_1.bias": "model-00001-of-00002.safetensors",
|
131 |
+
"transformer.h.18.ln_1.weight": "model-00001-of-00002.safetensors",
|
132 |
+
"transformer.h.18.ln_2.bias": "model-00001-of-00002.safetensors",
|
133 |
+
"transformer.h.18.ln_2.weight": "model-00001-of-00002.safetensors",
|
134 |
+
"transformer.h.18.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
135 |
+
"transformer.h.18.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
136 |
+
"transformer.h.18.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
137 |
+
"transformer.h.18.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
138 |
+
"transformer.h.19.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
139 |
+
"transformer.h.19.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
140 |
+
"transformer.h.19.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
141 |
+
"transformer.h.19.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
142 |
+
"transformer.h.19.ln_1.bias": "model-00001-of-00002.safetensors",
|
143 |
+
"transformer.h.19.ln_1.weight": "model-00001-of-00002.safetensors",
|
144 |
+
"transformer.h.19.ln_2.bias": "model-00001-of-00002.safetensors",
|
145 |
+
"transformer.h.19.ln_2.weight": "model-00001-of-00002.safetensors",
|
146 |
+
"transformer.h.19.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
147 |
+
"transformer.h.19.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
148 |
+
"transformer.h.19.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
149 |
+
"transformer.h.19.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
150 |
+
"transformer.h.2.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
151 |
+
"transformer.h.2.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
152 |
+
"transformer.h.2.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
153 |
+
"transformer.h.2.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
154 |
+
"transformer.h.2.ln_1.bias": "model-00001-of-00002.safetensors",
|
155 |
+
"transformer.h.2.ln_1.weight": "model-00001-of-00002.safetensors",
|
156 |
+
"transformer.h.2.ln_2.bias": "model-00001-of-00002.safetensors",
|
157 |
+
"transformer.h.2.ln_2.weight": "model-00001-of-00002.safetensors",
|
158 |
+
"transformer.h.2.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
159 |
+
"transformer.h.2.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
160 |
+
"transformer.h.2.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
161 |
+
"transformer.h.2.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
162 |
+
"transformer.h.20.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
163 |
+
"transformer.h.20.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
164 |
+
"transformer.h.20.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
165 |
+
"transformer.h.20.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
166 |
+
"transformer.h.20.ln_1.bias": "model-00001-of-00002.safetensors",
|
167 |
+
"transformer.h.20.ln_1.weight": "model-00001-of-00002.safetensors",
|
168 |
+
"transformer.h.20.ln_2.bias": "model-00001-of-00002.safetensors",
|
169 |
+
"transformer.h.20.ln_2.weight": "model-00001-of-00002.safetensors",
|
170 |
+
"transformer.h.20.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
171 |
+
"transformer.h.20.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
172 |
+
"transformer.h.20.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
173 |
+
"transformer.h.20.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
174 |
+
"transformer.h.21.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
175 |
+
"transformer.h.21.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
176 |
+
"transformer.h.21.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
177 |
+
"transformer.h.21.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
178 |
+
"transformer.h.21.ln_1.bias": "model-00001-of-00002.safetensors",
|
179 |
+
"transformer.h.21.ln_1.weight": "model-00001-of-00002.safetensors",
|
180 |
+
"transformer.h.21.ln_2.bias": "model-00001-of-00002.safetensors",
|
181 |
+
"transformer.h.21.ln_2.weight": "model-00001-of-00002.safetensors",
|
182 |
+
"transformer.h.21.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
183 |
+
"transformer.h.21.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
184 |
+
"transformer.h.21.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
185 |
+
"transformer.h.21.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
186 |
+
"transformer.h.22.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
187 |
+
"transformer.h.22.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
188 |
+
"transformer.h.22.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
189 |
+
"transformer.h.22.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
190 |
+
"transformer.h.22.ln_1.bias": "model-00001-of-00002.safetensors",
|
191 |
+
"transformer.h.22.ln_1.weight": "model-00001-of-00002.safetensors",
|
192 |
+
"transformer.h.22.ln_2.bias": "model-00001-of-00002.safetensors",
|
193 |
+
"transformer.h.22.ln_2.weight": "model-00001-of-00002.safetensors",
|
194 |
+
"transformer.h.22.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
195 |
+
"transformer.h.22.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
196 |
+
"transformer.h.22.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
197 |
+
"transformer.h.22.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
198 |
+
"transformer.h.23.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
199 |
+
"transformer.h.23.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
200 |
+
"transformer.h.23.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
201 |
+
"transformer.h.23.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
202 |
+
"transformer.h.23.ln_1.bias": "model-00001-of-00002.safetensors",
|
203 |
+
"transformer.h.23.ln_1.weight": "model-00001-of-00002.safetensors",
|
204 |
+
"transformer.h.23.ln_2.bias": "model-00001-of-00002.safetensors",
|
205 |
+
"transformer.h.23.ln_2.weight": "model-00001-of-00002.safetensors",
|
206 |
+
"transformer.h.23.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
207 |
+
"transformer.h.23.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
208 |
+
"transformer.h.23.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
209 |
+
"transformer.h.23.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
210 |
+
"transformer.h.24.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
211 |
+
"transformer.h.24.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
212 |
+
"transformer.h.24.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
213 |
+
"transformer.h.24.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
214 |
+
"transformer.h.24.ln_1.bias": "model-00001-of-00002.safetensors",
|
215 |
+
"transformer.h.24.ln_1.weight": "model-00001-of-00002.safetensors",
|
216 |
+
"transformer.h.24.ln_2.bias": "model-00001-of-00002.safetensors",
|
217 |
+
"transformer.h.24.ln_2.weight": "model-00001-of-00002.safetensors",
|
218 |
+
"transformer.h.24.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
219 |
+
"transformer.h.24.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
220 |
+
"transformer.h.24.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
221 |
+
"transformer.h.24.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
222 |
+
"transformer.h.25.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
223 |
+
"transformer.h.25.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
224 |
+
"transformer.h.25.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
225 |
+
"transformer.h.25.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
226 |
+
"transformer.h.25.ln_1.bias": "model-00001-of-00002.safetensors",
|
227 |
+
"transformer.h.25.ln_1.weight": "model-00001-of-00002.safetensors",
|
228 |
+
"transformer.h.25.ln_2.bias": "model-00001-of-00002.safetensors",
|
229 |
+
"transformer.h.25.ln_2.weight": "model-00001-of-00002.safetensors",
|
230 |
+
"transformer.h.25.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
231 |
+
"transformer.h.25.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
232 |
+
"transformer.h.25.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
233 |
+
"transformer.h.25.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
234 |
+
"transformer.h.26.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
235 |
+
"transformer.h.26.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
236 |
+
"transformer.h.26.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
237 |
+
"transformer.h.26.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
238 |
+
"transformer.h.26.ln_1.bias": "model-00001-of-00002.safetensors",
|
239 |
+
"transformer.h.26.ln_1.weight": "model-00001-of-00002.safetensors",
|
240 |
+
"transformer.h.26.ln_2.bias": "model-00001-of-00002.safetensors",
|
241 |
+
"transformer.h.26.ln_2.weight": "model-00001-of-00002.safetensors",
|
242 |
+
"transformer.h.26.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
243 |
+
"transformer.h.26.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
244 |
+
"transformer.h.26.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
245 |
+
"transformer.h.26.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
246 |
+
"transformer.h.27.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
247 |
+
"transformer.h.27.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
248 |
+
"transformer.h.27.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
249 |
+
"transformer.h.27.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
250 |
+
"transformer.h.27.ln_1.bias": "model-00001-of-00002.safetensors",
|
251 |
+
"transformer.h.27.ln_1.weight": "model-00001-of-00002.safetensors",
|
252 |
+
"transformer.h.27.ln_2.bias": "model-00001-of-00002.safetensors",
|
253 |
+
"transformer.h.27.ln_2.weight": "model-00001-of-00002.safetensors",
|
254 |
+
"transformer.h.27.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
255 |
+
"transformer.h.27.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
256 |
+
"transformer.h.27.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
257 |
+
"transformer.h.27.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
258 |
+
"transformer.h.28.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
259 |
+
"transformer.h.28.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
260 |
+
"transformer.h.28.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
261 |
+
"transformer.h.28.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
262 |
+
"transformer.h.28.ln_1.bias": "model-00001-of-00002.safetensors",
|
263 |
+
"transformer.h.28.ln_1.weight": "model-00001-of-00002.safetensors",
|
264 |
+
"transformer.h.28.ln_2.bias": "model-00001-of-00002.safetensors",
|
265 |
+
"transformer.h.28.ln_2.weight": "model-00001-of-00002.safetensors",
|
266 |
+
"transformer.h.28.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
267 |
+
"transformer.h.28.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
268 |
+
"transformer.h.28.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
269 |
+
"transformer.h.28.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
270 |
+
"transformer.h.29.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
271 |
+
"transformer.h.29.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
272 |
+
"transformer.h.29.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
273 |
+
"transformer.h.29.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
274 |
+
"transformer.h.29.ln_1.bias": "model-00001-of-00002.safetensors",
|
275 |
+
"transformer.h.29.ln_1.weight": "model-00001-of-00002.safetensors",
|
276 |
+
"transformer.h.29.ln_2.bias": "model-00001-of-00002.safetensors",
|
277 |
+
"transformer.h.29.ln_2.weight": "model-00001-of-00002.safetensors",
|
278 |
+
"transformer.h.29.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
279 |
+
"transformer.h.29.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
280 |
+
"transformer.h.29.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
281 |
+
"transformer.h.29.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
282 |
+
"transformer.h.3.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
283 |
+
"transformer.h.3.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
284 |
+
"transformer.h.3.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
285 |
+
"transformer.h.3.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
286 |
+
"transformer.h.3.ln_1.bias": "model-00001-of-00002.safetensors",
|
287 |
+
"transformer.h.3.ln_1.weight": "model-00001-of-00002.safetensors",
|
288 |
+
"transformer.h.3.ln_2.bias": "model-00001-of-00002.safetensors",
|
289 |
+
"transformer.h.3.ln_2.weight": "model-00001-of-00002.safetensors",
|
290 |
+
"transformer.h.3.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
291 |
+
"transformer.h.3.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
292 |
+
"transformer.h.3.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
293 |
+
"transformer.h.3.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
294 |
+
"transformer.h.30.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
295 |
+
"transformer.h.30.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
296 |
+
"transformer.h.30.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
297 |
+
"transformer.h.30.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
298 |
+
"transformer.h.30.ln_1.bias": "model-00001-of-00002.safetensors",
|
299 |
+
"transformer.h.30.ln_1.weight": "model-00001-of-00002.safetensors",
|
300 |
+
"transformer.h.30.ln_2.bias": "model-00001-of-00002.safetensors",
|
301 |
+
"transformer.h.30.ln_2.weight": "model-00001-of-00002.safetensors",
|
302 |
+
"transformer.h.30.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
303 |
+
"transformer.h.30.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
304 |
+
"transformer.h.30.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
305 |
+
"transformer.h.30.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
306 |
+
"transformer.h.31.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
307 |
+
"transformer.h.31.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
308 |
+
"transformer.h.31.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
309 |
+
"transformer.h.31.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
310 |
+
"transformer.h.31.ln_1.bias": "model-00001-of-00002.safetensors",
|
311 |
+
"transformer.h.31.ln_1.weight": "model-00001-of-00002.safetensors",
|
312 |
+
"transformer.h.31.ln_2.bias": "model-00001-of-00002.safetensors",
|
313 |
+
"transformer.h.31.ln_2.weight": "model-00001-of-00002.safetensors",
|
314 |
+
"transformer.h.31.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
315 |
+
"transformer.h.31.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
316 |
+
"transformer.h.31.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
317 |
+
"transformer.h.31.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
318 |
+
"transformer.h.32.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
319 |
+
"transformer.h.32.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
320 |
+
"transformer.h.32.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
321 |
+
"transformer.h.32.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
322 |
+
"transformer.h.32.ln_1.bias": "model-00001-of-00002.safetensors",
|
323 |
+
"transformer.h.32.ln_1.weight": "model-00001-of-00002.safetensors",
|
324 |
+
"transformer.h.32.ln_2.bias": "model-00001-of-00002.safetensors",
|
325 |
+
"transformer.h.32.ln_2.weight": "model-00001-of-00002.safetensors",
|
326 |
+
"transformer.h.32.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
327 |
+
"transformer.h.32.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
328 |
+
"transformer.h.32.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
329 |
+
"transformer.h.32.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
330 |
+
"transformer.h.33.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
331 |
+
"transformer.h.33.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
332 |
+
"transformer.h.33.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
333 |
+
"transformer.h.33.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
334 |
+
"transformer.h.33.ln_1.bias": "model-00001-of-00002.safetensors",
|
335 |
+
"transformer.h.33.ln_1.weight": "model-00001-of-00002.safetensors",
|
336 |
+
"transformer.h.33.ln_2.bias": "model-00001-of-00002.safetensors",
|
337 |
+
"transformer.h.33.ln_2.weight": "model-00001-of-00002.safetensors",
|
338 |
+
"transformer.h.33.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
339 |
+
"transformer.h.33.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
340 |
+
"transformer.h.33.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
341 |
+
"transformer.h.33.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
342 |
+
"transformer.h.34.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
343 |
+
"transformer.h.34.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
344 |
+
"transformer.h.34.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
345 |
+
"transformer.h.34.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
346 |
+
"transformer.h.34.ln_1.bias": "model-00001-of-00002.safetensors",
|
347 |
+
"transformer.h.34.ln_1.weight": "model-00001-of-00002.safetensors",
|
348 |
+
"transformer.h.34.ln_2.bias": "model-00001-of-00002.safetensors",
|
349 |
+
"transformer.h.34.ln_2.weight": "model-00001-of-00002.safetensors",
|
350 |
+
"transformer.h.34.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
351 |
+
"transformer.h.34.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
352 |
+
"transformer.h.34.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
353 |
+
"transformer.h.34.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
354 |
+
"transformer.h.35.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
355 |
+
"transformer.h.35.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
356 |
+
"transformer.h.35.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
357 |
+
"transformer.h.35.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
358 |
+
"transformer.h.35.ln_1.bias": "model-00001-of-00002.safetensors",
|
359 |
+
"transformer.h.35.ln_1.weight": "model-00001-of-00002.safetensors",
|
360 |
+
"transformer.h.35.ln_2.bias": "model-00001-of-00002.safetensors",
|
361 |
+
"transformer.h.35.ln_2.weight": "model-00001-of-00002.safetensors",
|
362 |
+
"transformer.h.35.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
363 |
+
"transformer.h.35.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
364 |
+
"transformer.h.35.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
365 |
+
"transformer.h.35.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
366 |
+
"transformer.h.36.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
367 |
+
"transformer.h.36.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
368 |
+
"transformer.h.36.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
369 |
+
"transformer.h.36.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
370 |
+
"transformer.h.36.ln_1.bias": "model-00001-of-00002.safetensors",
|
371 |
+
"transformer.h.36.ln_1.weight": "model-00001-of-00002.safetensors",
|
372 |
+
"transformer.h.36.ln_2.bias": "model-00001-of-00002.safetensors",
|
373 |
+
"transformer.h.36.ln_2.weight": "model-00001-of-00002.safetensors",
|
374 |
+
"transformer.h.36.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
375 |
+
"transformer.h.36.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
376 |
+
"transformer.h.36.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
377 |
+
"transformer.h.36.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
378 |
+
"transformer.h.37.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
379 |
+
"transformer.h.37.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
380 |
+
"transformer.h.37.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
381 |
+
"transformer.h.37.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
382 |
+
"transformer.h.37.ln_1.bias": "model-00001-of-00002.safetensors",
|
383 |
+
"transformer.h.37.ln_1.weight": "model-00001-of-00002.safetensors",
|
384 |
+
"transformer.h.37.ln_2.bias": "model-00001-of-00002.safetensors",
|
385 |
+
"transformer.h.37.ln_2.weight": "model-00001-of-00002.safetensors",
|
386 |
+
"transformer.h.37.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
387 |
+
"transformer.h.37.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
388 |
+
"transformer.h.37.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
389 |
+
"transformer.h.37.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
390 |
+
"transformer.h.38.attn.c_attn.bias": "model-00002-of-00002.safetensors",
|
391 |
+
"transformer.h.38.attn.c_attn.weight": "model-00002-of-00002.safetensors",
|
392 |
+
"transformer.h.38.attn.c_proj.bias": "model-00002-of-00002.safetensors",
|
393 |
+
"transformer.h.38.attn.c_proj.weight": "model-00002-of-00002.safetensors",
|
394 |
+
"transformer.h.38.ln_1.bias": "model-00002-of-00002.safetensors",
|
395 |
+
"transformer.h.38.ln_1.weight": "model-00002-of-00002.safetensors",
|
396 |
+
"transformer.h.38.ln_2.bias": "model-00002-of-00002.safetensors",
|
397 |
+
"transformer.h.38.ln_2.weight": "model-00002-of-00002.safetensors",
|
398 |
+
"transformer.h.38.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
|
399 |
+
"transformer.h.38.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
|
400 |
+
"transformer.h.38.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
401 |
+
"transformer.h.38.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
402 |
+
"transformer.h.39.attn.c_attn.bias": "model-00002-of-00002.safetensors",
|
403 |
+
"transformer.h.39.attn.c_attn.weight": "model-00002-of-00002.safetensors",
|
404 |
+
"transformer.h.39.attn.c_proj.bias": "model-00002-of-00002.safetensors",
|
405 |
+
"transformer.h.39.attn.c_proj.weight": "model-00002-of-00002.safetensors",
|
406 |
+
"transformer.h.39.ln_1.bias": "model-00002-of-00002.safetensors",
|
407 |
+
"transformer.h.39.ln_1.weight": "model-00002-of-00002.safetensors",
|
408 |
+
"transformer.h.39.ln_2.bias": "model-00002-of-00002.safetensors",
|
409 |
+
"transformer.h.39.ln_2.weight": "model-00002-of-00002.safetensors",
|
410 |
+
"transformer.h.39.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
|
411 |
+
"transformer.h.39.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
|
412 |
+
"transformer.h.39.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
413 |
+
"transformer.h.39.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
414 |
+
"transformer.h.4.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
415 |
+
"transformer.h.4.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
416 |
+
"transformer.h.4.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
417 |
+
"transformer.h.4.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
418 |
+
"transformer.h.4.ln_1.bias": "model-00001-of-00002.safetensors",
|
419 |
+
"transformer.h.4.ln_1.weight": "model-00001-of-00002.safetensors",
|
420 |
+
"transformer.h.4.ln_2.bias": "model-00001-of-00002.safetensors",
|
421 |
+
"transformer.h.4.ln_2.weight": "model-00001-of-00002.safetensors",
|
422 |
+
"transformer.h.4.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
423 |
+
"transformer.h.4.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
424 |
+
"transformer.h.4.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
425 |
+
"transformer.h.4.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
426 |
+
"transformer.h.40.attn.c_attn.bias": "model-00002-of-00002.safetensors",
|
427 |
+
"transformer.h.40.attn.c_attn.weight": "model-00002-of-00002.safetensors",
|
428 |
+
"transformer.h.40.attn.c_proj.bias": "model-00002-of-00002.safetensors",
|
429 |
+
"transformer.h.40.attn.c_proj.weight": "model-00002-of-00002.safetensors",
|
430 |
+
"transformer.h.40.ln_1.bias": "model-00002-of-00002.safetensors",
|
431 |
+
"transformer.h.40.ln_1.weight": "model-00002-of-00002.safetensors",
|
432 |
+
"transformer.h.40.ln_2.bias": "model-00002-of-00002.safetensors",
|
433 |
+
"transformer.h.40.ln_2.weight": "model-00002-of-00002.safetensors",
|
434 |
+
"transformer.h.40.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
|
435 |
+
"transformer.h.40.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
|
436 |
+
"transformer.h.40.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
437 |
+
"transformer.h.40.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
438 |
+
"transformer.h.41.attn.c_attn.bias": "model-00002-of-00002.safetensors",
|
439 |
+
"transformer.h.41.attn.c_attn.weight": "model-00002-of-00002.safetensors",
|
440 |
+
"transformer.h.41.attn.c_proj.bias": "model-00002-of-00002.safetensors",
|
441 |
+
"transformer.h.41.attn.c_proj.weight": "model-00002-of-00002.safetensors",
|
442 |
+
"transformer.h.41.ln_1.bias": "model-00002-of-00002.safetensors",
|
443 |
+
"transformer.h.41.ln_1.weight": "model-00002-of-00002.safetensors",
|
444 |
+
"transformer.h.41.ln_2.bias": "model-00002-of-00002.safetensors",
|
445 |
+
"transformer.h.41.ln_2.weight": "model-00002-of-00002.safetensors",
|
446 |
+
"transformer.h.41.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
|
447 |
+
"transformer.h.41.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
|
448 |
+
"transformer.h.41.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
449 |
+
"transformer.h.41.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
450 |
+
"transformer.h.42.attn.c_attn.bias": "model-00002-of-00002.safetensors",
|
451 |
+
"transformer.h.42.attn.c_attn.weight": "model-00002-of-00002.safetensors",
|
452 |
+
"transformer.h.42.attn.c_proj.bias": "model-00002-of-00002.safetensors",
|
453 |
+
"transformer.h.42.attn.c_proj.weight": "model-00002-of-00002.safetensors",
|
454 |
+
"transformer.h.42.ln_1.bias": "model-00002-of-00002.safetensors",
|
455 |
+
"transformer.h.42.ln_1.weight": "model-00002-of-00002.safetensors",
|
456 |
+
"transformer.h.42.ln_2.bias": "model-00002-of-00002.safetensors",
|
457 |
+
"transformer.h.42.ln_2.weight": "model-00002-of-00002.safetensors",
|
458 |
+
"transformer.h.42.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
|
459 |
+
"transformer.h.42.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
|
460 |
+
"transformer.h.42.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
461 |
+
"transformer.h.42.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
462 |
+
"transformer.h.43.attn.c_attn.bias": "model-00002-of-00002.safetensors",
|
463 |
+
"transformer.h.43.attn.c_attn.weight": "model-00002-of-00002.safetensors",
|
464 |
+
"transformer.h.43.attn.c_proj.bias": "model-00002-of-00002.safetensors",
|
465 |
+
"transformer.h.43.attn.c_proj.weight": "model-00002-of-00002.safetensors",
|
466 |
+
"transformer.h.43.ln_1.bias": "model-00002-of-00002.safetensors",
|
467 |
+
"transformer.h.43.ln_1.weight": "model-00002-of-00002.safetensors",
|
468 |
+
"transformer.h.43.ln_2.bias": "model-00002-of-00002.safetensors",
|
469 |
+
"transformer.h.43.ln_2.weight": "model-00002-of-00002.safetensors",
|
470 |
+
"transformer.h.43.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
|
471 |
+
"transformer.h.43.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
|
472 |
+
"transformer.h.43.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
473 |
+
"transformer.h.43.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
474 |
+
"transformer.h.44.attn.c_attn.bias": "model-00002-of-00002.safetensors",
|
475 |
+
"transformer.h.44.attn.c_attn.weight": "model-00002-of-00002.safetensors",
|
476 |
+
"transformer.h.44.attn.c_proj.bias": "model-00002-of-00002.safetensors",
|
477 |
+
"transformer.h.44.attn.c_proj.weight": "model-00002-of-00002.safetensors",
|
478 |
+
"transformer.h.44.ln_1.bias": "model-00002-of-00002.safetensors",
|
479 |
+
"transformer.h.44.ln_1.weight": "model-00002-of-00002.safetensors",
|
480 |
+
"transformer.h.44.ln_2.bias": "model-00002-of-00002.safetensors",
|
481 |
+
"transformer.h.44.ln_2.weight": "model-00002-of-00002.safetensors",
|
482 |
+
"transformer.h.44.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
|
483 |
+
"transformer.h.44.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
|
484 |
+
"transformer.h.44.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
485 |
+
"transformer.h.44.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
486 |
+
"transformer.h.45.attn.c_attn.bias": "model-00002-of-00002.safetensors",
|
487 |
+
"transformer.h.45.attn.c_attn.weight": "model-00002-of-00002.safetensors",
|
488 |
+
"transformer.h.45.attn.c_proj.bias": "model-00002-of-00002.safetensors",
|
489 |
+
"transformer.h.45.attn.c_proj.weight": "model-00002-of-00002.safetensors",
|
490 |
+
"transformer.h.45.ln_1.bias": "model-00002-of-00002.safetensors",
|
491 |
+
"transformer.h.45.ln_1.weight": "model-00002-of-00002.safetensors",
|
492 |
+
"transformer.h.45.ln_2.bias": "model-00002-of-00002.safetensors",
|
493 |
+
"transformer.h.45.ln_2.weight": "model-00002-of-00002.safetensors",
|
494 |
+
"transformer.h.45.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
|
495 |
+
"transformer.h.45.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
|
496 |
+
"transformer.h.45.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
497 |
+
"transformer.h.45.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
498 |
+
"transformer.h.46.attn.c_attn.bias": "model-00002-of-00002.safetensors",
|
499 |
+
"transformer.h.46.attn.c_attn.weight": "model-00002-of-00002.safetensors",
|
500 |
+
"transformer.h.46.attn.c_proj.bias": "model-00002-of-00002.safetensors",
|
501 |
+
"transformer.h.46.attn.c_proj.weight": "model-00002-of-00002.safetensors",
|
502 |
+
"transformer.h.46.ln_1.bias": "model-00002-of-00002.safetensors",
|
503 |
+
"transformer.h.46.ln_1.weight": "model-00002-of-00002.safetensors",
|
504 |
+
"transformer.h.46.ln_2.bias": "model-00002-of-00002.safetensors",
|
505 |
+
"transformer.h.46.ln_2.weight": "model-00002-of-00002.safetensors",
|
506 |
+
"transformer.h.46.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
|
507 |
+
"transformer.h.46.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
|
508 |
+
"transformer.h.46.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
509 |
+
"transformer.h.46.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
510 |
+
"transformer.h.47.attn.c_attn.bias": "model-00002-of-00002.safetensors",
|
511 |
+
"transformer.h.47.attn.c_attn.weight": "model-00002-of-00002.safetensors",
|
512 |
+
"transformer.h.47.attn.c_proj.bias": "model-00002-of-00002.safetensors",
|
513 |
+
"transformer.h.47.attn.c_proj.weight": "model-00002-of-00002.safetensors",
|
514 |
+
"transformer.h.47.ln_1.bias": "model-00002-of-00002.safetensors",
|
515 |
+
"transformer.h.47.ln_1.weight": "model-00002-of-00002.safetensors",
|
516 |
+
"transformer.h.47.ln_2.bias": "model-00002-of-00002.safetensors",
|
517 |
+
"transformer.h.47.ln_2.weight": "model-00002-of-00002.safetensors",
|
518 |
+
"transformer.h.47.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
|
519 |
+
"transformer.h.47.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
|
520 |
+
"transformer.h.47.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
521 |
+
"transformer.h.47.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
522 |
+
"transformer.h.5.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
523 |
+
"transformer.h.5.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
524 |
+
"transformer.h.5.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
525 |
+
"transformer.h.5.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
526 |
+
"transformer.h.5.ln_1.bias": "model-00001-of-00002.safetensors",
|
527 |
+
"transformer.h.5.ln_1.weight": "model-00001-of-00002.safetensors",
|
528 |
+
"transformer.h.5.ln_2.bias": "model-00001-of-00002.safetensors",
|
529 |
+
"transformer.h.5.ln_2.weight": "model-00001-of-00002.safetensors",
|
530 |
+
"transformer.h.5.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
531 |
+
"transformer.h.5.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
532 |
+
"transformer.h.5.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
533 |
+
"transformer.h.5.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
534 |
+
"transformer.h.6.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
535 |
+
"transformer.h.6.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
536 |
+
"transformer.h.6.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
537 |
+
"transformer.h.6.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
538 |
+
"transformer.h.6.ln_1.bias": "model-00001-of-00002.safetensors",
|
539 |
+
"transformer.h.6.ln_1.weight": "model-00001-of-00002.safetensors",
|
540 |
+
"transformer.h.6.ln_2.bias": "model-00001-of-00002.safetensors",
|
541 |
+
"transformer.h.6.ln_2.weight": "model-00001-of-00002.safetensors",
|
542 |
+
"transformer.h.6.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
543 |
+
"transformer.h.6.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
544 |
+
"transformer.h.6.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
545 |
+
"transformer.h.6.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
546 |
+
"transformer.h.7.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
547 |
+
"transformer.h.7.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
548 |
+
"transformer.h.7.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
549 |
+
"transformer.h.7.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
550 |
+
"transformer.h.7.ln_1.bias": "model-00001-of-00002.safetensors",
|
551 |
+
"transformer.h.7.ln_1.weight": "model-00001-of-00002.safetensors",
|
552 |
+
"transformer.h.7.ln_2.bias": "model-00001-of-00002.safetensors",
|
553 |
+
"transformer.h.7.ln_2.weight": "model-00001-of-00002.safetensors",
|
554 |
+
"transformer.h.7.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
555 |
+
"transformer.h.7.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
556 |
+
"transformer.h.7.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
557 |
+
"transformer.h.7.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
558 |
+
"transformer.h.8.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
559 |
+
"transformer.h.8.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
560 |
+
"transformer.h.8.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
561 |
+
"transformer.h.8.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
562 |
+
"transformer.h.8.ln_1.bias": "model-00001-of-00002.safetensors",
|
563 |
+
"transformer.h.8.ln_1.weight": "model-00001-of-00002.safetensors",
|
564 |
+
"transformer.h.8.ln_2.bias": "model-00001-of-00002.safetensors",
|
565 |
+
"transformer.h.8.ln_2.weight": "model-00001-of-00002.safetensors",
|
566 |
+
"transformer.h.8.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
567 |
+
"transformer.h.8.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
568 |
+
"transformer.h.8.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
569 |
+
"transformer.h.8.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
570 |
+
"transformer.h.9.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
571 |
+
"transformer.h.9.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
572 |
+
"transformer.h.9.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
573 |
+
"transformer.h.9.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
574 |
+
"transformer.h.9.ln_1.bias": "model-00001-of-00002.safetensors",
|
575 |
+
"transformer.h.9.ln_1.weight": "model-00001-of-00002.safetensors",
|
576 |
+
"transformer.h.9.ln_2.bias": "model-00001-of-00002.safetensors",
|
577 |
+
"transformer.h.9.ln_2.weight": "model-00001-of-00002.safetensors",
|
578 |
+
"transformer.h.9.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
579 |
+
"transformer.h.9.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
580 |
+
"transformer.h.9.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
581 |
+
"transformer.h.9.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
582 |
+
"transformer.ln_f.bias": "model-00002-of-00002.safetensors",
|
583 |
+
"transformer.ln_f.weight": "model-00002-of-00002.safetensors",
|
584 |
+
"transformer.wpe.weight": "model-00001-of-00002.safetensors",
|
585 |
+
"transformer.wte.weight": "model-00001-of-00002.safetensors"
|
586 |
+
}
|
587 |
+
}
|
epoch1/special_tokens_map.json
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": "<|endoftext|>",
|
3 |
+
"eos_token": "<|endoftext|>",
|
4 |
+
"pad_token": "<|endoftext|>",
|
5 |
+
"unk_token": "<|endoftext|>"
|
6 |
+
}
|
epoch1/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
epoch1/tokenizer_config.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_prefix_space": false,
|
3 |
+
"added_tokens_decoder": {
|
4 |
+
"50256": {
|
5 |
+
"content": "<|endoftext|>",
|
6 |
+
"lstrip": false,
|
7 |
+
"normalized": true,
|
8 |
+
"rstrip": false,
|
9 |
+
"single_word": false,
|
10 |
+
"special": true
|
11 |
+
}
|
12 |
+
},
|
13 |
+
"bos_token": "<|endoftext|>",
|
14 |
+
"clean_up_tokenization_spaces": false,
|
15 |
+
"eos_token": "<|endoftext|>",
|
16 |
+
"extra_special_tokens": {},
|
17 |
+
"model_max_length": 1024,
|
18 |
+
"pad_token": "<|endoftext|>",
|
19 |
+
"padding_side": "left",
|
20 |
+
"tokenizer_class": "GPT2Tokenizer",
|
21 |
+
"unk_token": "<|endoftext|>"
|
22 |
+
}
|
epoch1/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f14f3c6d299bfb369a2106aab54c59a032e03c366e1d1fecdcf02f954b66a25b
|
3 |
+
size 5624
|
epoch1/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
generation_config.json
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_from_model_config": true,
|
3 |
+
"bos_token_id": 50256,
|
4 |
+
"eos_token_id": 50256,
|
5 |
+
"transformers_version": "4.49.0"
|
6 |
+
}
|
merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
model-00001-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:45eaf1bc5dcfd0b4839330d6f467f69caab9ee368654c5c6222c59406e7cd79a
|
3 |
+
size 4959881464
|
model-00002-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0918a2b866bcf0560ffd7519aed3b03d19320f777c8b7792408b6e5d65c5da2b
|
3 |
+
size 1270624096
|
model.safetensors.index.json
ADDED
@@ -0,0 +1,587 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"metadata": {
|
3 |
+
"total_size": 6230444800
|
4 |
+
},
|
5 |
+
"weight_map": {
|
6 |
+
"transformer.h.0.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
7 |
+
"transformer.h.0.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
8 |
+
"transformer.h.0.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
9 |
+
"transformer.h.0.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
10 |
+
"transformer.h.0.ln_1.bias": "model-00001-of-00002.safetensors",
|
11 |
+
"transformer.h.0.ln_1.weight": "model-00001-of-00002.safetensors",
|
12 |
+
"transformer.h.0.ln_2.bias": "model-00001-of-00002.safetensors",
|
13 |
+
"transformer.h.0.ln_2.weight": "model-00001-of-00002.safetensors",
|
14 |
+
"transformer.h.0.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
15 |
+
"transformer.h.0.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
16 |
+
"transformer.h.0.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
17 |
+
"transformer.h.0.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
18 |
+
"transformer.h.1.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
19 |
+
"transformer.h.1.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
20 |
+
"transformer.h.1.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
21 |
+
"transformer.h.1.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
22 |
+
"transformer.h.1.ln_1.bias": "model-00001-of-00002.safetensors",
|
23 |
+
"transformer.h.1.ln_1.weight": "model-00001-of-00002.safetensors",
|
24 |
+
"transformer.h.1.ln_2.bias": "model-00001-of-00002.safetensors",
|
25 |
+
"transformer.h.1.ln_2.weight": "model-00001-of-00002.safetensors",
|
26 |
+
"transformer.h.1.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
27 |
+
"transformer.h.1.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
28 |
+
"transformer.h.1.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
29 |
+
"transformer.h.1.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
30 |
+
"transformer.h.10.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
31 |
+
"transformer.h.10.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
32 |
+
"transformer.h.10.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
33 |
+
"transformer.h.10.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
34 |
+
"transformer.h.10.ln_1.bias": "model-00001-of-00002.safetensors",
|
35 |
+
"transformer.h.10.ln_1.weight": "model-00001-of-00002.safetensors",
|
36 |
+
"transformer.h.10.ln_2.bias": "model-00001-of-00002.safetensors",
|
37 |
+
"transformer.h.10.ln_2.weight": "model-00001-of-00002.safetensors",
|
38 |
+
"transformer.h.10.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
39 |
+
"transformer.h.10.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
40 |
+
"transformer.h.10.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
41 |
+
"transformer.h.10.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
42 |
+
"transformer.h.11.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
43 |
+
"transformer.h.11.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
44 |
+
"transformer.h.11.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
45 |
+
"transformer.h.11.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
46 |
+
"transformer.h.11.ln_1.bias": "model-00001-of-00002.safetensors",
|
47 |
+
"transformer.h.11.ln_1.weight": "model-00001-of-00002.safetensors",
|
48 |
+
"transformer.h.11.ln_2.bias": "model-00001-of-00002.safetensors",
|
49 |
+
"transformer.h.11.ln_2.weight": "model-00001-of-00002.safetensors",
|
50 |
+
"transformer.h.11.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
51 |
+
"transformer.h.11.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
52 |
+
"transformer.h.11.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
53 |
+
"transformer.h.11.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
54 |
+
"transformer.h.12.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
55 |
+
"transformer.h.12.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
56 |
+
"transformer.h.12.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
57 |
+
"transformer.h.12.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
58 |
+
"transformer.h.12.ln_1.bias": "model-00001-of-00002.safetensors",
|
59 |
+
"transformer.h.12.ln_1.weight": "model-00001-of-00002.safetensors",
|
60 |
+
"transformer.h.12.ln_2.bias": "model-00001-of-00002.safetensors",
|
61 |
+
"transformer.h.12.ln_2.weight": "model-00001-of-00002.safetensors",
|
62 |
+
"transformer.h.12.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
63 |
+
"transformer.h.12.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
64 |
+
"transformer.h.12.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
65 |
+
"transformer.h.12.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
66 |
+
"transformer.h.13.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
67 |
+
"transformer.h.13.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
68 |
+
"transformer.h.13.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
69 |
+
"transformer.h.13.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
70 |
+
"transformer.h.13.ln_1.bias": "model-00001-of-00002.safetensors",
|
71 |
+
"transformer.h.13.ln_1.weight": "model-00001-of-00002.safetensors",
|
72 |
+
"transformer.h.13.ln_2.bias": "model-00001-of-00002.safetensors",
|
73 |
+
"transformer.h.13.ln_2.weight": "model-00001-of-00002.safetensors",
|
74 |
+
"transformer.h.13.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
75 |
+
"transformer.h.13.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
76 |
+
"transformer.h.13.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
77 |
+
"transformer.h.13.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
78 |
+
"transformer.h.14.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
79 |
+
"transformer.h.14.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
80 |
+
"transformer.h.14.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
81 |
+
"transformer.h.14.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
82 |
+
"transformer.h.14.ln_1.bias": "model-00001-of-00002.safetensors",
|
83 |
+
"transformer.h.14.ln_1.weight": "model-00001-of-00002.safetensors",
|
84 |
+
"transformer.h.14.ln_2.bias": "model-00001-of-00002.safetensors",
|
85 |
+
"transformer.h.14.ln_2.weight": "model-00001-of-00002.safetensors",
|
86 |
+
"transformer.h.14.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
87 |
+
"transformer.h.14.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
88 |
+
"transformer.h.14.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
89 |
+
"transformer.h.14.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
90 |
+
"transformer.h.15.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
91 |
+
"transformer.h.15.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
92 |
+
"transformer.h.15.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
93 |
+
"transformer.h.15.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
94 |
+
"transformer.h.15.ln_1.bias": "model-00001-of-00002.safetensors",
|
95 |
+
"transformer.h.15.ln_1.weight": "model-00001-of-00002.safetensors",
|
96 |
+
"transformer.h.15.ln_2.bias": "model-00001-of-00002.safetensors",
|
97 |
+
"transformer.h.15.ln_2.weight": "model-00001-of-00002.safetensors",
|
98 |
+
"transformer.h.15.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
99 |
+
"transformer.h.15.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
100 |
+
"transformer.h.15.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
101 |
+
"transformer.h.15.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
102 |
+
"transformer.h.16.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
103 |
+
"transformer.h.16.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
104 |
+
"transformer.h.16.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
105 |
+
"transformer.h.16.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
106 |
+
"transformer.h.16.ln_1.bias": "model-00001-of-00002.safetensors",
|
107 |
+
"transformer.h.16.ln_1.weight": "model-00001-of-00002.safetensors",
|
108 |
+
"transformer.h.16.ln_2.bias": "model-00001-of-00002.safetensors",
|
109 |
+
"transformer.h.16.ln_2.weight": "model-00001-of-00002.safetensors",
|
110 |
+
"transformer.h.16.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
111 |
+
"transformer.h.16.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
112 |
+
"transformer.h.16.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
113 |
+
"transformer.h.16.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
114 |
+
"transformer.h.17.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
115 |
+
"transformer.h.17.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
116 |
+
"transformer.h.17.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
117 |
+
"transformer.h.17.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
118 |
+
"transformer.h.17.ln_1.bias": "model-00001-of-00002.safetensors",
|
119 |
+
"transformer.h.17.ln_1.weight": "model-00001-of-00002.safetensors",
|
120 |
+
"transformer.h.17.ln_2.bias": "model-00001-of-00002.safetensors",
|
121 |
+
"transformer.h.17.ln_2.weight": "model-00001-of-00002.safetensors",
|
122 |
+
"transformer.h.17.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
123 |
+
"transformer.h.17.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
124 |
+
"transformer.h.17.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
125 |
+
"transformer.h.17.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
126 |
+
"transformer.h.18.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
127 |
+
"transformer.h.18.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
128 |
+
"transformer.h.18.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
129 |
+
"transformer.h.18.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
130 |
+
"transformer.h.18.ln_1.bias": "model-00001-of-00002.safetensors",
|
131 |
+
"transformer.h.18.ln_1.weight": "model-00001-of-00002.safetensors",
|
132 |
+
"transformer.h.18.ln_2.bias": "model-00001-of-00002.safetensors",
|
133 |
+
"transformer.h.18.ln_2.weight": "model-00001-of-00002.safetensors",
|
134 |
+
"transformer.h.18.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
135 |
+
"transformer.h.18.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
136 |
+
"transformer.h.18.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
137 |
+
"transformer.h.18.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
138 |
+
"transformer.h.19.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
139 |
+
"transformer.h.19.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
140 |
+
"transformer.h.19.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
141 |
+
"transformer.h.19.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
142 |
+
"transformer.h.19.ln_1.bias": "model-00001-of-00002.safetensors",
|
143 |
+
"transformer.h.19.ln_1.weight": "model-00001-of-00002.safetensors",
|
144 |
+
"transformer.h.19.ln_2.bias": "model-00001-of-00002.safetensors",
|
145 |
+
"transformer.h.19.ln_2.weight": "model-00001-of-00002.safetensors",
|
146 |
+
"transformer.h.19.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
147 |
+
"transformer.h.19.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
148 |
+
"transformer.h.19.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
149 |
+
"transformer.h.19.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
150 |
+
"transformer.h.2.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
151 |
+
"transformer.h.2.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
152 |
+
"transformer.h.2.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
153 |
+
"transformer.h.2.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
154 |
+
"transformer.h.2.ln_1.bias": "model-00001-of-00002.safetensors",
|
155 |
+
"transformer.h.2.ln_1.weight": "model-00001-of-00002.safetensors",
|
156 |
+
"transformer.h.2.ln_2.bias": "model-00001-of-00002.safetensors",
|
157 |
+
"transformer.h.2.ln_2.weight": "model-00001-of-00002.safetensors",
|
158 |
+
"transformer.h.2.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
159 |
+
"transformer.h.2.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
160 |
+
"transformer.h.2.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
161 |
+
"transformer.h.2.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
162 |
+
"transformer.h.20.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
163 |
+
"transformer.h.20.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
164 |
+
"transformer.h.20.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
165 |
+
"transformer.h.20.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
166 |
+
"transformer.h.20.ln_1.bias": "model-00001-of-00002.safetensors",
|
167 |
+
"transformer.h.20.ln_1.weight": "model-00001-of-00002.safetensors",
|
168 |
+
"transformer.h.20.ln_2.bias": "model-00001-of-00002.safetensors",
|
169 |
+
"transformer.h.20.ln_2.weight": "model-00001-of-00002.safetensors",
|
170 |
+
"transformer.h.20.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
171 |
+
"transformer.h.20.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
172 |
+
"transformer.h.20.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
173 |
+
"transformer.h.20.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
174 |
+
"transformer.h.21.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
175 |
+
"transformer.h.21.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
176 |
+
"transformer.h.21.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
177 |
+
"transformer.h.21.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
178 |
+
"transformer.h.21.ln_1.bias": "model-00001-of-00002.safetensors",
|
179 |
+
"transformer.h.21.ln_1.weight": "model-00001-of-00002.safetensors",
|
180 |
+
"transformer.h.21.ln_2.bias": "model-00001-of-00002.safetensors",
|
181 |
+
"transformer.h.21.ln_2.weight": "model-00001-of-00002.safetensors",
|
182 |
+
"transformer.h.21.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
183 |
+
"transformer.h.21.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
184 |
+
"transformer.h.21.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
185 |
+
"transformer.h.21.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
186 |
+
"transformer.h.22.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
187 |
+
"transformer.h.22.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
188 |
+
"transformer.h.22.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
189 |
+
"transformer.h.22.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
190 |
+
"transformer.h.22.ln_1.bias": "model-00001-of-00002.safetensors",
|
191 |
+
"transformer.h.22.ln_1.weight": "model-00001-of-00002.safetensors",
|
192 |
+
"transformer.h.22.ln_2.bias": "model-00001-of-00002.safetensors",
|
193 |
+
"transformer.h.22.ln_2.weight": "model-00001-of-00002.safetensors",
|
194 |
+
"transformer.h.22.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
195 |
+
"transformer.h.22.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
196 |
+
"transformer.h.22.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
197 |
+
"transformer.h.22.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
198 |
+
"transformer.h.23.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
199 |
+
"transformer.h.23.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
200 |
+
"transformer.h.23.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
201 |
+
"transformer.h.23.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
202 |
+
"transformer.h.23.ln_1.bias": "model-00001-of-00002.safetensors",
|
203 |
+
"transformer.h.23.ln_1.weight": "model-00001-of-00002.safetensors",
|
204 |
+
"transformer.h.23.ln_2.bias": "model-00001-of-00002.safetensors",
|
205 |
+
"transformer.h.23.ln_2.weight": "model-00001-of-00002.safetensors",
|
206 |
+
"transformer.h.23.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
207 |
+
"transformer.h.23.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
208 |
+
"transformer.h.23.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
209 |
+
"transformer.h.23.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
210 |
+
"transformer.h.24.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
211 |
+
"transformer.h.24.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
212 |
+
"transformer.h.24.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
213 |
+
"transformer.h.24.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
214 |
+
"transformer.h.24.ln_1.bias": "model-00001-of-00002.safetensors",
|
215 |
+
"transformer.h.24.ln_1.weight": "model-00001-of-00002.safetensors",
|
216 |
+
"transformer.h.24.ln_2.bias": "model-00001-of-00002.safetensors",
|
217 |
+
"transformer.h.24.ln_2.weight": "model-00001-of-00002.safetensors",
|
218 |
+
"transformer.h.24.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
219 |
+
"transformer.h.24.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
220 |
+
"transformer.h.24.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
221 |
+
"transformer.h.24.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
222 |
+
"transformer.h.25.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
223 |
+
"transformer.h.25.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
224 |
+
"transformer.h.25.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
225 |
+
"transformer.h.25.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
226 |
+
"transformer.h.25.ln_1.bias": "model-00001-of-00002.safetensors",
|
227 |
+
"transformer.h.25.ln_1.weight": "model-00001-of-00002.safetensors",
|
228 |
+
"transformer.h.25.ln_2.bias": "model-00001-of-00002.safetensors",
|
229 |
+
"transformer.h.25.ln_2.weight": "model-00001-of-00002.safetensors",
|
230 |
+
"transformer.h.25.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
231 |
+
"transformer.h.25.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
232 |
+
"transformer.h.25.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
233 |
+
"transformer.h.25.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
234 |
+
"transformer.h.26.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
235 |
+
"transformer.h.26.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
236 |
+
"transformer.h.26.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
237 |
+
"transformer.h.26.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
238 |
+
"transformer.h.26.ln_1.bias": "model-00001-of-00002.safetensors",
|
239 |
+
"transformer.h.26.ln_1.weight": "model-00001-of-00002.safetensors",
|
240 |
+
"transformer.h.26.ln_2.bias": "model-00001-of-00002.safetensors",
|
241 |
+
"transformer.h.26.ln_2.weight": "model-00001-of-00002.safetensors",
|
242 |
+
"transformer.h.26.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
243 |
+
"transformer.h.26.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
244 |
+
"transformer.h.26.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
245 |
+
"transformer.h.26.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
246 |
+
"transformer.h.27.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
247 |
+
"transformer.h.27.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
248 |
+
"transformer.h.27.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
249 |
+
"transformer.h.27.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
250 |
+
"transformer.h.27.ln_1.bias": "model-00001-of-00002.safetensors",
|
251 |
+
"transformer.h.27.ln_1.weight": "model-00001-of-00002.safetensors",
|
252 |
+
"transformer.h.27.ln_2.bias": "model-00001-of-00002.safetensors",
|
253 |
+
"transformer.h.27.ln_2.weight": "model-00001-of-00002.safetensors",
|
254 |
+
"transformer.h.27.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
255 |
+
"transformer.h.27.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
256 |
+
"transformer.h.27.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
257 |
+
"transformer.h.27.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
258 |
+
"transformer.h.28.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
259 |
+
"transformer.h.28.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
260 |
+
"transformer.h.28.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
261 |
+
"transformer.h.28.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
262 |
+
"transformer.h.28.ln_1.bias": "model-00001-of-00002.safetensors",
|
263 |
+
"transformer.h.28.ln_1.weight": "model-00001-of-00002.safetensors",
|
264 |
+
"transformer.h.28.ln_2.bias": "model-00001-of-00002.safetensors",
|
265 |
+
"transformer.h.28.ln_2.weight": "model-00001-of-00002.safetensors",
|
266 |
+
"transformer.h.28.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
267 |
+
"transformer.h.28.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
268 |
+
"transformer.h.28.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
269 |
+
"transformer.h.28.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
270 |
+
"transformer.h.29.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
271 |
+
"transformer.h.29.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
272 |
+
"transformer.h.29.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
273 |
+
"transformer.h.29.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
274 |
+
"transformer.h.29.ln_1.bias": "model-00001-of-00002.safetensors",
|
275 |
+
"transformer.h.29.ln_1.weight": "model-00001-of-00002.safetensors",
|
276 |
+
"transformer.h.29.ln_2.bias": "model-00001-of-00002.safetensors",
|
277 |
+
"transformer.h.29.ln_2.weight": "model-00001-of-00002.safetensors",
|
278 |
+
"transformer.h.29.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
279 |
+
"transformer.h.29.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
280 |
+
"transformer.h.29.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
281 |
+
"transformer.h.29.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
282 |
+
"transformer.h.3.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
283 |
+
"transformer.h.3.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
284 |
+
"transformer.h.3.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
285 |
+
"transformer.h.3.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
286 |
+
"transformer.h.3.ln_1.bias": "model-00001-of-00002.safetensors",
|
287 |
+
"transformer.h.3.ln_1.weight": "model-00001-of-00002.safetensors",
|
288 |
+
"transformer.h.3.ln_2.bias": "model-00001-of-00002.safetensors",
|
289 |
+
"transformer.h.3.ln_2.weight": "model-00001-of-00002.safetensors",
|
290 |
+
"transformer.h.3.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
291 |
+
"transformer.h.3.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
292 |
+
"transformer.h.3.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
293 |
+
"transformer.h.3.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
294 |
+
"transformer.h.30.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
295 |
+
"transformer.h.30.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
296 |
+
"transformer.h.30.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
297 |
+
"transformer.h.30.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
298 |
+
"transformer.h.30.ln_1.bias": "model-00001-of-00002.safetensors",
|
299 |
+
"transformer.h.30.ln_1.weight": "model-00001-of-00002.safetensors",
|
300 |
+
"transformer.h.30.ln_2.bias": "model-00001-of-00002.safetensors",
|
301 |
+
"transformer.h.30.ln_2.weight": "model-00001-of-00002.safetensors",
|
302 |
+
"transformer.h.30.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
303 |
+
"transformer.h.30.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
304 |
+
"transformer.h.30.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
305 |
+
"transformer.h.30.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
306 |
+
"transformer.h.31.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
307 |
+
"transformer.h.31.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
308 |
+
"transformer.h.31.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
309 |
+
"transformer.h.31.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
310 |
+
"transformer.h.31.ln_1.bias": "model-00001-of-00002.safetensors",
|
311 |
+
"transformer.h.31.ln_1.weight": "model-00001-of-00002.safetensors",
|
312 |
+
"transformer.h.31.ln_2.bias": "model-00001-of-00002.safetensors",
|
313 |
+
"transformer.h.31.ln_2.weight": "model-00001-of-00002.safetensors",
|
314 |
+
"transformer.h.31.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
315 |
+
"transformer.h.31.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
316 |
+
"transformer.h.31.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
317 |
+
"transformer.h.31.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
318 |
+
"transformer.h.32.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
319 |
+
"transformer.h.32.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
320 |
+
"transformer.h.32.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
321 |
+
"transformer.h.32.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
322 |
+
"transformer.h.32.ln_1.bias": "model-00001-of-00002.safetensors",
|
323 |
+
"transformer.h.32.ln_1.weight": "model-00001-of-00002.safetensors",
|
324 |
+
"transformer.h.32.ln_2.bias": "model-00001-of-00002.safetensors",
|
325 |
+
"transformer.h.32.ln_2.weight": "model-00001-of-00002.safetensors",
|
326 |
+
"transformer.h.32.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
327 |
+
"transformer.h.32.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
328 |
+
"transformer.h.32.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
329 |
+
"transformer.h.32.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
330 |
+
"transformer.h.33.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
331 |
+
"transformer.h.33.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
332 |
+
"transformer.h.33.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
333 |
+
"transformer.h.33.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
334 |
+
"transformer.h.33.ln_1.bias": "model-00001-of-00002.safetensors",
|
335 |
+
"transformer.h.33.ln_1.weight": "model-00001-of-00002.safetensors",
|
336 |
+
"transformer.h.33.ln_2.bias": "model-00001-of-00002.safetensors",
|
337 |
+
"transformer.h.33.ln_2.weight": "model-00001-of-00002.safetensors",
|
338 |
+
"transformer.h.33.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
339 |
+
"transformer.h.33.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
340 |
+
"transformer.h.33.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
341 |
+
"transformer.h.33.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
342 |
+
"transformer.h.34.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
343 |
+
"transformer.h.34.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
344 |
+
"transformer.h.34.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
345 |
+
"transformer.h.34.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
346 |
+
"transformer.h.34.ln_1.bias": "model-00001-of-00002.safetensors",
|
347 |
+
"transformer.h.34.ln_1.weight": "model-00001-of-00002.safetensors",
|
348 |
+
"transformer.h.34.ln_2.bias": "model-00001-of-00002.safetensors",
|
349 |
+
"transformer.h.34.ln_2.weight": "model-00001-of-00002.safetensors",
|
350 |
+
"transformer.h.34.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
351 |
+
"transformer.h.34.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
352 |
+
"transformer.h.34.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
353 |
+
"transformer.h.34.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
354 |
+
"transformer.h.35.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
355 |
+
"transformer.h.35.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
356 |
+
"transformer.h.35.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
357 |
+
"transformer.h.35.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
358 |
+
"transformer.h.35.ln_1.bias": "model-00001-of-00002.safetensors",
|
359 |
+
"transformer.h.35.ln_1.weight": "model-00001-of-00002.safetensors",
|
360 |
+
"transformer.h.35.ln_2.bias": "model-00001-of-00002.safetensors",
|
361 |
+
"transformer.h.35.ln_2.weight": "model-00001-of-00002.safetensors",
|
362 |
+
"transformer.h.35.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
363 |
+
"transformer.h.35.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
364 |
+
"transformer.h.35.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
365 |
+
"transformer.h.35.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
366 |
+
"transformer.h.36.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
367 |
+
"transformer.h.36.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
368 |
+
"transformer.h.36.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
369 |
+
"transformer.h.36.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
370 |
+
"transformer.h.36.ln_1.bias": "model-00001-of-00002.safetensors",
|
371 |
+
"transformer.h.36.ln_1.weight": "model-00001-of-00002.safetensors",
|
372 |
+
"transformer.h.36.ln_2.bias": "model-00001-of-00002.safetensors",
|
373 |
+
"transformer.h.36.ln_2.weight": "model-00001-of-00002.safetensors",
|
374 |
+
"transformer.h.36.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
375 |
+
"transformer.h.36.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
376 |
+
"transformer.h.36.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
377 |
+
"transformer.h.36.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
378 |
+
"transformer.h.37.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
379 |
+
"transformer.h.37.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
380 |
+
"transformer.h.37.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
381 |
+
"transformer.h.37.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
382 |
+
"transformer.h.37.ln_1.bias": "model-00001-of-00002.safetensors",
|
383 |
+
"transformer.h.37.ln_1.weight": "model-00001-of-00002.safetensors",
|
384 |
+
"transformer.h.37.ln_2.bias": "model-00001-of-00002.safetensors",
|
385 |
+
"transformer.h.37.ln_2.weight": "model-00001-of-00002.safetensors",
|
386 |
+
"transformer.h.37.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
387 |
+
"transformer.h.37.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
388 |
+
"transformer.h.37.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
389 |
+
"transformer.h.37.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
390 |
+
"transformer.h.38.attn.c_attn.bias": "model-00002-of-00002.safetensors",
|
391 |
+
"transformer.h.38.attn.c_attn.weight": "model-00002-of-00002.safetensors",
|
392 |
+
"transformer.h.38.attn.c_proj.bias": "model-00002-of-00002.safetensors",
|
393 |
+
"transformer.h.38.attn.c_proj.weight": "model-00002-of-00002.safetensors",
|
394 |
+
"transformer.h.38.ln_1.bias": "model-00002-of-00002.safetensors",
|
395 |
+
"transformer.h.38.ln_1.weight": "model-00002-of-00002.safetensors",
|
396 |
+
"transformer.h.38.ln_2.bias": "model-00002-of-00002.safetensors",
|
397 |
+
"transformer.h.38.ln_2.weight": "model-00002-of-00002.safetensors",
|
398 |
+
"transformer.h.38.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
|
399 |
+
"transformer.h.38.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
|
400 |
+
"transformer.h.38.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
401 |
+
"transformer.h.38.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
402 |
+
"transformer.h.39.attn.c_attn.bias": "model-00002-of-00002.safetensors",
|
403 |
+
"transformer.h.39.attn.c_attn.weight": "model-00002-of-00002.safetensors",
|
404 |
+
"transformer.h.39.attn.c_proj.bias": "model-00002-of-00002.safetensors",
|
405 |
+
"transformer.h.39.attn.c_proj.weight": "model-00002-of-00002.safetensors",
|
406 |
+
"transformer.h.39.ln_1.bias": "model-00002-of-00002.safetensors",
|
407 |
+
"transformer.h.39.ln_1.weight": "model-00002-of-00002.safetensors",
|
408 |
+
"transformer.h.39.ln_2.bias": "model-00002-of-00002.safetensors",
|
409 |
+
"transformer.h.39.ln_2.weight": "model-00002-of-00002.safetensors",
|
410 |
+
"transformer.h.39.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
|
411 |
+
"transformer.h.39.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
|
412 |
+
"transformer.h.39.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
413 |
+
"transformer.h.39.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
414 |
+
"transformer.h.4.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
415 |
+
"transformer.h.4.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
416 |
+
"transformer.h.4.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
417 |
+
"transformer.h.4.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
418 |
+
"transformer.h.4.ln_1.bias": "model-00001-of-00002.safetensors",
|
419 |
+
"transformer.h.4.ln_1.weight": "model-00001-of-00002.safetensors",
|
420 |
+
"transformer.h.4.ln_2.bias": "model-00001-of-00002.safetensors",
|
421 |
+
"transformer.h.4.ln_2.weight": "model-00001-of-00002.safetensors",
|
422 |
+
"transformer.h.4.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
423 |
+
"transformer.h.4.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
424 |
+
"transformer.h.4.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
425 |
+
"transformer.h.4.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
426 |
+
"transformer.h.40.attn.c_attn.bias": "model-00002-of-00002.safetensors",
|
427 |
+
"transformer.h.40.attn.c_attn.weight": "model-00002-of-00002.safetensors",
|
428 |
+
"transformer.h.40.attn.c_proj.bias": "model-00002-of-00002.safetensors",
|
429 |
+
"transformer.h.40.attn.c_proj.weight": "model-00002-of-00002.safetensors",
|
430 |
+
"transformer.h.40.ln_1.bias": "model-00002-of-00002.safetensors",
|
431 |
+
"transformer.h.40.ln_1.weight": "model-00002-of-00002.safetensors",
|
432 |
+
"transformer.h.40.ln_2.bias": "model-00002-of-00002.safetensors",
|
433 |
+
"transformer.h.40.ln_2.weight": "model-00002-of-00002.safetensors",
|
434 |
+
"transformer.h.40.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
|
435 |
+
"transformer.h.40.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
|
436 |
+
"transformer.h.40.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
437 |
+
"transformer.h.40.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
438 |
+
"transformer.h.41.attn.c_attn.bias": "model-00002-of-00002.safetensors",
|
439 |
+
"transformer.h.41.attn.c_attn.weight": "model-00002-of-00002.safetensors",
|
440 |
+
"transformer.h.41.attn.c_proj.bias": "model-00002-of-00002.safetensors",
|
441 |
+
"transformer.h.41.attn.c_proj.weight": "model-00002-of-00002.safetensors",
|
442 |
+
"transformer.h.41.ln_1.bias": "model-00002-of-00002.safetensors",
|
443 |
+
"transformer.h.41.ln_1.weight": "model-00002-of-00002.safetensors",
|
444 |
+
"transformer.h.41.ln_2.bias": "model-00002-of-00002.safetensors",
|
445 |
+
"transformer.h.41.ln_2.weight": "model-00002-of-00002.safetensors",
|
446 |
+
"transformer.h.41.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
|
447 |
+
"transformer.h.41.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
|
448 |
+
"transformer.h.41.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
449 |
+
"transformer.h.41.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
450 |
+
"transformer.h.42.attn.c_attn.bias": "model-00002-of-00002.safetensors",
|
451 |
+
"transformer.h.42.attn.c_attn.weight": "model-00002-of-00002.safetensors",
|
452 |
+
"transformer.h.42.attn.c_proj.bias": "model-00002-of-00002.safetensors",
|
453 |
+
"transformer.h.42.attn.c_proj.weight": "model-00002-of-00002.safetensors",
|
454 |
+
"transformer.h.42.ln_1.bias": "model-00002-of-00002.safetensors",
|
455 |
+
"transformer.h.42.ln_1.weight": "model-00002-of-00002.safetensors",
|
456 |
+
"transformer.h.42.ln_2.bias": "model-00002-of-00002.safetensors",
|
457 |
+
"transformer.h.42.ln_2.weight": "model-00002-of-00002.safetensors",
|
458 |
+
"transformer.h.42.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
|
459 |
+
"transformer.h.42.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
|
460 |
+
"transformer.h.42.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
461 |
+
"transformer.h.42.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
462 |
+
"transformer.h.43.attn.c_attn.bias": "model-00002-of-00002.safetensors",
|
463 |
+
"transformer.h.43.attn.c_attn.weight": "model-00002-of-00002.safetensors",
|
464 |
+
"transformer.h.43.attn.c_proj.bias": "model-00002-of-00002.safetensors",
|
465 |
+
"transformer.h.43.attn.c_proj.weight": "model-00002-of-00002.safetensors",
|
466 |
+
"transformer.h.43.ln_1.bias": "model-00002-of-00002.safetensors",
|
467 |
+
"transformer.h.43.ln_1.weight": "model-00002-of-00002.safetensors",
|
468 |
+
"transformer.h.43.ln_2.bias": "model-00002-of-00002.safetensors",
|
469 |
+
"transformer.h.43.ln_2.weight": "model-00002-of-00002.safetensors",
|
470 |
+
"transformer.h.43.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
|
471 |
+
"transformer.h.43.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
|
472 |
+
"transformer.h.43.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
473 |
+
"transformer.h.43.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
474 |
+
"transformer.h.44.attn.c_attn.bias": "model-00002-of-00002.safetensors",
|
475 |
+
"transformer.h.44.attn.c_attn.weight": "model-00002-of-00002.safetensors",
|
476 |
+
"transformer.h.44.attn.c_proj.bias": "model-00002-of-00002.safetensors",
|
477 |
+
"transformer.h.44.attn.c_proj.weight": "model-00002-of-00002.safetensors",
|
478 |
+
"transformer.h.44.ln_1.bias": "model-00002-of-00002.safetensors",
|
479 |
+
"transformer.h.44.ln_1.weight": "model-00002-of-00002.safetensors",
|
480 |
+
"transformer.h.44.ln_2.bias": "model-00002-of-00002.safetensors",
|
481 |
+
"transformer.h.44.ln_2.weight": "model-00002-of-00002.safetensors",
|
482 |
+
"transformer.h.44.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
|
483 |
+
"transformer.h.44.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
|
484 |
+
"transformer.h.44.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
485 |
+
"transformer.h.44.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
486 |
+
"transformer.h.45.attn.c_attn.bias": "model-00002-of-00002.safetensors",
|
487 |
+
"transformer.h.45.attn.c_attn.weight": "model-00002-of-00002.safetensors",
|
488 |
+
"transformer.h.45.attn.c_proj.bias": "model-00002-of-00002.safetensors",
|
489 |
+
"transformer.h.45.attn.c_proj.weight": "model-00002-of-00002.safetensors",
|
490 |
+
"transformer.h.45.ln_1.bias": "model-00002-of-00002.safetensors",
|
491 |
+
"transformer.h.45.ln_1.weight": "model-00002-of-00002.safetensors",
|
492 |
+
"transformer.h.45.ln_2.bias": "model-00002-of-00002.safetensors",
|
493 |
+
"transformer.h.45.ln_2.weight": "model-00002-of-00002.safetensors",
|
494 |
+
"transformer.h.45.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
|
495 |
+
"transformer.h.45.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
|
496 |
+
"transformer.h.45.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
497 |
+
"transformer.h.45.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
498 |
+
"transformer.h.46.attn.c_attn.bias": "model-00002-of-00002.safetensors",
|
499 |
+
"transformer.h.46.attn.c_attn.weight": "model-00002-of-00002.safetensors",
|
500 |
+
"transformer.h.46.attn.c_proj.bias": "model-00002-of-00002.safetensors",
|
501 |
+
"transformer.h.46.attn.c_proj.weight": "model-00002-of-00002.safetensors",
|
502 |
+
"transformer.h.46.ln_1.bias": "model-00002-of-00002.safetensors",
|
503 |
+
"transformer.h.46.ln_1.weight": "model-00002-of-00002.safetensors",
|
504 |
+
"transformer.h.46.ln_2.bias": "model-00002-of-00002.safetensors",
|
505 |
+
"transformer.h.46.ln_2.weight": "model-00002-of-00002.safetensors",
|
506 |
+
"transformer.h.46.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
|
507 |
+
"transformer.h.46.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
|
508 |
+
"transformer.h.46.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
509 |
+
"transformer.h.46.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
510 |
+
"transformer.h.47.attn.c_attn.bias": "model-00002-of-00002.safetensors",
|
511 |
+
"transformer.h.47.attn.c_attn.weight": "model-00002-of-00002.safetensors",
|
512 |
+
"transformer.h.47.attn.c_proj.bias": "model-00002-of-00002.safetensors",
|
513 |
+
"transformer.h.47.attn.c_proj.weight": "model-00002-of-00002.safetensors",
|
514 |
+
"transformer.h.47.ln_1.bias": "model-00002-of-00002.safetensors",
|
515 |
+
"transformer.h.47.ln_1.weight": "model-00002-of-00002.safetensors",
|
516 |
+
"transformer.h.47.ln_2.bias": "model-00002-of-00002.safetensors",
|
517 |
+
"transformer.h.47.ln_2.weight": "model-00002-of-00002.safetensors",
|
518 |
+
"transformer.h.47.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
|
519 |
+
"transformer.h.47.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
|
520 |
+
"transformer.h.47.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
|
521 |
+
"transformer.h.47.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
|
522 |
+
"transformer.h.5.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
523 |
+
"transformer.h.5.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
524 |
+
"transformer.h.5.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
525 |
+
"transformer.h.5.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
526 |
+
"transformer.h.5.ln_1.bias": "model-00001-of-00002.safetensors",
|
527 |
+
"transformer.h.5.ln_1.weight": "model-00001-of-00002.safetensors",
|
528 |
+
"transformer.h.5.ln_2.bias": "model-00001-of-00002.safetensors",
|
529 |
+
"transformer.h.5.ln_2.weight": "model-00001-of-00002.safetensors",
|
530 |
+
"transformer.h.5.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
531 |
+
"transformer.h.5.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
532 |
+
"transformer.h.5.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
533 |
+
"transformer.h.5.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
534 |
+
"transformer.h.6.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
535 |
+
"transformer.h.6.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
536 |
+
"transformer.h.6.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
537 |
+
"transformer.h.6.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
538 |
+
"transformer.h.6.ln_1.bias": "model-00001-of-00002.safetensors",
|
539 |
+
"transformer.h.6.ln_1.weight": "model-00001-of-00002.safetensors",
|
540 |
+
"transformer.h.6.ln_2.bias": "model-00001-of-00002.safetensors",
|
541 |
+
"transformer.h.6.ln_2.weight": "model-00001-of-00002.safetensors",
|
542 |
+
"transformer.h.6.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
543 |
+
"transformer.h.6.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
544 |
+
"transformer.h.6.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
545 |
+
"transformer.h.6.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
546 |
+
"transformer.h.7.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
547 |
+
"transformer.h.7.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
548 |
+
"transformer.h.7.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
549 |
+
"transformer.h.7.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
550 |
+
"transformer.h.7.ln_1.bias": "model-00001-of-00002.safetensors",
|
551 |
+
"transformer.h.7.ln_1.weight": "model-00001-of-00002.safetensors",
|
552 |
+
"transformer.h.7.ln_2.bias": "model-00001-of-00002.safetensors",
|
553 |
+
"transformer.h.7.ln_2.weight": "model-00001-of-00002.safetensors",
|
554 |
+
"transformer.h.7.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
555 |
+
"transformer.h.7.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
556 |
+
"transformer.h.7.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
557 |
+
"transformer.h.7.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
558 |
+
"transformer.h.8.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
559 |
+
"transformer.h.8.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
560 |
+
"transformer.h.8.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
561 |
+
"transformer.h.8.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
562 |
+
"transformer.h.8.ln_1.bias": "model-00001-of-00002.safetensors",
|
563 |
+
"transformer.h.8.ln_1.weight": "model-00001-of-00002.safetensors",
|
564 |
+
"transformer.h.8.ln_2.bias": "model-00001-of-00002.safetensors",
|
565 |
+
"transformer.h.8.ln_2.weight": "model-00001-of-00002.safetensors",
|
566 |
+
"transformer.h.8.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
567 |
+
"transformer.h.8.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
568 |
+
"transformer.h.8.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
569 |
+
"transformer.h.8.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
570 |
+
"transformer.h.9.attn.c_attn.bias": "model-00001-of-00002.safetensors",
|
571 |
+
"transformer.h.9.attn.c_attn.weight": "model-00001-of-00002.safetensors",
|
572 |
+
"transformer.h.9.attn.c_proj.bias": "model-00001-of-00002.safetensors",
|
573 |
+
"transformer.h.9.attn.c_proj.weight": "model-00001-of-00002.safetensors",
|
574 |
+
"transformer.h.9.ln_1.bias": "model-00001-of-00002.safetensors",
|
575 |
+
"transformer.h.9.ln_1.weight": "model-00001-of-00002.safetensors",
|
576 |
+
"transformer.h.9.ln_2.bias": "model-00001-of-00002.safetensors",
|
577 |
+
"transformer.h.9.ln_2.weight": "model-00001-of-00002.safetensors",
|
578 |
+
"transformer.h.9.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
|
579 |
+
"transformer.h.9.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
|
580 |
+
"transformer.h.9.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
|
581 |
+
"transformer.h.9.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
|
582 |
+
"transformer.ln_f.bias": "model-00002-of-00002.safetensors",
|
583 |
+
"transformer.ln_f.weight": "model-00002-of-00002.safetensors",
|
584 |
+
"transformer.wpe.weight": "model-00001-of-00002.safetensors",
|
585 |
+
"transformer.wte.weight": "model-00001-of-00002.safetensors"
|
586 |
+
}
|
587 |
+
}
|
sft_pretrain_and_pushtohub.log
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[2025-04-02 14:52:59,306][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -c /tmp/tmpwrka7koq/test.c -o /tmp/tmpwrka7koq/test.o
|
2 |
+
[2025-04-02 14:52:59,327][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat /tmp/tmpwrka7koq/test.o -laio -o /tmp/tmpwrka7koq/a.out
|
3 |
+
[2025-04-02 14:52:59,802][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -c /tmp/tmpbm8f7_mn/test.c -o /tmp/tmpbm8f7_mn/test.o
|
4 |
+
[2025-04-02 14:52:59,820][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat /tmp/tmpbm8f7_mn/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpbm8f7_mn/a.out
|
5 |
+
[2025-04-02 14:52:59,881][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -c /tmp/tmphrk9_0u6/test.c -o /tmp/tmphrk9_0u6/test.o
|
6 |
+
[2025-04-02 14:52:59,896][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat /tmp/tmphrk9_0u6/test.o -laio -o /tmp/tmphrk9_0u6/a.out
|
7 |
+
[2025-04-02 14:53:00,791][__main__][INFO] - *** Starting SFT training at 2025-04-02 14:53:00 ***
|
special_tokens_map.json
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": "<|endoftext|>",
|
3 |
+
"eos_token": "<|endoftext|>",
|
4 |
+
"pad_token": "<|endoftext|>",
|
5 |
+
"unk_token": "<|endoftext|>"
|
6 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_prefix_space": false,
|
3 |
+
"added_tokens_decoder": {
|
4 |
+
"50256": {
|
5 |
+
"content": "<|endoftext|>",
|
6 |
+
"lstrip": false,
|
7 |
+
"normalized": true,
|
8 |
+
"rstrip": false,
|
9 |
+
"single_word": false,
|
10 |
+
"special": true
|
11 |
+
}
|
12 |
+
},
|
13 |
+
"bos_token": "<|endoftext|>",
|
14 |
+
"clean_up_tokenization_spaces": false,
|
15 |
+
"eos_token": "<|endoftext|>",
|
16 |
+
"extra_special_tokens": {},
|
17 |
+
"model_max_length": 1024,
|
18 |
+
"pad_token": "<|endoftext|>",
|
19 |
+
"padding_side": "left",
|
20 |
+
"tokenizer_class": "GPT2Tokenizer",
|
21 |
+
"unk_token": "<|endoftext|>"
|
22 |
+
}
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f14f3c6d299bfb369a2106aab54c59a032e03c366e1d1fecdcf02f954b66a25b
|
3 |
+
size 5624
|
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
wandb/debug-internal.log
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2025-04-02T14:52:46.998966444Z","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"/mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39/wandb/run-20250402_145246-e1n3xkh6/logs/debug-core.log"}
|
2 |
+
{"time":"2025-04-02T14:52:47.119833744Z","level":"INFO","msg":"created new stream","id":"e1n3xkh6"}
|
3 |
+
{"time":"2025-04-02T14:52:47.119882315Z","level":"INFO","msg":"stream: started","id":"e1n3xkh6"}
|
4 |
+
{"time":"2025-04-02T14:52:47.119921969Z","level":"INFO","msg":"handler: started","stream_id":"e1n3xkh6"}
|
5 |
+
{"time":"2025-04-02T14:52:47.119936867Z","level":"INFO","msg":"writer: Do: started","stream_id":"e1n3xkh6"}
|
6 |
+
{"time":"2025-04-02T14:52:47.120603401Z","level":"INFO","msg":"sender: started","stream_id":"e1n3xkh6"}
|
7 |
+
{"time":"2025-04-02T14:52:47.425038021Z","level":"INFO","msg":"Starting system monitor"}
|
wandb/debug.log
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
|
2 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_setup.py:_flush():67] Configure stats pid to 738
|
3 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_setup.py:_flush():67] Loading settings from /dlabscratch1/amani/.config/wandb/settings
|
4 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_setup.py:_flush():67] Loading settings from /mnt/dlabscratch1/amani/LLM-RL/wandb/settings
|
5 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_setup.py:_flush():67] Loading settings from environment variables
|
6 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:setup_run_log_directory():647] Logging user logs to /mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39/wandb/run-20250402_145246-e1n3xkh6/logs/debug.log
|
7 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to /mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39/wandb/run-20250402_145246-e1n3xkh6/logs/debug-internal.log
|
8 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:init():761] calling init triggers
|
9 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
|
10 |
+
config: {'_wandb': {}}
|
11 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:init():784] starting backend
|
12 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:init():788] sending inform_init request
|
13 |
+
2025-04-02 14:52:46,987 INFO MainThread:738 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
14 |
+
2025-04-02 14:52:46,987 INFO MainThread:738 [wandb_init.py:init():798] backend started and connected
|
15 |
+
2025-04-02 14:52:46,989 INFO MainThread:738 [wandb_init.py:init():891] updated telemetry
|
16 |
+
2025-04-02 14:52:47,015 INFO MainThread:738 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
|
17 |
+
2025-04-02 14:52:47,419 INFO MainThread:738 [wandb_init.py:init():990] starting run threads in backend
|
18 |
+
2025-04-02 14:52:47,731 INFO MainThread:738 [wandb_run.py:_console_start():2375] atexit reg
|
19 |
+
2025-04-02 14:52:47,731 INFO MainThread:738 [wandb_run.py:_redirect():2227] redirect: wrap_raw
|
20 |
+
2025-04-02 14:52:47,731 INFO MainThread:738 [wandb_run.py:_redirect():2292] Wrapping output streams.
|
21 |
+
2025-04-02 14:52:47,731 INFO MainThread:738 [wandb_run.py:_redirect():2315] Redirects installed.
|
22 |
+
2025-04-02 14:52:47,737 INFO MainThread:738 [wandb_init.py:init():1032] run started, returning control to user process
|
23 |
+
2025-04-02 14:53:00,970 INFO MainThread:738 [wandb_run.py:_config_callback():1261] config_cb None None {'vocab_size': 50257, 'n_positions': 1024, 'n_embd': 1600, 'n_layer': 48, 'n_head': 25, 'n_inner': None, 'activation_function': 'gelu_new', 'resid_pdrop': 0.1, 'embd_pdrop': 0.1, 'attn_pdrop': 0.1, 'layer_norm_epsilon': 1e-05, 'initializer_range': 0.02, 'summary_type': 'cls_index', 'summary_use_proj': True, 'summary_activation': None, 'summary_first_dropout': 0.1, 'summary_proj_to_labels': True, 'scale_attn_weights': True, 'use_cache': True, 'scale_attn_by_inverse_layer_idx': False, 'reorder_and_upcast_attn': False, 'bos_token_id': 50256, 'eos_token_id': 50256, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float32', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['GPT2LMHeadModel'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'pad_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': {'text-generation': 
{'do_sample': True, 'max_length': 50}}, 'problem_type': None, '_name_or_path': 'openai-community/gpt2-xl', '_attn_implementation_autoset': True, 'transformers_version': '4.49.0', 'model_type': 'gpt2', 'n_ctx': 1024, 'output_past': True, 'output_dir': '/mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39', 'overwrite_output_dir': False, 'do_train': 'true,', 'do_eval': 'true,', 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 16, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 10, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 10, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 
'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '/mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 
'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': True, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False, 'model_init_kwargs': None, 'use_liger': False, 'dataset_text_field': 'text', 'dataset_kwargs': None, 'dataset_num_proc': None, 'max_seq_length': 1024, 'packing': False, 'eval_packing': None, 'dataset_batch_size': None, 'num_of_sequences': None, 'chars_per_token': '<CHARS_PER_TOKEN>'}
|
24 |
+
2025-04-02 14:53:00,973 INFO MainThread:738 [wandb_config.py:__setitem__():154] config set model/num_parameters = 1557611200 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x734030d35250>>
|
25 |
+
2025-04-02 14:53:00,973 INFO MainThread:738 [wandb_run.py:_config_callback():1261] config_cb model/num_parameters 1557611200 None
|
wandb/run-20250402_145246-e1n3xkh6/files/output.log
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/dlabscratch1/amani/.conda/envs/LLM-RL/lib/python3.11/site-packages/transformers/training_args.py:1594: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
|
2 |
+
warnings.warn(
|
3 |
+
/mnt/dlabscratch1/amani/LLM-RL/src/sft_pretrain_and_pushtohub.py:138: FutureWarning: `tokenizer` is deprecated and removed starting from version 0.16.0 for `SFTTrainer.__init__`. Use `processing_class` instead.
|
4 |
+
trainer = SFTTrainer(
|
5 |
+
[2025-04-02 14:52:58,883] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
6 |
+
Warning: The cache directory for DeepSpeed Triton autotune, /dlabscratch1/amani/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
|
7 |
+
[2025-04-02 14:52:59,306][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -c /tmp/tmpwrka7koq/test.c -o /tmp/tmpwrka7koq/test.o
|
8 |
+
[2025-04-02 14:52:59,327][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat /tmp/tmpwrka7koq/test.o -laio -o /tmp/tmpwrka7koq/a.out
|
9 |
+
[2025-04-02 14:52:59,802][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -c /tmp/tmpbm8f7_mn/test.c -o /tmp/tmpbm8f7_mn/test.o
|
10 |
+
[2025-04-02 14:52:59,820][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat /tmp/tmpbm8f7_mn/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpbm8f7_mn/a.out
|
11 |
+
[2025-04-02 14:52:59,881][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -O2 -isystem /dlabscratch1/amani/.conda/envs/LLM-RL/include -fPIC -c /tmp/tmphrk9_0u6/test.c -o /tmp/tmphrk9_0u6/test.o
|
12 |
+
[2025-04-02 14:52:59,896][root][INFO] - gcc -pthread -B /dlabscratch1/amani/.conda/envs/LLM-RL/compiler_compat /tmp/tmphrk9_0u6/test.o -laio -o /tmp/tmphrk9_0u6/a.out
|
13 |
+
2025-04-02 14:53:00,791 - __main__ - INFO - *** Starting SFT training at 2025-04-02 14:53:00 ***
|
14 |
+
[2025-04-02 14:53:00,791][__main__][INFO] - *** Starting SFT training at 2025-04-02 14:53:00 ***
|
15 |
+
wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.
|
16 |
+
0%| | 0/9350 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
|
17 |
+
|
18 |
+
{'loss': 2.8736, 'grad_norm': 4.561432361602783, 'learning_rate': 1.997860962566845e-05, 'mean_token_accuracy': 0.4675643026828766, 'epoch': 0.01}
|
19 |
+
{'loss': 1.4077, 'grad_norm': 3.1172070503234863, 'learning_rate': 1.99572192513369e-05, 'mean_token_accuracy': 0.6618991315364837, 'epoch': 0.02}
|
20 |
+
{'loss': 1.2961, 'grad_norm': 2.483769655227661, 'learning_rate': 1.993582887700535e-05, 'mean_token_accuracy': 0.6864472806453705, 'epoch': 0.03}
|
21 |
+
{'loss': 1.2227, 'grad_norm': 2.7745237350463867, 'learning_rate': 1.9914438502673797e-05, 'mean_token_accuracy': 0.6966653347015381, 'epoch': 0.04}
|
22 |
+
{'loss': 1.0693, 'grad_norm': 2.1610703468322754, 'learning_rate': 1.9893048128342248e-05, 'mean_token_accuracy': 0.7266861200332642, 'epoch': 0.05}
|
23 |
+
{'loss': 1.1503, 'grad_norm': 2.143181324005127, 'learning_rate': 1.9871657754010695e-05, 'mean_token_accuracy': 0.7105863869190217, 'epoch': 0.06}
|
24 |
+
{'loss': 1.065, 'grad_norm': 2.452174425125122, 'learning_rate': 1.9850267379679146e-05, 'mean_token_accuracy': 0.7262609004974365, 'epoch': 0.07}
|
25 |
+
{'loss': 1.011, 'grad_norm': 1.8720413446426392, 'learning_rate': 1.9828877005347596e-05, 'mean_token_accuracy': 0.7376364350318909, 'epoch': 0.09}
|
26 |
+
{'loss': 0.9914, 'grad_norm': 1.9087146520614624, 'learning_rate': 1.9807486631016044e-05, 'mean_token_accuracy': 0.7410379707813263, 'epoch': 0.1}
|
27 |
+
{'loss': 0.9976, 'grad_norm': 2.295886993408203, 'learning_rate': 1.9786096256684494e-05, 'mean_token_accuracy': 0.7375340044498444, 'epoch': 0.11}
|
28 |
+
{'loss': 1.0109, 'grad_norm': 2.035055160522461, 'learning_rate': 1.9764705882352945e-05, 'mean_token_accuracy': 0.7421261131763458, 'epoch': 0.12}
|
29 |
+
{'loss': 1.0253, 'grad_norm': 2.1886730194091797, 'learning_rate': 1.9743315508021392e-05, 'mean_token_accuracy': 0.7308926224708557, 'epoch': 0.13}
|
30 |
+
{'loss': 0.9605, 'grad_norm': 2.1175997257232666, 'learning_rate': 1.972192513368984e-05, 'mean_token_accuracy': 0.7511978864669799, 'epoch': 0.14}
|
31 |
+
{'loss': 0.9502, 'grad_norm': 1.9803975820541382, 'learning_rate': 1.970053475935829e-05, 'mean_token_accuracy': 0.7526978373527526, 'epoch': 0.15}
|
32 |
+
{'loss': 0.9787, 'grad_norm': 2.274048089981079, 'learning_rate': 1.967914438502674e-05, 'mean_token_accuracy': 0.7469079375267029, 'epoch': 0.16}
|
33 |
+
{'loss': 0.9892, 'grad_norm': 2.0520341396331787, 'learning_rate': 1.9657754010695188e-05, 'mean_token_accuracy': 0.7408109784126282, 'epoch': 0.17}
|
34 |
+
{'loss': 0.9155, 'grad_norm': 2.1267733573913574, 'learning_rate': 1.963636363636364e-05, 'mean_token_accuracy': 0.764633697271347, 'epoch': 0.18}
|
35 |
+
{'loss': 0.9464, 'grad_norm': 2.1772565841674805, 'learning_rate': 1.9614973262032086e-05, 'mean_token_accuracy': 0.75131676197052, 'epoch': 0.19}
|
36 |
+
{'loss': 1.0017, 'grad_norm': 2.230823040008545, 'learning_rate': 1.9593582887700536e-05, 'mean_token_accuracy': 0.7450973689556122, 'epoch': 0.2}
|
37 |
+
{'loss': 0.9788, 'grad_norm': 1.783121943473816, 'learning_rate': 1.9572192513368987e-05, 'mean_token_accuracy': 0.747833377122879, 'epoch': 0.21}
|
38 |
+
{'loss': 0.9839, 'grad_norm': 2.2042770385742188, 'learning_rate': 1.9550802139037434e-05, 'mean_token_accuracy': 0.7454769611358643, 'epoch': 0.22}
|
39 |
+
{'loss': 0.9629, 'grad_norm': 1.90845787525177, 'learning_rate': 1.9529411764705885e-05, 'mean_token_accuracy': 0.7495079040527344, 'epoch': 0.24}
|
40 |
+
{'loss': 0.9103, 'grad_norm': 1.7904044389724731, 'learning_rate': 1.9508021390374332e-05, 'mean_token_accuracy': 0.7587033331394195, 'epoch': 0.25}
|
41 |
+
{'loss': 0.9865, 'grad_norm': 2.190483331680298, 'learning_rate': 1.9486631016042783e-05, 'mean_token_accuracy': 0.7439156830310821, 'epoch': 0.26}
|
42 |
+
{'loss': 0.8963, 'grad_norm': 2.020122528076172, 'learning_rate': 1.9465240641711233e-05, 'mean_token_accuracy': 0.7711025416851044, 'epoch': 0.27}
|
43 |
+
{'loss': 0.9623, 'grad_norm': 1.914089560508728, 'learning_rate': 1.944385026737968e-05, 'mean_token_accuracy': 0.748535567522049, 'epoch': 0.28}
|
44 |
+
{'loss': 0.8793, 'grad_norm': 3.000681161880493, 'learning_rate': 1.9422459893048128e-05, 'mean_token_accuracy': 0.7698011755943298, 'epoch': 0.29}
|
45 |
+
{'loss': 0.9989, 'grad_norm': 2.015597105026245, 'learning_rate': 1.9401069518716578e-05, 'mean_token_accuracy': 0.7419908523559571, 'epoch': 0.3}
|
46 |
+
{'loss': 0.9115, 'grad_norm': 1.6983799934387207, 'learning_rate': 1.937967914438503e-05, 'mean_token_accuracy': 0.7678780138492585, 'epoch': 0.31}
|
47 |
+
{'loss': 0.9361, 'grad_norm': 3.0123836994171143, 'learning_rate': 1.9358288770053476e-05, 'mean_token_accuracy': 0.7575121581554413, 'epoch': 0.32}
|
48 |
+
{'loss': 0.9126, 'grad_norm': 2.2199974060058594, 'learning_rate': 1.9336898395721927e-05, 'mean_token_accuracy': 0.763150978088379, 'epoch': 0.33}
|
49 |
+
{'loss': 0.9165, 'grad_norm': 1.953675389289856, 'learning_rate': 1.9315508021390377e-05, 'mean_token_accuracy': 0.7600376307964325, 'epoch': 0.34}
|
50 |
+
{'loss': 0.9716, 'grad_norm': 2.2523326873779297, 'learning_rate': 1.9294117647058825e-05, 'mean_token_accuracy': 0.7498639464378357, 'epoch': 0.35}
|
51 |
+
{'loss': 0.9178, 'grad_norm': 1.7307066917419434, 'learning_rate': 1.9272727272727275e-05, 'mean_token_accuracy': 0.7602478981018066, 'epoch': 0.36}
|
52 |
+
{'loss': 0.9156, 'grad_norm': 1.8145519495010376, 'learning_rate': 1.9251336898395722e-05, 'mean_token_accuracy': 0.7568887352943421, 'epoch': 0.37}
|
53 |
+
{'loss': 0.8619, 'grad_norm': 1.8205516338348389, 'learning_rate': 1.9229946524064173e-05, 'mean_token_accuracy': 0.7767329752445221, 'epoch': 0.39}
|
54 |
+
{'loss': 0.8601, 'grad_norm': 1.7622675895690918, 'learning_rate': 1.9208556149732624e-05, 'mean_token_accuracy': 0.7761210620403289, 'epoch': 0.4}
|
55 |
+
{'loss': 0.8928, 'grad_norm': 2.1755974292755127, 'learning_rate': 1.918716577540107e-05, 'mean_token_accuracy': 0.7695048809051513, 'epoch': 0.41}
|
56 |
+
{'loss': 0.8501, 'grad_norm': 1.590783953666687, 'learning_rate': 1.9165775401069518e-05, 'mean_token_accuracy': 0.7792985320091248, 'epoch': 0.42}
|
57 |
+
{'loss': 0.926, 'grad_norm': 1.8423107862472534, 'learning_rate': 1.9144385026737972e-05, 'mean_token_accuracy': 0.7609580457210541, 'epoch': 0.43}
|
58 |
+
{'loss': 0.9162, 'grad_norm': 1.4484622478485107, 'learning_rate': 1.912299465240642e-05, 'mean_token_accuracy': 0.7642469525337219, 'epoch': 0.44}
|
59 |
+
{'loss': 0.9032, 'grad_norm': 1.7720240354537964, 'learning_rate': 1.9101604278074867e-05, 'mean_token_accuracy': 0.7663418650627136, 'epoch': 0.45}
|
60 |
+
{'loss': 0.8943, 'grad_norm': 1.9300682544708252, 'learning_rate': 1.9080213903743317e-05, 'mean_token_accuracy': 0.7683996140956879, 'epoch': 0.46}
|
61 |
+
{'loss': 0.8782, 'grad_norm': 2.139838218688965, 'learning_rate': 1.9058823529411764e-05, 'mean_token_accuracy': 0.7707085013389587, 'epoch': 0.47}
|
62 |
+
{'loss': 0.865, 'grad_norm': 1.7625609636306763, 'learning_rate': 1.9037433155080215e-05, 'mean_token_accuracy': 0.7711307823657989, 'epoch': 0.48}
|
63 |
+
{'loss': 0.9055, 'grad_norm': 1.9418359994888306, 'learning_rate': 1.9016042780748666e-05, 'mean_token_accuracy': 0.7585177183151245, 'epoch': 0.49}
|
64 |
+
{'loss': 0.8507, 'grad_norm': 2.0598695278167725, 'learning_rate': 1.8994652406417113e-05, 'mean_token_accuracy': 0.7708562433719635, 'epoch': 0.5}
|
65 |
+
{'loss': 0.8692, 'grad_norm': 1.5901210308074951, 'learning_rate': 1.8973262032085563e-05, 'mean_token_accuracy': 0.7684442400932312, 'epoch': 0.51}
|
66 |
+
{'loss': 0.8778, 'grad_norm': 1.8863409757614136, 'learning_rate': 1.8951871657754014e-05, 'mean_token_accuracy': 0.7689958155155182, 'epoch': 0.52}
|
67 |
+
{'loss': 0.8592, 'grad_norm': 1.6788251399993896, 'learning_rate': 1.893048128342246e-05, 'mean_token_accuracy': 0.7720810234546661, 'epoch': 0.53}
|
68 |
+
{'loss': 0.8527, 'grad_norm': 1.7482951879501343, 'learning_rate': 1.8909090909090912e-05, 'mean_token_accuracy': 0.7757346272468567, 'epoch': 0.55}
|
69 |
+
{'loss': 0.8633, 'grad_norm': 1.570914626121521, 'learning_rate': 1.888770053475936e-05, 'mean_token_accuracy': 0.7743308961391449, 'epoch': 0.56}
|
70 |
+
{'loss': 0.8701, 'grad_norm': 1.7534855604171753, 'learning_rate': 1.886631016042781e-05, 'mean_token_accuracy': 0.7677632629871368, 'epoch': 0.57}
|
71 |
+
{'loss': 0.8179, 'grad_norm': 2.0430619716644287, 'learning_rate': 1.8844919786096257e-05, 'mean_token_accuracy': 0.7785910904407501, 'epoch': 0.58}
|
72 |
+
{'loss': 0.8518, 'grad_norm': 1.8052802085876465, 'learning_rate': 1.8823529411764708e-05, 'mean_token_accuracy': 0.7752408146858215, 'epoch': 0.59}
|
73 |
+
{'loss': 0.8174, 'grad_norm': 1.7275725603103638, 'learning_rate': 1.8802139037433155e-05, 'mean_token_accuracy': 0.7837619543075561, 'epoch': 0.6}
|
74 |
+
{'loss': 0.8424, 'grad_norm': 2.0164926052093506, 'learning_rate': 1.8780748663101605e-05, 'mean_token_accuracy': 0.7713942766189575, 'epoch': 0.61}
|
75 |
+
{'loss': 0.7643, 'grad_norm': 1.6844583749771118, 'learning_rate': 1.8759358288770056e-05, 'mean_token_accuracy': 0.7957529544830322, 'epoch': 0.62}
|
76 |
+
{'loss': 0.869, 'grad_norm': 1.9102866649627686, 'learning_rate': 1.8737967914438503e-05, 'mean_token_accuracy': 0.7759682476520539, 'epoch': 0.63}
|
77 |
+
{'loss': 0.7768, 'grad_norm': 1.379757285118103, 'learning_rate': 1.8716577540106954e-05, 'mean_token_accuracy': 0.7940125286579132, 'epoch': 0.64}
|
78 |
+
{'loss': 0.8424, 'grad_norm': 1.7400151491165161, 'learning_rate': 1.8695187165775405e-05, 'mean_token_accuracy': 0.7720341801643371, 'epoch': 0.65}
|
79 |
+
{'loss': 0.8636, 'grad_norm': 2.172954559326172, 'learning_rate': 1.8673796791443852e-05, 'mean_token_accuracy': 0.775877845287323, 'epoch': 0.66}
|
80 |
+
{'loss': 0.8008, 'grad_norm': 1.9168498516082764, 'learning_rate': 1.8652406417112302e-05, 'mean_token_accuracy': 0.7895217001438141, 'epoch': 0.67}
|
81 |
+
{'loss': 0.7755, 'grad_norm': 1.5433951616287231, 'learning_rate': 1.863101604278075e-05, 'mean_token_accuracy': 0.7920072257518769, 'epoch': 0.68}
|
82 |
+
{'loss': 0.8503, 'grad_norm': 2.0785927772521973, 'learning_rate': 1.86096256684492e-05, 'mean_token_accuracy': 0.7770399391651154, 'epoch': 0.7}
|
83 |
+
{'loss': 0.7462, 'grad_norm': 1.9140806198120117, 'learning_rate': 1.8588235294117647e-05, 'mean_token_accuracy': 0.8053540170192719, 'epoch': 0.71}
|
84 |
+
{'loss': 0.7462, 'grad_norm': 1.7646123170852661, 'learning_rate': 1.8566844919786098e-05, 'mean_token_accuracy': 0.7958488464355469, 'epoch': 0.72}
|
85 |
+
{'loss': 0.8468, 'grad_norm': 1.6575416326522827, 'learning_rate': 1.8545454545454545e-05, 'mean_token_accuracy': 0.7770686745643616, 'epoch': 0.73}
|
86 |
+
{'loss': 0.8494, 'grad_norm': 1.7693356275558472, 'learning_rate': 1.8524064171122996e-05, 'mean_token_accuracy': 0.7740375459194183, 'epoch': 0.74}
|
87 |
+
{'loss': 0.7965, 'grad_norm': 1.6074458360671997, 'learning_rate': 1.8502673796791447e-05, 'mean_token_accuracy': 0.7881363987922668, 'epoch': 0.75}
|
88 |
+
{'loss': 0.8626, 'grad_norm': 1.7979710102081299, 'learning_rate': 1.8481283422459894e-05, 'mean_token_accuracy': 0.7758695363998414, 'epoch': 0.76}
|
89 |
+
{'loss': 0.7883, 'grad_norm': 1.6999515295028687, 'learning_rate': 1.8459893048128344e-05, 'mean_token_accuracy': 0.7871046781539917, 'epoch': 0.77}
|
90 |
+
{'loss': 0.8218, 'grad_norm': 1.8012199401855469, 'learning_rate': 1.843850267379679e-05, 'mean_token_accuracy': 0.7894359171390534, 'epoch': 0.78}
|
91 |
+
{'loss': 0.8249, 'grad_norm': 1.8291058540344238, 'learning_rate': 1.8417112299465242e-05, 'mean_token_accuracy': 0.7786098062992096, 'epoch': 0.79}
|
92 |
+
{'loss': 0.849, 'grad_norm': 1.459100604057312, 'learning_rate': 1.8395721925133693e-05, 'mean_token_accuracy': 0.7739375293254852, 'epoch': 0.8}
|
93 |
+
{'loss': 0.8105, 'grad_norm': 1.6709809303283691, 'learning_rate': 1.837433155080214e-05, 'mean_token_accuracy': 0.7830919861793518, 'epoch': 0.81}
|
94 |
+
{'loss': 0.8016, 'grad_norm': 1.8294044733047485, 'learning_rate': 1.8352941176470587e-05, 'mean_token_accuracy': 0.7827704012393951, 'epoch': 0.82}
|
95 |
+
{'loss': 0.7922, 'grad_norm': 1.9289802312850952, 'learning_rate': 1.833155080213904e-05, 'mean_token_accuracy': 0.7901956796646118, 'epoch': 0.83}
|
96 |
+
{'loss': 0.7928, 'grad_norm': 1.6650038957595825, 'learning_rate': 1.831016042780749e-05, 'mean_token_accuracy': 0.7919930636882782, 'epoch': 0.84}
|
97 |
+
{'loss': 0.7958, 'grad_norm': 1.7201720476150513, 'learning_rate': 1.8288770053475936e-05, 'mean_token_accuracy': 0.7823857426643371, 'epoch': 0.86}
|
98 |
+
{'loss': 0.8059, 'grad_norm': 1.5870884656906128, 'learning_rate': 1.8267379679144386e-05, 'mean_token_accuracy': 0.7836263060569764, 'epoch': 0.87}
|
99 |
+
{'loss': 0.75, 'grad_norm': 1.7115275859832764, 'learning_rate': 1.8245989304812837e-05, 'mean_token_accuracy': 0.8022487759590149, 'epoch': 0.88}
|
100 |
+
{'loss': 0.8185, 'grad_norm': 2.0211620330810547, 'learning_rate': 1.8224598930481284e-05, 'mean_token_accuracy': 0.7780875384807586, 'epoch': 0.89}
|
101 |
+
{'loss': 0.7877, 'grad_norm': 1.6509720087051392, 'learning_rate': 1.8203208556149735e-05, 'mean_token_accuracy': 0.7959480166435242, 'epoch': 0.9}
|
102 |
+
{'loss': 0.8499, 'grad_norm': 1.8145709037780762, 'learning_rate': 1.8181818181818182e-05, 'mean_token_accuracy': 0.7782252907752991, 'epoch': 0.91}
|
103 |
+
{'loss': 0.7717, 'grad_norm': 1.8439884185791016, 'learning_rate': 1.8160427807486633e-05, 'mean_token_accuracy': 0.7995142638683319, 'epoch': 0.92}
|
104 |
+
{'loss': 0.8012, 'grad_norm': 1.5468418598175049, 'learning_rate': 1.8139037433155083e-05, 'mean_token_accuracy': 0.7799870669841766, 'epoch': 0.93}
|
105 |
+
{'loss': 0.9315, 'grad_norm': 1.8522837162017822, 'learning_rate': 1.811764705882353e-05, 'mean_token_accuracy': 0.7644379436969757, 'epoch': 0.94}
|
106 |
+
{'loss': 0.8311, 'grad_norm': 1.6274827718734741, 'learning_rate': 1.809625668449198e-05, 'mean_token_accuracy': 0.7880046725273132, 'epoch': 0.95}
|
107 |
+
{'loss': 0.8642, 'grad_norm': 1.9474292993545532, 'learning_rate': 1.807486631016043e-05, 'mean_token_accuracy': 0.7756262719631195, 'epoch': 0.96}
|
108 |
+
{'loss': 0.7447, 'grad_norm': 1.9583537578582764, 'learning_rate': 1.805347593582888e-05, 'mean_token_accuracy': 0.8041338086128235, 'epoch': 0.97}
|
109 |
+
{'loss': 0.7824, 'grad_norm': 1.5204691886901855, 'learning_rate': 1.8032085561497326e-05, 'mean_token_accuracy': 0.7936776518821717, 'epoch': 0.98}
|
110 |
+
{'loss': 0.8073, 'grad_norm': 1.6397240161895752, 'learning_rate': 1.8010695187165777e-05, 'mean_token_accuracy': 0.7917199492454529, 'epoch': 0.99}
|
111 |
+
training_args.bin: 100%|██████████| 5.62k/5.62k [00:00<00:00, 38.0kB/s] ?B/s]
|
112 |
+
{'eval_loss': 0.7610637545585632, 'eval_runtime': 97.5543, 'eval_samples_per_second': 13.521, 'eval_steps_per_second': 0.851, 'eval_mean_token_accuracy': 0.7973010553555056, 'epoch': 1.0}
|
113 |
+
run-e1n3xkh6.wandb: 100%|██████████| 360k/360k [00:00<00:00, 604kB/s]<07:44, 10.7MB/s]
|
114 |
+
model-00002-of-00002.safetensors: 100%|██████████| 1.27G/1.27G [00:36<00:00, 35.2MB/s]
|
115 |
+
model-00002-of-00002.safetensors: 100%|██████████| 1.27G/1.27G [00:44<00:00, 28.7MB/s]
|
116 |
+
model-00001-of-00002.safetensors: 100%|██████████| 4.96G/4.96G [02:15<00:00, 36.5MB/s]
|
117 |
+
model-00001-of-00002.safetensors: 100%|██████████| 4.96G/4.96G [02:16<00:00, 36.3MB/s]
|
118 |
+
Upload 7 LFS files: 100%|██████████| 7/7 [02:17<00:00, 19.59s/it]0:35<01:32, 41.1MB/s]
|
119 |
+
model-00001-of-00002.safetensors: 30%|███ | 1.49G/4.96G [00:44<01:24, 40.9MB/s]
|
120 |
+
model-00001-of-00002.safetensors: 99%|█████████▉| 4.93G/4.96G [02:15<00:00, 47.1MB/s]
|
121 |
+
model-00001-of-00002.safetensors: 100%|█████████▉| 4.96G/4.96G [02:16<00:00, 49.1MB/s]
|
122 |
+
Upload 7 LFS files: 14%|█▍ | 1/7 [02:16<13:37, 136.19s/it]
|
123 |
+
Upload 7 LFS files: 57%|█████▋ | 4/7 [02:17<01:18, 26.02s/it]
|
wandb/run-20250402_145246-e1n3xkh6/files/requirements.txt
ADDED
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wheel==0.45.1
|
2 |
+
pip==25.0.1
|
3 |
+
tensorboard-data-server==0.7.2
|
4 |
+
six==1.17.0
|
5 |
+
setuptools==70.3.0
|
6 |
+
packaging==24.2
|
7 |
+
MarkupSafe==3.0.2
|
8 |
+
Markdown==3.7
|
9 |
+
grpcio==1.71.0
|
10 |
+
absl-py==2.1.0
|
11 |
+
Werkzeug==3.1.3
|
12 |
+
tensorboard==2.19.0
|
13 |
+
pytz==2025.1
|
14 |
+
py-cpuinfo==9.0.0
|
15 |
+
nvidia-cusparselt-cu12==0.6.2
|
16 |
+
mpmath==1.3.0
|
17 |
+
hjson==3.1.0
|
18 |
+
xxhash==3.5.0
|
19 |
+
urllib3==2.3.0
|
20 |
+
tzdata==2025.1
|
21 |
+
typing_extensions==4.12.2
|
22 |
+
tqdm==4.67.1
|
23 |
+
sympy==1.13.1
|
24 |
+
safetensors==0.5.3
|
25 |
+
regex==2024.11.6
|
26 |
+
PyYAML==6.0.2
|
27 |
+
python-dateutil==2.9.0.post0
|
28 |
+
Pygments==2.19.1
|
29 |
+
pyarrow==19.0.1
|
30 |
+
psutil==7.0.0
|
31 |
+
propcache==0.3.0
|
32 |
+
nvidia-nvtx-cu12==12.4.127
|
33 |
+
nvidia-nvjitlink-cu12==12.4.127
|
34 |
+
nvidia-nccl-cu12==2.21.5
|
35 |
+
nvidia-curand-cu12==10.3.5.147
|
36 |
+
nvidia-cufft-cu12==11.2.1.3
|
37 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
38 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
39 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
40 |
+
nvidia-cublas-cu12==12.4.5.8
|
41 |
+
ninja==1.11.1.3
|
42 |
+
networkx==3.4.2
|
43 |
+
multidict==6.1.0
|
44 |
+
msgpack==1.1.0
|
45 |
+
mdurl==0.1.2
|
46 |
+
Jinja2==3.1.6
|
47 |
+
idna==3.10
|
48 |
+
hf_transfer==0.1.9
|
49 |
+
fsspec==2024.9.0
|
50 |
+
frozenlist==1.5.0
|
51 |
+
filelock==3.18.0
|
52 |
+
dill==0.3.8
|
53 |
+
charset-normalizer==3.4.1
|
54 |
+
certifi==2025.1.31
|
55 |
+
attrs==25.3.0
|
56 |
+
annotated-types==0.7.0
|
57 |
+
aiohappyeyeballs==2.6.1
|
58 |
+
yarl==1.18.3
|
59 |
+
requests==2.32.3
|
60 |
+
pydantic_core==2.27.2
|
61 |
+
pandas==2.2.3
|
62 |
+
nvidia-cusparse-cu12==12.3.1.170
|
63 |
+
nvidia-cudnn-cu12==9.1.0.70
|
64 |
+
multiprocess==0.70.16
|
65 |
+
markdown-it-py==3.0.0
|
66 |
+
aiosignal==1.3.2
|
67 |
+
rich==13.9.4
|
68 |
+
pydantic==2.10.6
|
69 |
+
nvidia-cusolver-cu12==11.6.1.9
|
70 |
+
huggingface-hub==0.29.3
|
71 |
+
aiohttp==3.11.13
|
72 |
+
tokenizers==0.21.1
|
73 |
+
deepspeed==0.15.4
|
74 |
+
datasets==3.1.0
|
75 |
+
accelerate==1.3.0
|
76 |
+
trl==0.15.2
|
77 |
+
nvidia-ml-py==12.570.86
|
78 |
+
smmap==5.0.2
|
79 |
+
setproctitle==1.3.5
|
80 |
+
sentry-sdk==2.22.0
|
81 |
+
protobuf==5.29.3
|
82 |
+
platformdirs==4.3.6
|
83 |
+
docker-pycreds==0.4.0
|
84 |
+
click==8.1.8
|
85 |
+
gitdb==4.0.12
|
86 |
+
GitPython==3.1.44
|
87 |
+
wandb==0.19.8
|
88 |
+
sentencepiece==0.2.0
|
89 |
+
fastrlock==0.8.3
|
90 |
+
blake3==1.0.4
|
91 |
+
zipp==3.21.0
|
92 |
+
websockets==15.0.1
|
93 |
+
uvloop==0.21.0
|
94 |
+
triton==3.1.0
|
95 |
+
sniffio==1.3.1
|
96 |
+
shellingham==1.5.4
|
97 |
+
rpds-py==0.23.1
|
98 |
+
pyzmq==26.3.0
|
99 |
+
python-multipart==0.0.20
|
100 |
+
python-dotenv==1.0.1
|
101 |
+
pycountry==24.6.1
|
102 |
+
pybind11==2.13.6
|
103 |
+
prometheus_client==0.21.1
|
104 |
+
pluggy==1.5.0
|
105 |
+
pillow==11.1.0
|
106 |
+
partial-json-parser==0.2.1.1.post5
|
107 |
+
numpy==1.26.4
|
108 |
+
nest-asyncio==1.6.0
|
109 |
+
msgspec==0.19.0
|
110 |
+
llvmlite==0.43.0
|
111 |
+
lark==1.2.2
|
112 |
+
jiter==0.9.0
|
113 |
+
interegular==0.3.3
|
114 |
+
iniconfig==2.0.0
|
115 |
+
httptools==0.6.4
|
116 |
+
h11==0.14.0
|
117 |
+
einops==0.8.1
|
118 |
+
dnspython==2.7.0
|
119 |
+
distro==1.9.0
|
120 |
+
diskcache==5.6.3
|
121 |
+
cloudpickle==3.1.1
|
122 |
+
astor==0.8.1
|
123 |
+
airportsdata==20250224
|
124 |
+
uvicorn==0.34.0
|
125 |
+
tiktoken==0.9.0
|
126 |
+
referencing==0.36.2
|
127 |
+
pytest==8.3.5
|
128 |
+
opencv-python-headless==4.11.0.86
|
129 |
+
numba==0.60.0
|
130 |
+
importlib_metadata==8.6.1
|
131 |
+
httpcore==1.0.7
|
132 |
+
gguf==0.10.0
|
133 |
+
email_validator==2.2.0
|
134 |
+
depyf==0.18.0
|
135 |
+
cupy-cuda12x==13.4.0
|
136 |
+
anyio==4.8.0
|
137 |
+
watchfiles==1.0.4
|
138 |
+
typer==0.15.2
|
139 |
+
torch==2.5.1
|
140 |
+
starlette==0.46.1
|
141 |
+
rich-toolkit==0.13.2
|
142 |
+
lm-format-enforcer==0.10.11
|
143 |
+
jsonschema-specifications==2024.10.1
|
144 |
+
httpx==0.28.1
|
145 |
+
xformers==0.0.28.post3
|
146 |
+
transformers==4.49.0
|
147 |
+
torchvision==0.20.1
|
148 |
+
torchaudio==2.5.1
|
149 |
+
prometheus-fastapi-instrumentator==7.0.2
|
150 |
+
openai==1.66.3
|
151 |
+
jsonschema==4.23.0
|
152 |
+
fastapi==0.115.11
|
153 |
+
xgrammar==0.1.11
|
154 |
+
ray==2.40.0
|
155 |
+
outlines_core==0.1.26
|
156 |
+
mistral_common==1.5.3
|
157 |
+
fastapi-cli==0.0.7
|
158 |
+
compressed-tensors==0.9.1
|
159 |
+
outlines==0.1.11
|
160 |
+
vllm==0.7.3
|
161 |
+
antlr4-python3-runtime==4.9.3
|
162 |
+
omegaconf==2.3.0
|
163 |
+
hydra-core==1.3.2
|
164 |
+
rootutils==1.0.7
|
wandb/run-20250402_145246-e1n3xkh6/files/wandb-metadata.json
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-6.5.0-45-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "CPython 3.11.11",
|
4 |
+
"startedAt": "2025-04-02T14:52:46.988352Z",
|
5 |
+
"args": [
|
6 |
+
"model=gpt2xl_1.5b",
|
7 |
+
"task=gsm8k"
|
8 |
+
],
|
9 |
+
"program": "/mnt/dlabscratch1/amani/LLM-RL/src/sft_pretrain_and_pushtohub.py",
|
10 |
+
"codePath": "src/sft_pretrain_and_pushtohub.py",
|
11 |
+
"git": {
|
12 |
+
"remote": "https://github.com/aryol/LLM-RL.git",
|
13 |
+
"commit": "af916ff96a9a9f7ba10303eca8d36be0bbd89fc8"
|
14 |
+
},
|
15 |
+
"email": "[email protected]",
|
16 |
+
"root": "/mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39",
|
17 |
+
"host": "job-60b06e05eaef-0-0",
|
18 |
+
"executable": "/dlabscratch1/amani/.conda/envs/LLM-RL/bin/python",
|
19 |
+
"codePathLocal": "src/sft_pretrain_and_pushtohub.py",
|
20 |
+
"cpu_count": 64,
|
21 |
+
"cpu_count_logical": 128,
|
22 |
+
"gpu": "NVIDIA A100-SXM4-80GB",
|
23 |
+
"gpu_count": 1,
|
24 |
+
"disk": {
|
25 |
+
"/": {
|
26 |
+
"total": "7679362727936",
|
27 |
+
"used": "4235631878144"
|
28 |
+
}
|
29 |
+
},
|
30 |
+
"memory": {
|
31 |
+
"total": "1081887248384"
|
32 |
+
},
|
33 |
+
"cpu": {
|
34 |
+
"count": 64,
|
35 |
+
"countLogical": 128
|
36 |
+
},
|
37 |
+
"gpu_nvidia": [
|
38 |
+
{
|
39 |
+
"name": "NVIDIA A100-SXM4-80GB",
|
40 |
+
"memoryTotal": "85899345920",
|
41 |
+
"cudaCores": 6912,
|
42 |
+
"architecture": "Ampere"
|
43 |
+
}
|
44 |
+
],
|
45 |
+
"cudaVersion": "12.4"
|
46 |
+
}
|
wandb/run-20250402_145246-e1n3xkh6/logs/debug-core.log
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2025-04-02T14:52:46.462612938Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpw6oemm4w/port-738.txt","pid":738,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
|
2 |
+
{"time":"2025-04-02T14:52:46.466433118Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":738}
|
3 |
+
{"time":"2025-04-02T14:52:46.466813128Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":38271,"Zone":""}}
|
4 |
+
{"time":"2025-04-02T14:52:46.55291915Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:57336"}
|
5 |
+
{"time":"2025-04-02T14:52:46.99572319Z","level":"INFO","msg":"handleInformInit: received","streamId":"e1n3xkh6","id":"127.0.0.1:57336"}
|
6 |
+
{"time":"2025-04-02T14:52:47.119891001Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"e1n3xkh6","id":"127.0.0.1:57336"}
|
wandb/run-20250402_145246-e1n3xkh6/logs/debug-internal.log
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2025-04-02T14:52:46.998966444Z","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"/mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39/wandb/run-20250402_145246-e1n3xkh6/logs/debug-core.log"}
|
2 |
+
{"time":"2025-04-02T14:52:47.119833744Z","level":"INFO","msg":"created new stream","id":"e1n3xkh6"}
|
3 |
+
{"time":"2025-04-02T14:52:47.119882315Z","level":"INFO","msg":"stream: started","id":"e1n3xkh6"}
|
4 |
+
{"time":"2025-04-02T14:52:47.119921969Z","level":"INFO","msg":"handler: started","stream_id":"e1n3xkh6"}
|
5 |
+
{"time":"2025-04-02T14:52:47.119936867Z","level":"INFO","msg":"writer: Do: started","stream_id":"e1n3xkh6"}
|
6 |
+
{"time":"2025-04-02T14:52:47.120603401Z","level":"INFO","msg":"sender: started","stream_id":"e1n3xkh6"}
|
7 |
+
{"time":"2025-04-02T14:52:47.425038021Z","level":"INFO","msg":"Starting system monitor"}
|
wandb/run-20250402_145246-e1n3xkh6/logs/debug.log
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
|
2 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_setup.py:_flush():67] Configure stats pid to 738
|
3 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_setup.py:_flush():67] Loading settings from /dlabscratch1/amani/.config/wandb/settings
|
4 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_setup.py:_flush():67] Loading settings from /mnt/dlabscratch1/amani/LLM-RL/wandb/settings
|
5 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_setup.py:_flush():67] Loading settings from environment variables
|
6 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:setup_run_log_directory():647] Logging user logs to /mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39/wandb/run-20250402_145246-e1n3xkh6/logs/debug.log
|
7 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to /mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39/wandb/run-20250402_145246-e1n3xkh6/logs/debug-internal.log
|
8 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:init():761] calling init triggers
|
9 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
|
10 |
+
config: {'_wandb': {}}
|
11 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:init():784] starting backend
|
12 |
+
2025-04-02 14:52:46,977 INFO MainThread:738 [wandb_init.py:init():788] sending inform_init request
|
13 |
+
2025-04-02 14:52:46,987 INFO MainThread:738 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
14 |
+
2025-04-02 14:52:46,987 INFO MainThread:738 [wandb_init.py:init():798] backend started and connected
|
15 |
+
2025-04-02 14:52:46,989 INFO MainThread:738 [wandb_init.py:init():891] updated telemetry
|
16 |
+
2025-04-02 14:52:47,015 INFO MainThread:738 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
|
17 |
+
2025-04-02 14:52:47,419 INFO MainThread:738 [wandb_init.py:init():990] starting run threads in backend
|
18 |
+
2025-04-02 14:52:47,731 INFO MainThread:738 [wandb_run.py:_console_start():2375] atexit reg
|
19 |
+
2025-04-02 14:52:47,731 INFO MainThread:738 [wandb_run.py:_redirect():2227] redirect: wrap_raw
|
20 |
+
2025-04-02 14:52:47,731 INFO MainThread:738 [wandb_run.py:_redirect():2292] Wrapping output streams.
|
21 |
+
2025-04-02 14:52:47,731 INFO MainThread:738 [wandb_run.py:_redirect():2315] Redirects installed.
|
22 |
+
2025-04-02 14:52:47,737 INFO MainThread:738 [wandb_init.py:init():1032] run started, returning control to user process
|
23 |
+
2025-04-02 14:53:00,970 INFO MainThread:738 [wandb_run.py:_config_callback():1261] config_cb None None {'vocab_size': 50257, 'n_positions': 1024, 'n_embd': 1600, 'n_layer': 48, 'n_head': 25, 'n_inner': None, 'activation_function': 'gelu_new', 'resid_pdrop': 0.1, 'embd_pdrop': 0.1, 'attn_pdrop': 0.1, 'layer_norm_epsilon': 1e-05, 'initializer_range': 0.02, 'summary_type': 'cls_index', 'summary_use_proj': True, 'summary_activation': None, 'summary_first_dropout': 0.1, 'summary_proj_to_labels': True, 'scale_attn_weights': True, 'use_cache': True, 'scale_attn_by_inverse_layer_idx': False, 'reorder_and_upcast_attn': False, 'bos_token_id': 50256, 'eos_token_id': 50256, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float32', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['GPT2LMHeadModel'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'pad_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': {'text-generation': 
{'do_sample': True, 'max_length': 50}}, 'problem_type': None, '_name_or_path': 'openai-community/gpt2-xl', '_attn_implementation_autoset': True, 'transformers_version': '4.49.0', 'model_type': 'gpt2', 'n_ctx': 1024, 'output_past': True, 'output_dir': '/mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39', 'overwrite_output_dir': False, 'do_train': 'true,', 'do_eval': 'true,', 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 16, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 10, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 10, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 
'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '/mnt/dlabscratch1/amani/LLM-RL/logs/sft_pretrain_and_pushtohub/gsm8k-gpt2-xl/2025-04-02_14-52-39', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 
'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': True, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False, 'model_init_kwargs': None, 'use_liger': False, 'dataset_text_field': 'text', 'dataset_kwargs': None, 'dataset_num_proc': None, 'max_seq_length': 1024, 'packing': False, 'eval_packing': None, 'dataset_batch_size': None, 'num_of_sequences': None, 'chars_per_token': '<CHARS_PER_TOKEN>'}
|
24 |
+
2025-04-02 14:53:00,973 INFO MainThread:738 [wandb_config.py:__setitem__():154] config set model/num_parameters = 1557611200 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x734030d35250>>
|
25 |
+
2025-04-02 14:53:00,973 INFO MainThread:738 [wandb_run.py:_config_callback():1261] config_cb model/num_parameters 1557611200 None
|
wandb/run-20250402_145246-e1n3xkh6/run-e1n3xkh6.wandb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:868d387659ad2157b3ef2be17b045544e2cc5ec08c48443a8f782234db3a58e7
|
3 |
+
size 360448
|