wassname committed on
Commit
f0a6d5f
·
verified ·
1 Parent(s): 425d7f0

End of training

Browse files
README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: HuggingFaceTB/SmolLM2-360M
3
+ datasets:
4
+ - wassname/ultrachat_200k_filtered
5
+ library_name: transformers
6
+ model_name: SmolLM2-360M-sft
7
+ tags:
8
+ - generated_from_trainer
9
+ - alignment-handbook
10
+ licence: license
11
+ ---
12
+
13
+ # Model Card for SmolLM2-360M-sft
14
+
15
+ This model is a fine-tuned version of [HuggingFaceTB/SmolLM2-360M](https://huggingface.co/HuggingFaceTB/SmolLM2-360M) on the [wassname/ultrachat_200k_filtered](https://huggingface.co/datasets/wassname/ultrachat_200k_filtered) dataset.
16
+ It has been trained using [TRL](https://github.com/huggingface/trl).
17
+
18
+ ## Quick start
19
+
20
+ ```python
21
+ from transformers import pipeline
22
+
23
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
24
+ generator = pipeline("text-generation", model="wassname/SmolLM2-360M-sft", device="cuda")
25
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
26
+ print(output["generated_text"])
27
+ ```
28
+
29
+ ## Training procedure
30
+
31
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/wassname/huggingface/runs/gs4a36gl)
32
+
33
+ This model was trained with SFT.
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.12.1
38
+ - Transformers: 4.52.4
39
+ - Pytorch: 2.7.0
40
+ - Datasets: 3.6.0
41
+ - Tokenizers: 0.21.1
42
+
43
+ ## Citations
44
+
45
+
46
+
47
+ Cite TRL as:
48
+
49
+ ```bibtex
50
+ @misc{vonwerra2022trl,
51
+ title = {{TRL: Transformer Reinforcement Learning}},
52
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
53
+ year = 2020,
54
+ journal = {GitHub repository},
55
+ publisher = {GitHub},
56
+ howpublished = {\url{https://github.com/huggingface/trl}}
57
+ }
58
+ ```
all_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "eval_loss": 1.4015671014785767,
4
+ "eval_model_preparation_time": 0.0049,
5
+ "eval_runtime": 340.7516,
6
+ "eval_samples": 13188,
7
+ "eval_samples_per_second": 38.703,
8
+ "eval_steps_per_second": 2.421,
9
+ "total_flos": 7.533943292711404e+17,
10
+ "train_loss": 1.4212341141200129,
11
+ "train_runtime": 13229.3679,
12
+ "train_samples": 117772,
13
+ "train_samples_per_second": 14.73,
14
+ "train_steps_per_second": 0.058
15
+ }
chat_template.jinja ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {% for message in messages %}{{'<|im_start|>' + message['role'] + '
2
+ ' + message['content'] + '<|im_end|>' + '
3
+ '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
4
+ ' }}{% endif %}
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 64,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 960,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 2560,
14
+ "is_llama_config": true,
15
+ "max_position_embeddings": 8192,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 15,
19
+ "num_hidden_layers": 32,
20
+ "num_key_value_heads": 5,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-05,
24
+ "rope_interleaved": false,
25
+ "rope_scaling": null,
26
+ "rope_theta": 100000,
27
+ "tie_word_embeddings": true,
28
+ "torch_dtype": "bfloat16",
29
+ "transformers_version": "4.52.4",
30
+ "use_cache": true,
31
+ "vocab_size": 49152
32
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_loss": 1.4015671014785767,
3
+ "eval_model_preparation_time": 0.0049,
4
+ "eval_runtime": 340.7516,
5
+ "eval_samples": 13188,
6
+ "eval_samples_per_second": 38.703,
7
+ "eval_steps_per_second": 2.421
8
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.52.4"
7
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bace36a3e20c79d6b33a6ad29028cf33de1f1e2c785a520488ac7f6d00e88653
3
+ size 723674912
special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "bos_token": {
7
+ "content": "<|im_start|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "eos_token": {
14
+ "content": "<|im_end|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "pad_token": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "unk_token": {
28
+ "content": "<|endoftext|>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ }
34
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<repo_name>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "4": {
38
+ "content": "<reponame>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "5": {
46
+ "content": "<file_sep>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "6": {
54
+ "content": "<filename>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "7": {
62
+ "content": "<gh_stars>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "8": {
70
+ "content": "<issue_start>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "9": {
78
+ "content": "<issue_comment>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "10": {
86
+ "content": "<issue_closed>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "11": {
94
+ "content": "<jupyter_start>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "12": {
102
+ "content": "<jupyter_text>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "13": {
110
+ "content": "<jupyter_code>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "14": {
118
+ "content": "<jupyter_output>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": true
124
+ },
125
+ "15": {
126
+ "content": "<jupyter_script>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": true
132
+ },
133
+ "16": {
134
+ "content": "<empty_output>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": true
140
+ }
141
+ },
142
+ "additional_special_tokens": [
143
+ "<|im_start|>",
144
+ "<|im_end|>"
145
+ ],
146
+ "bos_token": "<|im_start|>",
147
+ "clean_up_tokenization_spaces": false,
148
+ "eos_token": "<|im_end|>",
149
+ "errors": "replace",
150
+ "extra_special_tokens": {},
151
+ "model_max_length": 8192,
152
+ "pad_token": "<|im_end|>",
153
+ "tokenizer_class": "GPT2Tokenizer",
154
+ "unk_token": "<|endoftext|>",
155
+ "vocab_size": 49152
156
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "total_flos": 7.533943292711404e+17,
4
+ "train_loss": 1.4212341141200129,
5
+ "train_runtime": 13229.3679,
6
+ "train_samples": 117772,
7
+ "train_samples_per_second": 14.73,
8
+ "train_steps_per_second": 0.058
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,1138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 200,
7
+ "global_step": 762,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.003940886699507389,
14
+ "grad_norm": 1.078125,
15
+ "learning_rate": 0.0,
16
+ "loss": 1.7215,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.019704433497536946,
21
+ "grad_norm": 1.078125,
22
+ "learning_rate": 1.038961038961039e-05,
23
+ "loss": 1.7672,
24
+ "step": 5
25
+ },
26
+ {
27
+ "epoch": 0.03940886699507389,
28
+ "grad_norm": 0.99609375,
29
+ "learning_rate": 2.3376623376623376e-05,
30
+ "loss": 1.7749,
31
+ "step": 10
32
+ },
33
+ {
34
+ "epoch": 0.059113300492610835,
35
+ "grad_norm": 0.8828125,
36
+ "learning_rate": 3.6363636363636364e-05,
37
+ "loss": 1.7334,
38
+ "step": 15
39
+ },
40
+ {
41
+ "epoch": 0.07881773399014778,
42
+ "grad_norm": 0.796875,
43
+ "learning_rate": 4.9350649350649355e-05,
44
+ "loss": 1.7012,
45
+ "step": 20
46
+ },
47
+ {
48
+ "epoch": 0.09852216748768473,
49
+ "grad_norm": 0.59765625,
50
+ "learning_rate": 6.233766233766233e-05,
51
+ "loss": 1.6683,
52
+ "step": 25
53
+ },
54
+ {
55
+ "epoch": 0.11822660098522167,
56
+ "grad_norm": 0.4765625,
57
+ "learning_rate": 7.532467532467533e-05,
58
+ "loss": 1.6283,
59
+ "step": 30
60
+ },
61
+ {
62
+ "epoch": 0.13793103448275862,
63
+ "grad_norm": 0.341796875,
64
+ "learning_rate": 8.831168831168831e-05,
65
+ "loss": 1.5815,
66
+ "step": 35
67
+ },
68
+ {
69
+ "epoch": 0.15763546798029557,
70
+ "grad_norm": 0.2578125,
71
+ "learning_rate": 0.0001012987012987013,
72
+ "loss": 1.5755,
73
+ "step": 40
74
+ },
75
+ {
76
+ "epoch": 0.17733990147783252,
77
+ "grad_norm": 0.2177734375,
78
+ "learning_rate": 0.00011428571428571428,
79
+ "loss": 1.5314,
80
+ "step": 45
81
+ },
82
+ {
83
+ "epoch": 0.19704433497536947,
84
+ "grad_norm": 0.185546875,
85
+ "learning_rate": 0.00012727272727272728,
86
+ "loss": 1.5258,
87
+ "step": 50
88
+ },
89
+ {
90
+ "epoch": 0.21674876847290642,
91
+ "grad_norm": 0.1611328125,
92
+ "learning_rate": 0.00014025974025974028,
93
+ "loss": 1.5131,
94
+ "step": 55
95
+ },
96
+ {
97
+ "epoch": 0.23645320197044334,
98
+ "grad_norm": 0.1484375,
99
+ "learning_rate": 0.00015324675324675325,
100
+ "loss": 1.498,
101
+ "step": 60
102
+ },
103
+ {
104
+ "epoch": 0.2561576354679803,
105
+ "grad_norm": 0.1455078125,
106
+ "learning_rate": 0.00016623376623376625,
107
+ "loss": 1.4953,
108
+ "step": 65
109
+ },
110
+ {
111
+ "epoch": 0.27586206896551724,
112
+ "grad_norm": 0.1318359375,
113
+ "learning_rate": 0.00017922077922077922,
114
+ "loss": 1.4889,
115
+ "step": 70
116
+ },
117
+ {
118
+ "epoch": 0.2955665024630542,
119
+ "grad_norm": 0.1279296875,
120
+ "learning_rate": 0.00019220779220779222,
121
+ "loss": 1.4768,
122
+ "step": 75
123
+ },
124
+ {
125
+ "epoch": 0.31527093596059114,
126
+ "grad_norm": 0.12158203125,
127
+ "learning_rate": 0.00019999579326114222,
128
+ "loss": 1.4826,
129
+ "step": 80
130
+ },
131
+ {
132
+ "epoch": 0.33497536945812806,
133
+ "grad_norm": 0.1259765625,
134
+ "learning_rate": 0.00019994847151359784,
135
+ "loss": 1.4514,
136
+ "step": 85
137
+ },
138
+ {
139
+ "epoch": 0.35467980295566504,
140
+ "grad_norm": 0.115234375,
141
+ "learning_rate": 0.0001998485945607536,
142
+ "loss": 1.469,
143
+ "step": 90
144
+ },
145
+ {
146
+ "epoch": 0.37438423645320196,
147
+ "grad_norm": 0.11865234375,
148
+ "learning_rate": 0.00019969621492020869,
149
+ "loss": 1.4381,
150
+ "step": 95
151
+ },
152
+ {
153
+ "epoch": 0.39408866995073893,
154
+ "grad_norm": 0.12890625,
155
+ "learning_rate": 0.00019949141271668306,
156
+ "loss": 1.4513,
157
+ "step": 100
158
+ },
159
+ {
160
+ "epoch": 0.41379310344827586,
161
+ "grad_norm": 0.107421875,
162
+ "learning_rate": 0.00019923429563988614,
163
+ "loss": 1.4403,
164
+ "step": 105
165
+ },
166
+ {
167
+ "epoch": 0.43349753694581283,
168
+ "grad_norm": 0.1083984375,
169
+ "learning_rate": 0.00019892499888789098,
170
+ "loss": 1.4521,
171
+ "step": 110
172
+ },
173
+ {
174
+ "epoch": 0.45320197044334976,
175
+ "grad_norm": 0.12158203125,
176
+ "learning_rate": 0.00019856368509604412,
177
+ "loss": 1.4494,
178
+ "step": 115
179
+ },
180
+ {
181
+ "epoch": 0.4729064039408867,
182
+ "grad_norm": 0.115234375,
183
+ "learning_rate": 0.00019815054425144815,
184
+ "loss": 1.4289,
185
+ "step": 120
186
+ },
187
+ {
188
+ "epoch": 0.49261083743842365,
189
+ "grad_norm": 0.1103515625,
190
+ "learning_rate": 0.00019768579359306205,
191
+ "loss": 1.4261,
192
+ "step": 125
193
+ },
194
+ {
195
+ "epoch": 0.5123152709359606,
196
+ "grad_norm": 0.1123046875,
197
+ "learning_rate": 0.00019716967749747207,
198
+ "loss": 1.4212,
199
+ "step": 130
200
+ },
201
+ {
202
+ "epoch": 0.5320197044334976,
203
+ "grad_norm": 0.1044921875,
204
+ "learning_rate": 0.00019660246735039266,
205
+ "loss": 1.4517,
206
+ "step": 135
207
+ },
208
+ {
209
+ "epoch": 0.5517241379310345,
210
+ "grad_norm": 0.1171875,
211
+ "learning_rate": 0.00019598446140396605,
212
+ "loss": 1.4208,
213
+ "step": 140
214
+ },
215
+ {
216
+ "epoch": 0.5714285714285714,
217
+ "grad_norm": 0.1103515625,
218
+ "learning_rate": 0.00019531598461993392,
219
+ "loss": 1.4357,
220
+ "step": 145
221
+ },
222
+ {
223
+ "epoch": 0.5911330049261084,
224
+ "grad_norm": 0.1015625,
225
+ "learning_rate": 0.00019459738849876543,
226
+ "loss": 1.4108,
227
+ "step": 150
228
+ },
229
+ {
230
+ "epoch": 0.6108374384236454,
231
+ "grad_norm": 0.11376953125,
232
+ "learning_rate": 0.00019382905089482995,
233
+ "loss": 1.4396,
234
+ "step": 155
235
+ },
236
+ {
237
+ "epoch": 0.6305418719211823,
238
+ "grad_norm": 0.0986328125,
239
+ "learning_rate": 0.00019301137581771266,
240
+ "loss": 1.4114,
241
+ "step": 160
242
+ },
243
+ {
244
+ "epoch": 0.6502463054187192,
245
+ "grad_norm": 0.10302734375,
246
+ "learning_rate": 0.00019214479321977697,
247
+ "loss": 1.4221,
248
+ "step": 165
249
+ },
250
+ {
251
+ "epoch": 0.6699507389162561,
252
+ "grad_norm": 0.10400390625,
253
+ "learning_rate": 0.00019122975877008567,
254
+ "loss": 1.4234,
255
+ "step": 170
256
+ },
257
+ {
258
+ "epoch": 0.6896551724137931,
259
+ "grad_norm": 0.1044921875,
260
+ "learning_rate": 0.00019026675361479969,
261
+ "loss": 1.4378,
262
+ "step": 175
263
+ },
264
+ {
265
+ "epoch": 0.7093596059113301,
266
+ "grad_norm": 0.10400390625,
267
+ "learning_rate": 0.0001892562841241804,
268
+ "loss": 1.4178,
269
+ "step": 180
270
+ },
271
+ {
272
+ "epoch": 0.729064039408867,
273
+ "grad_norm": 0.10693359375,
274
+ "learning_rate": 0.00018819888162632838,
275
+ "loss": 1.4221,
276
+ "step": 185
277
+ },
278
+ {
279
+ "epoch": 0.7487684729064039,
280
+ "grad_norm": 0.1005859375,
281
+ "learning_rate": 0.00018709510212779903,
282
+ "loss": 1.4267,
283
+ "step": 190
284
+ },
285
+ {
286
+ "epoch": 0.7684729064039408,
287
+ "grad_norm": 0.107421875,
288
+ "learning_rate": 0.0001859455260212414,
289
+ "loss": 1.4226,
290
+ "step": 195
291
+ },
292
+ {
293
+ "epoch": 0.7881773399014779,
294
+ "grad_norm": 0.10986328125,
295
+ "learning_rate": 0.00018475075778021438,
296
+ "loss": 1.4328,
297
+ "step": 200
298
+ },
299
+ {
300
+ "epoch": 0.7881773399014779,
301
+ "eval_loss": 1.4307746887207031,
302
+ "eval_runtime": 143.9646,
303
+ "eval_samples_per_second": 50.116,
304
+ "eval_steps_per_second": 3.133,
305
+ "step": 200
306
+ },
307
+ {
308
+ "epoch": 0.8078817733990148,
309
+ "grad_norm": 0.10009765625,
310
+ "learning_rate": 0.00018351142564134078,
311
+ "loss": 1.4341,
312
+ "step": 205
313
+ },
314
+ {
315
+ "epoch": 0.8275862068965517,
316
+ "grad_norm": 0.0966796875,
317
+ "learning_rate": 0.0001822281812739659,
318
+ "loss": 1.4172,
319
+ "step": 210
320
+ },
321
+ {
322
+ "epoch": 0.8472906403940886,
323
+ "grad_norm": 0.10009765625,
324
+ "learning_rate": 0.00018090169943749476,
325
+ "loss": 1.41,
326
+ "step": 215
327
+ },
328
+ {
329
+ "epoch": 0.8669950738916257,
330
+ "grad_norm": 0.10400390625,
331
+ "learning_rate": 0.00017953267762658827,
332
+ "loss": 1.4099,
333
+ "step": 220
334
+ },
335
+ {
336
+ "epoch": 0.8866995073891626,
337
+ "grad_norm": 0.099609375,
338
+ "learning_rate": 0.00017812183570440428,
339
+ "loss": 1.4176,
340
+ "step": 225
341
+ },
342
+ {
343
+ "epoch": 0.9064039408866995,
344
+ "grad_norm": 0.10546875,
345
+ "learning_rate": 0.00017666991552407724,
346
+ "loss": 1.4101,
347
+ "step": 230
348
+ },
349
+ {
350
+ "epoch": 0.9261083743842364,
351
+ "grad_norm": 0.1044921875,
352
+ "learning_rate": 0.0001751776805386344,
353
+ "loss": 1.4029,
354
+ "step": 235
355
+ },
356
+ {
357
+ "epoch": 0.9458128078817734,
358
+ "grad_norm": 0.10009765625,
359
+ "learning_rate": 0.000173645915399555,
360
+ "loss": 1.3944,
361
+ "step": 240
362
+ },
363
+ {
364
+ "epoch": 0.9655172413793104,
365
+ "grad_norm": 0.0986328125,
366
+ "learning_rate": 0.00017207542554418227,
367
+ "loss": 1.4001,
368
+ "step": 245
369
+ },
370
+ {
371
+ "epoch": 0.9852216748768473,
372
+ "grad_norm": 0.103515625,
373
+ "learning_rate": 0.000170467036772206,
374
+ "loss": 1.4076,
375
+ "step": 250
376
+ },
377
+ {
378
+ "epoch": 1.0039408866995074,
379
+ "grad_norm": 0.10009765625,
380
+ "learning_rate": 0.00016882159481143802,
381
+ "loss": 1.4025,
382
+ "step": 255
383
+ },
384
+ {
385
+ "epoch": 1.0236453201970444,
386
+ "grad_norm": 0.09423828125,
387
+ "learning_rate": 0.00016713996487310916,
388
+ "loss": 1.405,
389
+ "step": 260
390
+ },
391
+ {
392
+ "epoch": 1.0433497536945813,
393
+ "grad_norm": 0.09619140625,
394
+ "learning_rate": 0.00016542303119692129,
395
+ "loss": 1.4017,
396
+ "step": 265
397
+ },
398
+ {
399
+ "epoch": 1.0630541871921182,
400
+ "grad_norm": 0.10107421875,
401
+ "learning_rate": 0.00016367169658609355,
402
+ "loss": 1.4038,
403
+ "step": 270
404
+ },
405
+ {
406
+ "epoch": 1.0827586206896551,
407
+ "grad_norm": 0.0966796875,
408
+ "learning_rate": 0.0001618868819326479,
409
+ "loss": 1.3942,
410
+ "step": 275
411
+ },
412
+ {
413
+ "epoch": 1.102463054187192,
414
+ "grad_norm": 0.10595703125,
415
+ "learning_rate": 0.00016006952573318278,
416
+ "loss": 1.3961,
417
+ "step": 280
418
+ },
419
+ {
420
+ "epoch": 1.1221674876847292,
421
+ "grad_norm": 0.10107421875,
422
+ "learning_rate": 0.00015822058359539002,
423
+ "loss": 1.4007,
424
+ "step": 285
425
+ },
426
+ {
427
+ "epoch": 1.141871921182266,
428
+ "grad_norm": 0.10009765625,
429
+ "learning_rate": 0.0001563410277355743,
430
+ "loss": 1.4071,
431
+ "step": 290
432
+ },
433
+ {
434
+ "epoch": 1.161576354679803,
435
+ "grad_norm": 0.09814453125,
436
+ "learning_rate": 0.0001544318464674397,
437
+ "loss": 1.385,
438
+ "step": 295
439
+ },
440
+ {
441
+ "epoch": 1.18128078817734,
442
+ "grad_norm": 0.0947265625,
443
+ "learning_rate": 0.00015249404368241116,
444
+ "loss": 1.3933,
445
+ "step": 300
446
+ },
447
+ {
448
+ "epoch": 1.2009852216748769,
449
+ "grad_norm": 0.09423828125,
450
+ "learning_rate": 0.0001505286383217657,
451
+ "loss": 1.3886,
452
+ "step": 305
453
+ },
454
+ {
455
+ "epoch": 1.2206896551724138,
456
+ "grad_norm": 0.10302734375,
457
+ "learning_rate": 0.0001485366638408496,
458
+ "loss": 1.387,
459
+ "step": 310
460
+ },
461
+ {
462
+ "epoch": 1.2403940886699507,
463
+ "grad_norm": 0.107421875,
464
+ "learning_rate": 0.0001465191676656634,
465
+ "loss": 1.3949,
466
+ "step": 315
467
+ },
468
+ {
469
+ "epoch": 1.2600985221674876,
470
+ "grad_norm": 0.09423828125,
471
+ "learning_rate": 0.00014447721064210186,
472
+ "loss": 1.3927,
473
+ "step": 320
474
+ },
475
+ {
476
+ "epoch": 1.2798029556650246,
477
+ "grad_norm": 0.09619140625,
478
+ "learning_rate": 0.00014241186647813626,
479
+ "loss": 1.4009,
480
+ "step": 325
481
+ },
482
+ {
483
+ "epoch": 1.2995073891625615,
484
+ "grad_norm": 0.091796875,
485
+ "learning_rate": 0.00014032422117923426,
486
+ "loss": 1.3838,
487
+ "step": 330
488
+ },
489
+ {
490
+ "epoch": 1.3192118226600984,
491
+ "grad_norm": 0.0947265625,
492
+ "learning_rate": 0.00013821537247731336,
493
+ "loss": 1.3958,
494
+ "step": 335
495
+ },
496
+ {
497
+ "epoch": 1.3389162561576355,
498
+ "grad_norm": 0.0947265625,
499
+ "learning_rate": 0.00013608642925352793,
500
+ "loss": 1.4111,
501
+ "step": 340
502
+ },
503
+ {
504
+ "epoch": 1.3586206896551725,
505
+ "grad_norm": 0.0966796875,
506
+ "learning_rate": 0.00013393851095519423,
507
+ "loss": 1.4007,
508
+ "step": 345
509
+ },
510
+ {
511
+ "epoch": 1.3783251231527094,
512
+ "grad_norm": 0.09521484375,
513
+ "learning_rate": 0.00013177274700715914,
514
+ "loss": 1.3933,
515
+ "step": 350
516
+ },
517
+ {
518
+ "epoch": 1.3980295566502463,
519
+ "grad_norm": 0.09814453125,
520
+ "learning_rate": 0.00012959027621792265,
521
+ "loss": 1.3997,
522
+ "step": 355
523
+ },
524
+ {
525
+ "epoch": 1.4177339901477832,
526
+ "grad_norm": 0.09716796875,
527
+ "learning_rate": 0.00012739224618082612,
528
+ "loss": 1.4075,
529
+ "step": 360
530
+ },
531
+ {
532
+ "epoch": 1.4374384236453202,
533
+ "grad_norm": 0.1064453125,
534
+ "learning_rate": 0.00012517981267062134,
535
+ "loss": 1.3927,
536
+ "step": 365
537
+ },
538
+ {
539
+ "epoch": 1.457142857142857,
540
+ "grad_norm": 0.095703125,
541
+ "learning_rate": 0.00012295413903573756,
542
+ "loss": 1.3956,
543
+ "step": 370
544
+ },
545
+ {
546
+ "epoch": 1.4768472906403942,
547
+ "grad_norm": 0.09423828125,
548
+ "learning_rate": 0.00012071639558656614,
549
+ "loss": 1.4016,
550
+ "step": 375
551
+ },
552
+ {
553
+ "epoch": 1.4965517241379311,
554
+ "grad_norm": 0.0947265625,
555
+ "learning_rate": 0.00011846775898008438,
556
+ "loss": 1.3889,
557
+ "step": 380
558
+ },
559
+ {
560
+ "epoch": 1.516256157635468,
561
+ "grad_norm": 0.0986328125,
562
+ "learning_rate": 0.00011620941160114229,
563
+ "loss": 1.3962,
564
+ "step": 385
565
+ },
566
+ {
567
+ "epoch": 1.535960591133005,
568
+ "grad_norm": 0.095703125,
569
+ "learning_rate": 0.0001139425409407374,
570
+ "loss": 1.3996,
571
+ "step": 390
572
+ },
573
+ {
574
+ "epoch": 1.555665024630542,
575
+ "grad_norm": 0.09228515625,
576
+ "learning_rate": 0.00011166833897160465,
577
+ "loss": 1.4013,
578
+ "step": 395
579
+ },
580
+ {
581
+ "epoch": 1.5753694581280788,
582
+ "grad_norm": 0.09814453125,
583
+ "learning_rate": 0.00010938800152144984,
584
+ "loss": 1.3788,
585
+ "step": 400
586
+ },
587
+ {
588
+ "epoch": 1.5753694581280788,
589
+ "eval_loss": 1.4156588315963745,
590
+ "eval_runtime": 144.1156,
591
+ "eval_samples_per_second": 50.064,
592
+ "eval_steps_per_second": 3.129,
593
+ "step": 400
594
+ },
595
+ {
596
+ "epoch": 1.5950738916256157,
597
+ "grad_norm": 0.09521484375,
598
+ "learning_rate": 0.00010710272764415566,
599
+ "loss": 1.3997,
600
+ "step": 405
601
+ },
602
+ {
603
+ "epoch": 1.6147783251231527,
604
+ "grad_norm": 0.09375,
605
+ "learning_rate": 0.00010481371898929186,
606
+ "loss": 1.3844,
607
+ "step": 410
608
+ },
609
+ {
610
+ "epoch": 1.6344827586206896,
611
+ "grad_norm": 0.095703125,
612
+ "learning_rate": 0.0001025221791702601,
613
+ "loss": 1.4087,
614
+ "step": 415
615
+ },
616
+ {
617
+ "epoch": 1.6541871921182265,
618
+ "grad_norm": 0.09765625,
619
+ "learning_rate": 0.00010022931313140638,
620
+ "loss": 1.3909,
621
+ "step": 420
622
+ },
623
+ {
624
+ "epoch": 1.6738916256157634,
625
+ "grad_norm": 0.091796875,
626
+ "learning_rate": 9.793632651443357e-05,
627
+ "loss": 1.3925,
628
+ "step": 425
629
+ },
630
+ {
631
+ "epoch": 1.6935960591133004,
632
+ "grad_norm": 0.09375,
633
+ "learning_rate": 9.564442502444735e-05,
634
+ "loss": 1.3906,
635
+ "step": 430
636
+ },
637
+ {
638
+ "epoch": 1.7133004926108373,
639
+ "grad_norm": 0.09521484375,
640
+ "learning_rate": 9.33548137959686e-05,
641
+ "loss": 1.4122,
642
+ "step": 435
643
+ },
644
+ {
645
+ "epoch": 1.7330049261083744,
646
+ "grad_norm": 0.1025390625,
647
+ "learning_rate": 9.106869675924605e-05,
648
+ "loss": 1.3932,
649
+ "step": 440
650
+ },
651
+ {
652
+ "epoch": 1.7527093596059113,
653
+ "grad_norm": 0.0947265625,
654
+ "learning_rate": 8.878727600720207e-05,
655
+ "loss": 1.3974,
656
+ "step": 445
657
+ },
658
+ {
659
+ "epoch": 1.7724137931034483,
660
+ "grad_norm": 0.0927734375,
661
+ "learning_rate": 8.651175116334443e-05,
662
+ "loss": 1.3872,
663
+ "step": 450
664
+ },
665
+ {
666
+ "epoch": 1.7921182266009852,
667
+ "grad_norm": 0.09619140625,
668
+ "learning_rate": 8.424331875097688e-05,
669
+ "loss": 1.4025,
670
+ "step": 455
671
+ },
672
+ {
673
+ "epoch": 1.8118226600985223,
674
+ "grad_norm": 0.0966796875,
675
+ "learning_rate": 8.19831715640394e-05,
676
+ "loss": 1.4054,
677
+ "step": 460
678
+ },
679
+ {
680
+ "epoch": 1.8315270935960593,
681
+ "grad_norm": 0.09326171875,
682
+ "learning_rate": 7.973249803991006e-05,
683
+ "loss": 1.3878,
684
+ "step": 465
685
+ },
686
+ {
687
+ "epoch": 1.8512315270935962,
688
+ "grad_norm": 0.0986328125,
689
+ "learning_rate": 7.749248163449693e-05,
690
+ "loss": 1.3862,
691
+ "step": 470
692
+ },
693
+ {
694
+ "epoch": 1.870935960591133,
695
+ "grad_norm": 0.099609375,
696
+ "learning_rate": 7.526430019995001e-05,
697
+ "loss": 1.4118,
698
+ "step": 475
699
+ },
700
+ {
701
+ "epoch": 1.89064039408867,
702
+ "grad_norm": 0.09326171875,
703
+ "learning_rate": 7.304912536531944e-05,
704
+ "loss": 1.3954,
705
+ "step": 480
706
+ },
707
+ {
708
+ "epoch": 1.910344827586207,
709
+ "grad_norm": 0.09130859375,
710
+ "learning_rate": 7.084812192048594e-05,
711
+ "loss": 1.4003,
712
+ "step": 485
713
+ },
714
+ {
715
+ "epoch": 1.9300492610837439,
716
+ "grad_norm": 0.1005859375,
717
+ "learning_rate": 6.866244720368737e-05,
718
+ "loss": 1.3998,
719
+ "step": 490
720
+ },
721
+ {
722
+ "epoch": 1.9497536945812808,
723
+ "grad_norm": 0.09423828125,
724
+ "learning_rate": 6.6493250492964e-05,
725
+ "loss": 1.3837,
726
+ "step": 495
727
+ },
728
+ {
729
+ "epoch": 1.9694581280788177,
730
+ "grad_norm": 0.09326171875,
731
+ "learning_rate": 6.434167240184135e-05,
732
+ "loss": 1.411,
733
+ "step": 500
734
+ },
735
+ {
736
+ "epoch": 1.9891625615763546,
737
+ "grad_norm": 0.09228515625,
738
+ "learning_rate": 6.220884427956953e-05,
739
+ "loss": 1.3949,
740
+ "step": 505
741
+ },
742
+ {
743
+ "epoch": 2.007881773399015,
744
+ "grad_norm": 0.0927734375,
745
+ "learning_rate": 6.0095887616233796e-05,
746
+ "loss": 1.3871,
747
+ "step": 510
748
+ },
749
+ {
750
+ "epoch": 2.027586206896552,
751
+ "grad_norm": 0.09228515625,
752
+ "learning_rate": 5.800391345304914e-05,
753
+ "loss": 1.3871,
754
+ "step": 515
755
+ },
756
+ {
757
+ "epoch": 2.0472906403940887,
758
+ "grad_norm": 0.09033203125,
759
+ "learning_rate": 5.593402179814944e-05,
760
+ "loss": 1.3887,
761
+ "step": 520
762
+ },
763
+ {
764
+ "epoch": 2.0669950738916256,
765
+ "grad_norm": 0.0927734375,
766
+ "learning_rate": 5.388730104817769e-05,
767
+ "loss": 1.3913,
768
+ "step": 525
769
+ },
770
+ {
771
+ "epoch": 2.0866995073891625,
772
+ "grad_norm": 0.095703125,
773
+ "learning_rate": 5.18648274159821e-05,
774
+ "loss": 1.3854,
775
+ "step": 530
776
+ },
777
+ {
778
+ "epoch": 2.1064039408866995,
779
+ "grad_norm": 0.09326171875,
780
+ "learning_rate": 4.9867664364718725e-05,
781
+ "loss": 1.3915,
782
+ "step": 535
783
+ },
784
+ {
785
+ "epoch": 2.1261083743842364,
786
+ "grad_norm": 0.09033203125,
787
+ "learning_rate": 4.7896862048657965e-05,
788
+ "loss": 1.3917,
789
+ "step": 540
790
+ },
791
+ {
792
+ "epoch": 2.1458128078817733,
793
+ "grad_norm": 0.095703125,
794
+ "learning_rate": 4.595345676098923e-05,
795
+ "loss": 1.3855,
796
+ "step": 545
797
+ },
798
+ {
799
+ "epoch": 2.1655172413793102,
800
+ "grad_norm": 0.09130859375,
801
+ "learning_rate": 4.403847038891424e-05,
802
+ "loss": 1.3838,
803
+ "step": 550
804
+ },
805
+ {
806
+ "epoch": 2.185221674876847,
807
+ "grad_norm": 0.09033203125,
808
+ "learning_rate": 4.2152909876315316e-05,
809
+ "loss": 1.3919,
810
+ "step": 555
811
+ },
812
+ {
813
+ "epoch": 2.204926108374384,
814
+ "grad_norm": 0.0966796875,
815
+ "learning_rate": 4.0297766694280915e-05,
816
+ "loss": 1.3979,
817
+ "step": 560
818
+ },
819
+ {
820
+ "epoch": 2.224630541871921,
821
+ "grad_norm": 0.091796875,
822
+ "learning_rate": 3.8474016319767435e-05,
823
+ "loss": 1.3865,
824
+ "step": 565
825
+ },
826
+ {
827
+ "epoch": 2.2443349753694584,
828
+ "grad_norm": 0.0986328125,
829
+ "learning_rate": 3.6682617722671096e-05,
830
+ "loss": 1.3903,
831
+ "step": 570
832
+ },
833
+ {
834
+ "epoch": 2.264039408866995,
835
+ "grad_norm": 0.09033203125,
836
+ "learning_rate": 3.4924512861579315e-05,
837
+ "loss": 1.3841,
838
+ "step": 575
839
+ },
840
+ {
841
+ "epoch": 2.283743842364532,
842
+ "grad_norm": 0.09375,
843
+ "learning_rate": 3.3200626188467344e-05,
844
+ "loss": 1.3965,
845
+ "step": 580
846
+ },
847
+ {
848
+ "epoch": 2.303448275862069,
849
+ "grad_norm": 0.08984375,
850
+ "learning_rate": 3.151186416260006e-05,
851
+ "loss": 1.4112,
852
+ "step": 585
853
+ },
854
+ {
855
+ "epoch": 2.323152709359606,
856
+ "grad_norm": 0.09521484375,
857
+ "learning_rate": 2.9859114773895025e-05,
858
+ "loss": 1.385,
859
+ "step": 590
860
+ },
861
+ {
862
+ "epoch": 2.342857142857143,
863
+ "grad_norm": 0.0986328125,
864
+ "learning_rate": 2.8243247075996693e-05,
865
+ "loss": 1.3838,
866
+ "step": 595
867
+ },
868
+ {
869
+ "epoch": 2.36256157635468,
870
+ "grad_norm": 0.0927734375,
871
+ "learning_rate": 2.6665110729308263e-05,
872
+ "loss": 1.3938,
873
+ "step": 600
874
+ },
875
+ {
876
+ "epoch": 2.36256157635468,
877
+ "eval_loss": 1.4132238626480103,
878
+ "eval_runtime": 144.0354,
879
+ "eval_samples_per_second": 50.092,
880
+ "eval_steps_per_second": 3.131,
881
+ "step": 600
882
+ },
883
+ {
884
+ "epoch": 2.382266009852217,
885
+ "grad_norm": 0.09423828125,
886
+ "learning_rate": 2.5125535554220482e-05,
887
+ "loss": 1.3974,
888
+ "step": 605
889
+ },
890
+ {
891
+ "epoch": 2.4019704433497537,
892
+ "grad_norm": 0.09375,
893
+ "learning_rate": 2.3625331094773206e-05,
894
+ "loss": 1.3814,
895
+ "step": 610
896
+ },
897
+ {
898
+ "epoch": 2.4216748768472907,
899
+ "grad_norm": 0.091796875,
900
+ "learning_rate": 2.2165286192978342e-05,
901
+ "loss": 1.3858,
902
+ "step": 615
903
+ },
904
+ {
905
+ "epoch": 2.4413793103448276,
906
+ "grad_norm": 0.0947265625,
907
+ "learning_rate": 2.074616857402867e-05,
908
+ "loss": 1.3804,
909
+ "step": 620
910
+ },
911
+ {
912
+ "epoch": 2.4610837438423645,
913
+ "grad_norm": 0.0927734375,
914
+ "learning_rate": 1.936872444261022e-05,
915
+ "loss": 1.3868,
916
+ "step": 625
917
+ },
918
+ {
919
+ "epoch": 2.4807881773399014,
920
+ "grad_norm": 0.09228515625,
921
+ "learning_rate": 1.8033678090530813e-05,
922
+ "loss": 1.3923,
923
+ "step": 630
924
+ },
925
+ {
926
+ "epoch": 2.5004926108374383,
927
+ "grad_norm": 0.091796875,
928
+ "learning_rate": 1.6741731515870594e-05,
929
+ "loss": 1.3889,
930
+ "step": 635
931
+ },
932
+ {
933
+ "epoch": 2.5201970443349753,
934
+ "grad_norm": 0.09814453125,
935
+ "learning_rate": 1.549356405385538e-05,
936
+ "loss": 1.3736,
937
+ "step": 640
938
+ },
939
+ {
940
+ "epoch": 2.539901477832512,
941
+ "grad_norm": 0.0947265625,
942
+ "learning_rate": 1.428983201964662e-05,
943
+ "loss": 1.3955,
944
+ "step": 645
945
+ },
946
+ {
947
+ "epoch": 2.559605911330049,
948
+ "grad_norm": 0.09375,
949
+ "learning_rate": 1.313116836323568e-05,
950
+ "loss": 1.3946,
951
+ "step": 650
952
+ },
953
+ {
954
+ "epoch": 2.5793103448275865,
955
+ "grad_norm": 0.09521484375,
956
+ "learning_rate": 1.2018182336624273e-05,
957
+ "loss": 1.3907,
958
+ "step": 655
959
+ },
960
+ {
961
+ "epoch": 2.599014778325123,
962
+ "grad_norm": 0.09326171875,
963
+ "learning_rate": 1.0951459173465629e-05,
964
+ "loss": 1.4041,
965
+ "step": 660
966
+ },
967
+ {
968
+ "epoch": 2.6187192118226603,
969
+ "grad_norm": 0.09423828125,
970
+ "learning_rate": 9.93155978133541e-06,
971
+ "loss": 1.391,
972
+ "step": 665
973
+ },
974
+ {
975
+ "epoch": 2.638423645320197,
976
+ "grad_norm": 0.09033203125,
977
+ "learning_rate": 8.959020446793288e-06,
978
+ "loss": 1.3882,
979
+ "step": 670
980
+ },
981
+ {
982
+ "epoch": 2.658128078817734,
983
+ "grad_norm": 0.09375,
984
+ "learning_rate": 8.034352553391367e-06,
985
+ "loss": 1.4001,
986
+ "step": 675
987
+ },
988
+ {
989
+ "epoch": 2.677832512315271,
990
+ "grad_norm": 0.0927734375,
991
+ "learning_rate": 7.158042312776847e-06,
992
+ "loss": 1.3824,
993
+ "step": 680
994
+ },
995
+ {
996
+ "epoch": 2.697536945812808,
997
+ "grad_norm": 0.0908203125,
998
+ "learning_rate": 6.330550509030852e-06,
999
+ "loss": 1.379,
1000
+ "step": 685
1001
+ },
1002
+ {
1003
+ "epoch": 2.717241379310345,
1004
+ "grad_norm": 0.08984375,
1005
+ "learning_rate": 5.552312256377423e-06,
1006
+ "loss": 1.3787,
1007
+ "step": 690
1008
+ },
1009
+ {
1010
+ "epoch": 2.736945812807882,
1011
+ "grad_norm": 0.09228515625,
1012
+ "learning_rate": 4.823736770390552e-06,
1013
+ "loss": 1.3902,
1014
+ "step": 695
1015
+ },
1016
+ {
1017
+ "epoch": 2.7566502463054188,
1018
+ "grad_norm": 0.0927734375,
1019
+ "learning_rate": 4.14520715281923e-06,
1020
+ "loss": 1.3991,
1021
+ "step": 700
1022
+ },
1023
+ {
1024
+ "epoch": 2.7763546798029557,
1025
+ "grad_norm": 0.09375,
1026
+ "learning_rate": 3.517080190143629e-06,
1027
+ "loss": 1.3866,
1028
+ "step": 705
1029
+ },
1030
+ {
1031
+ "epoch": 2.7960591133004926,
1032
+ "grad_norm": 0.09033203125,
1033
+ "learning_rate": 2.9396861659686915e-06,
1034
+ "loss": 1.3864,
1035
+ "step": 710
1036
+ },
1037
+ {
1038
+ "epoch": 2.8157635467980295,
1039
+ "grad_norm": 0.0927734375,
1040
+ "learning_rate": 2.4133286873533112e-06,
1041
+ "loss": 1.373,
1042
+ "step": 715
1043
+ },
1044
+ {
1045
+ "epoch": 2.8354679802955665,
1046
+ "grad_norm": 0.0966796875,
1047
+ "learning_rate": 1.9382845251668335e-06,
1048
+ "loss": 1.384,
1049
+ "step": 720
1050
+ },
1051
+ {
1052
+ "epoch": 2.8551724137931034,
1053
+ "grad_norm": 0.091796875,
1054
+ "learning_rate": 1.514803468556547e-06,
1055
+ "loss": 1.3768,
1056
+ "step": 725
1057
+ },
1058
+ {
1059
+ "epoch": 2.8748768472906403,
1060
+ "grad_norm": 0.0908203125,
1061
+ "learning_rate": 1.14310819360276e-06,
1062
+ "loss": 1.3968,
1063
+ "step": 730
1064
+ },
1065
+ {
1066
+ "epoch": 2.8945812807881772,
1067
+ "grad_norm": 0.09130859375,
1068
+ "learning_rate": 8.233941462306271e-07,
1069
+ "loss": 1.3811,
1070
+ "step": 735
1071
+ },
1072
+ {
1073
+ "epoch": 2.914285714285714,
1074
+ "grad_norm": 0.0908203125,
1075
+ "learning_rate": 5.558294394402253e-07,
1076
+ "loss": 1.4115,
1077
+ "step": 740
1078
+ },
1079
+ {
1080
+ "epoch": 2.933990147783251,
1081
+ "grad_norm": 0.08984375,
1082
+ "learning_rate": 3.405547649087959e-07,
1083
+ "loss": 1.4119,
1084
+ "step": 745
1085
+ },
1086
+ {
1087
+ "epoch": 2.9536945812807884,
1088
+ "grad_norm": 0.09130859375,
1089
+ "learning_rate": 1.7768331901187875e-07,
1090
+ "loss": 1.395,
1091
+ "step": 750
1092
+ },
1093
+ {
1094
+ "epoch": 2.973399014778325,
1095
+ "grad_norm": 0.0947265625,
1096
+ "learning_rate": 6.730074330203451e-08,
1097
+ "loss": 1.3966,
1098
+ "step": 755
1099
+ },
1100
+ {
1101
+ "epoch": 2.9931034482758623,
1102
+ "grad_norm": 0.09228515625,
1103
+ "learning_rate": 9.46507947655606e-09,
1104
+ "loss": 1.3686,
1105
+ "step": 760
1106
+ },
1107
+ {
1108
+ "epoch": 3.0,
1109
+ "step": 762,
1110
+ "total_flos": 7.533943292711404e+17,
1111
+ "train_loss": 1.4212341141200129,
1112
+ "train_runtime": 13229.3679,
1113
+ "train_samples_per_second": 14.73,
1114
+ "train_steps_per_second": 0.058
1115
+ }
1116
+ ],
1117
+ "logging_steps": 5,
1118
+ "max_steps": 762,
1119
+ "num_input_tokens_seen": 0,
1120
+ "num_train_epochs": 3,
1121
+ "save_steps": 1000000,
1122
+ "stateful_callbacks": {
1123
+ "TrainerControl": {
1124
+ "args": {
1125
+ "should_epoch_stop": false,
1126
+ "should_evaluate": false,
1127
+ "should_log": false,
1128
+ "should_save": true,
1129
+ "should_training_stop": true
1130
+ },
1131
+ "attributes": {}
1132
+ }
1133
+ },
1134
+ "total_flos": 7.533943292711404e+17,
1135
+ "train_batch_size": 16,
1136
+ "trial_name": null,
1137
+ "trial_params": null
1138
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:322f1ffde8fdd17fac032eb4de12fa918db83e53d71b410355cd08c604b2f527
3
+ size 6097
vocab.json ADDED
The diff for this file is too large to render. See raw diff