agentlans commited on
Commit
106cd9c
·
verified ·
1 Parent(s): 1531570

Upload 13 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ source.spm filter=lfs diff=lfs merge=lfs -text
37
+ target.spm filter=lfs diff=lfs merge=lfs -text
all_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "eval_loss": 2.435030460357666,
4
+ "eval_runtime": 2.5197,
5
+ "eval_samples": 2500,
6
+ "eval_samples_per_second": 992.173,
7
+ "eval_steps_per_second": 124.22,
8
+ "num_input_tokens_seen": 55653732,
9
+ "total_flos": 1.4738832163602432e+16,
10
+ "train_loss": 1.7073542784139526,
11
+ "train_runtime": 2504.3889,
12
+ "train_samples": 248732,
13
+ "train_samples_per_second": 297.955,
14
+ "train_steps_per_second": 37.245,
15
+ "train_tokens_per_second": 22216.164
16
+ }
config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Helsinki-NLP/opus-mt-zh-en",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "swish",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "MarianMTModel"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.0,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 512,
15
+ "decoder_attention_heads": 8,
16
+ "decoder_ffn_dim": 2048,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 6,
19
+ "decoder_start_token_id": 65000,
20
+ "decoder_vocab_size": 65001,
21
+ "dropout": 0.1,
22
+ "encoder_attention_heads": 8,
23
+ "encoder_ffn_dim": 2048,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 6,
26
+ "eos_token_id": 0,
27
+ "extra_pos_embeddings": 65001,
28
+ "forced_eos_token_id": 0,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1",
32
+ "2": "LABEL_2"
33
+ },
34
+ "init_std": 0.02,
35
+ "is_encoder_decoder": true,
36
+ "label2id": {
37
+ "LABEL_0": 0,
38
+ "LABEL_1": 1,
39
+ "LABEL_2": 2
40
+ },
41
+ "max_length": null,
42
+ "max_position_embeddings": 512,
43
+ "model_type": "marian",
44
+ "normalize_before": false,
45
+ "normalize_embedding": false,
46
+ "num_beams": null,
47
+ "num_hidden_layers": 6,
48
+ "pad_token_id": 65000,
49
+ "scale_embedding": true,
50
+ "share_encoder_decoder_embeddings": true,
51
+ "static_position_embeddings": true,
52
+ "torch_dtype": "float32",
53
+ "transformers_version": "4.48.1",
54
+ "use_cache": true,
55
+ "vocab_size": 65001
56
+ }
eval_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "eval_loss": 2.435030460357666,
4
+ "eval_runtime": 2.5197,
5
+ "eval_samples": 2500,
6
+ "eval_samples_per_second": 992.173,
7
+ "eval_steps_per_second": 124.22,
8
+ "num_input_tokens_seen": 55653732
9
+ }
generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bad_words_ids": [
3
+ [
4
+ 65000
5
+ ]
6
+ ],
7
+ "bos_token_id": 0,
8
+ "decoder_start_token_id": 65000,
9
+ "eos_token_id": 0,
10
+ "forced_eos_token_id": 0,
11
+ "max_length": 512,
12
+ "num_beams": 6,
13
+ "pad_token_id": 65000,
14
+ "renormalize_logits": true,
15
+ "transformers_version": "4.48.1"
16
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f894a6d080dfe10bed879a7505ab686823e3d5ba15eed0f44ea70a17462fe4a
3
+ size 309965092
source.spm ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e27a3a1b539f4959ec72ea60e453f49156289f95d4e6000b29332efc45616203
3
+ size 804677
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "pad_token": "<pad>",
4
+ "unk_token": "<unk>"
5
+ }
target.spm ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a881f4717cd7265f53fea54fd3dc689c767c05338fac7a4590f3088cb2d7855
3
+ size 806530
tokenizer_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "</s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "65000": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": false,
29
+ "eos_token": "</s>",
30
+ "extra_special_tokens": {},
31
+ "model_max_length": 512,
32
+ "pad_token": "<pad>",
33
+ "separate_vocabs": false,
34
+ "source_lang": "zho",
35
+ "sp_model_kwargs": {},
36
+ "target_lang": "eng",
37
+ "tokenizer_class": "MarianTokenizer",
38
+ "unk_token": "<unk>"
39
+ }
train_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "num_input_tokens_seen": 55653732,
4
+ "total_flos": 1.4738832163602432e+16,
5
+ "train_loss": 1.7073542784139526,
6
+ "train_runtime": 2504.3889,
7
+ "train_samples": 248732,
8
+ "train_samples_per_second": 297.955,
9
+ "train_steps_per_second": 37.245,
10
+ "train_tokens_per_second": 22216.164
11
+ }
trainer_state.json ADDED
@@ -0,0 +1,1865 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 2.435030460357666,
3
+ "best_model_checkpoint": "zhtw-en/checkpoint-92500",
4
+ "epoch": 3.0,
5
+ "eval_steps": 2500,
6
+ "global_step": 93276,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.016081307088640164,
13
+ "grad_norm": 8.550692558288574,
14
+ "learning_rate": 4.919593464556799e-05,
15
+ "loss": 3.4707,
16
+ "num_input_tokens_seen": 301408,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 0.03216261417728033,
21
+ "grad_norm": 9.999076843261719,
22
+ "learning_rate": 4.8391869291135987e-05,
23
+ "loss": 3.3664,
24
+ "num_input_tokens_seen": 597672,
25
+ "step": 1000
26
+ },
27
+ {
28
+ "epoch": 0.048243921265920496,
29
+ "grad_norm": 8.839406967163086,
30
+ "learning_rate": 4.7587803936703975e-05,
31
+ "loss": 3.3089,
32
+ "num_input_tokens_seen": 896520,
33
+ "step": 1500
34
+ },
35
+ {
36
+ "epoch": 0.06432522835456066,
37
+ "grad_norm": 7.105090618133545,
38
+ "learning_rate": 4.678373858227197e-05,
39
+ "loss": 3.267,
40
+ "num_input_tokens_seen": 1194832,
41
+ "step": 2000
42
+ },
43
+ {
44
+ "epoch": 0.08040653544320082,
45
+ "grad_norm": 7.992733001708984,
46
+ "learning_rate": 4.597967322783996e-05,
47
+ "loss": 3.2254,
48
+ "num_input_tokens_seen": 1493088,
49
+ "step": 2500
50
+ },
51
+ {
52
+ "epoch": 0.08040653544320082,
53
+ "eval_loss": 2.910461664199829,
54
+ "eval_runtime": 2.5005,
55
+ "eval_samples_per_second": 999.788,
56
+ "eval_steps_per_second": 125.173,
57
+ "num_input_tokens_seen": 1493088,
58
+ "step": 2500
59
+ },
60
+ {
61
+ "epoch": 0.09648784253184099,
62
+ "grad_norm": 8.68657112121582,
63
+ "learning_rate": 4.5175607873407955e-05,
64
+ "loss": 3.1691,
65
+ "num_input_tokens_seen": 1793976,
66
+ "step": 3000
67
+ },
68
+ {
69
+ "epoch": 0.11256914962048115,
70
+ "grad_norm": 7.246800899505615,
71
+ "learning_rate": 4.4371542518975943e-05,
72
+ "loss": 3.1685,
73
+ "num_input_tokens_seen": 2095352,
74
+ "step": 3500
75
+ },
76
+ {
77
+ "epoch": 0.1286504567091213,
78
+ "grad_norm": 9.030860900878906,
79
+ "learning_rate": 4.356747716454393e-05,
80
+ "loss": 3.1333,
81
+ "num_input_tokens_seen": 2393856,
82
+ "step": 4000
83
+ },
84
+ {
85
+ "epoch": 0.14473176379776148,
86
+ "grad_norm": 7.463845252990723,
87
+ "learning_rate": 4.276341181011193e-05,
88
+ "loss": 3.1295,
89
+ "num_input_tokens_seen": 2694496,
90
+ "step": 4500
91
+ },
92
+ {
93
+ "epoch": 0.16081307088640165,
94
+ "grad_norm": 8.482089042663574,
95
+ "learning_rate": 4.195934645567992e-05,
96
+ "loss": 3.0946,
97
+ "num_input_tokens_seen": 2990968,
98
+ "step": 5000
99
+ },
100
+ {
101
+ "epoch": 0.16081307088640165,
102
+ "eval_loss": 2.830476999282837,
103
+ "eval_runtime": 2.5201,
104
+ "eval_samples_per_second": 992.009,
105
+ "eval_steps_per_second": 124.199,
106
+ "num_input_tokens_seen": 2990968,
107
+ "step": 5000
108
+ },
109
+ {
110
+ "epoch": 0.17689437797504182,
111
+ "grad_norm": 8.570518493652344,
112
+ "learning_rate": 4.115528110124791e-05,
113
+ "loss": 3.112,
114
+ "num_input_tokens_seen": 3289488,
115
+ "step": 5500
116
+ },
117
+ {
118
+ "epoch": 0.19297568506368198,
119
+ "grad_norm": 9.759325981140137,
120
+ "learning_rate": 4.03512157468159e-05,
121
+ "loss": 3.0933,
122
+ "num_input_tokens_seen": 3590264,
123
+ "step": 6000
124
+ },
125
+ {
126
+ "epoch": 0.20905699215232215,
127
+ "grad_norm": 6.518988609313965,
128
+ "learning_rate": 3.9547150392383896e-05,
129
+ "loss": 3.0858,
130
+ "num_input_tokens_seen": 3885160,
131
+ "step": 6500
132
+ },
133
+ {
134
+ "epoch": 0.2251382992409623,
135
+ "grad_norm": 6.913475036621094,
136
+ "learning_rate": 3.8743085037951885e-05,
137
+ "loss": 3.0543,
138
+ "num_input_tokens_seen": 4184600,
139
+ "step": 7000
140
+ },
141
+ {
142
+ "epoch": 0.24121960632960246,
143
+ "grad_norm": 8.485562324523926,
144
+ "learning_rate": 3.793901968351988e-05,
145
+ "loss": 3.0473,
146
+ "num_input_tokens_seen": 4477792,
147
+ "step": 7500
148
+ },
149
+ {
150
+ "epoch": 0.24121960632960246,
151
+ "eval_loss": 2.773728847503662,
152
+ "eval_runtime": 2.5738,
153
+ "eval_samples_per_second": 971.323,
154
+ "eval_steps_per_second": 121.61,
155
+ "num_input_tokens_seen": 4477792,
156
+ "step": 7500
157
+ },
158
+ {
159
+ "epoch": 0.2573009134182426,
160
+ "grad_norm": 7.89262056350708,
161
+ "learning_rate": 3.713495432908787e-05,
162
+ "loss": 3.029,
163
+ "num_input_tokens_seen": 4779520,
164
+ "step": 8000
165
+ },
166
+ {
167
+ "epoch": 0.2733822205068828,
168
+ "grad_norm": 6.879751205444336,
169
+ "learning_rate": 3.6330888974655864e-05,
170
+ "loss": 3.0127,
171
+ "num_input_tokens_seen": 5078952,
172
+ "step": 8500
173
+ },
174
+ {
175
+ "epoch": 0.28946352759552296,
176
+ "grad_norm": 8.109273910522461,
177
+ "learning_rate": 3.552682362022385e-05,
178
+ "loss": 3.0078,
179
+ "num_input_tokens_seen": 5376128,
180
+ "step": 9000
181
+ },
182
+ {
183
+ "epoch": 0.3055448346841631,
184
+ "grad_norm": 8.074146270751953,
185
+ "learning_rate": 3.472275826579184e-05,
186
+ "loss": 2.9988,
187
+ "num_input_tokens_seen": 5671664,
188
+ "step": 9500
189
+ },
190
+ {
191
+ "epoch": 0.3216261417728033,
192
+ "grad_norm": 6.523529529571533,
193
+ "learning_rate": 3.391869291135984e-05,
194
+ "loss": 2.9633,
195
+ "num_input_tokens_seen": 5967560,
196
+ "step": 10000
197
+ },
198
+ {
199
+ "epoch": 0.3216261417728033,
200
+ "eval_loss": 2.7306864261627197,
201
+ "eval_runtime": 2.505,
202
+ "eval_samples_per_second": 997.986,
203
+ "eval_steps_per_second": 124.948,
204
+ "num_input_tokens_seen": 5967560,
205
+ "step": 10000
206
+ },
207
+ {
208
+ "epoch": 0.33770744886144344,
209
+ "grad_norm": 6.974269866943359,
210
+ "learning_rate": 3.311462755692783e-05,
211
+ "loss": 2.9732,
212
+ "num_input_tokens_seen": 6265312,
213
+ "step": 10500
214
+ },
215
+ {
216
+ "epoch": 0.35378875595008363,
217
+ "grad_norm": 7.644798278808594,
218
+ "learning_rate": 3.231056220249582e-05,
219
+ "loss": 2.9729,
220
+ "num_input_tokens_seen": 6563632,
221
+ "step": 11000
222
+ },
223
+ {
224
+ "epoch": 0.3698700630387238,
225
+ "grad_norm": 7.96437406539917,
226
+ "learning_rate": 3.150649684806381e-05,
227
+ "loss": 2.9484,
228
+ "num_input_tokens_seen": 6865528,
229
+ "step": 11500
230
+ },
231
+ {
232
+ "epoch": 0.38595137012736397,
233
+ "grad_norm": 7.939519882202148,
234
+ "learning_rate": 3.0702431493631805e-05,
235
+ "loss": 2.9387,
236
+ "num_input_tokens_seen": 7165632,
237
+ "step": 12000
238
+ },
239
+ {
240
+ "epoch": 0.4020326772160041,
241
+ "grad_norm": 7.698306083679199,
242
+ "learning_rate": 2.9898366139199797e-05,
243
+ "loss": 2.9355,
244
+ "num_input_tokens_seen": 7463192,
245
+ "step": 12500
246
+ },
247
+ {
248
+ "epoch": 0.4020326772160041,
249
+ "eval_loss": 2.684298515319824,
250
+ "eval_runtime": 2.5214,
251
+ "eval_samples_per_second": 991.52,
252
+ "eval_steps_per_second": 124.138,
253
+ "num_input_tokens_seen": 7463192,
254
+ "step": 12500
255
+ },
256
+ {
257
+ "epoch": 0.4181139843046443,
258
+ "grad_norm": 7.21583890914917,
259
+ "learning_rate": 2.9094300784767786e-05,
260
+ "loss": 2.9418,
261
+ "num_input_tokens_seen": 7758024,
262
+ "step": 13000
263
+ },
264
+ {
265
+ "epoch": 0.43419529139328444,
266
+ "grad_norm": 7.767180919647217,
267
+ "learning_rate": 2.8290235430335778e-05,
268
+ "loss": 2.923,
269
+ "num_input_tokens_seen": 8052032,
270
+ "step": 13500
271
+ },
272
+ {
273
+ "epoch": 0.4502765984819246,
274
+ "grad_norm": 7.057159423828125,
275
+ "learning_rate": 2.7486170075903773e-05,
276
+ "loss": 2.9016,
277
+ "num_input_tokens_seen": 8347768,
278
+ "step": 14000
279
+ },
280
+ {
281
+ "epoch": 0.4663579055705648,
282
+ "grad_norm": 7.320003032684326,
283
+ "learning_rate": 2.6682104721471762e-05,
284
+ "loss": 2.885,
285
+ "num_input_tokens_seen": 8646192,
286
+ "step": 14500
287
+ },
288
+ {
289
+ "epoch": 0.4824392126592049,
290
+ "grad_norm": 7.630561828613281,
291
+ "learning_rate": 2.587803936703975e-05,
292
+ "loss": 2.9076,
293
+ "num_input_tokens_seen": 8950264,
294
+ "step": 15000
295
+ },
296
+ {
297
+ "epoch": 0.4824392126592049,
298
+ "eval_loss": 2.658709764480591,
299
+ "eval_runtime": 2.6566,
300
+ "eval_samples_per_second": 941.035,
301
+ "eval_steps_per_second": 117.818,
302
+ "num_input_tokens_seen": 8950264,
303
+ "step": 15000
304
+ },
305
+ {
306
+ "epoch": 0.4985205197478451,
307
+ "grad_norm": 6.413602828979492,
308
+ "learning_rate": 2.507397401260775e-05,
309
+ "loss": 2.8818,
310
+ "num_input_tokens_seen": 9247272,
311
+ "step": 15500
312
+ },
313
+ {
314
+ "epoch": 0.5146018268364853,
315
+ "grad_norm": 6.833747863769531,
316
+ "learning_rate": 2.426990865817574e-05,
317
+ "loss": 2.8998,
318
+ "num_input_tokens_seen": 9547432,
319
+ "step": 16000
320
+ },
321
+ {
322
+ "epoch": 0.5306831339251254,
323
+ "grad_norm": 7.930506706237793,
324
+ "learning_rate": 2.3465843303743727e-05,
325
+ "loss": 2.8543,
326
+ "num_input_tokens_seen": 9844072,
327
+ "step": 16500
328
+ },
329
+ {
330
+ "epoch": 0.5467644410137656,
331
+ "grad_norm": 6.841344356536865,
332
+ "learning_rate": 2.2661777949311722e-05,
333
+ "loss": 2.8669,
334
+ "num_input_tokens_seen": 10142344,
335
+ "step": 17000
336
+ },
337
+ {
338
+ "epoch": 0.5628457481024057,
339
+ "grad_norm": 6.899343967437744,
340
+ "learning_rate": 2.185771259487971e-05,
341
+ "loss": 2.8714,
342
+ "num_input_tokens_seen": 10443344,
343
+ "step": 17500
344
+ },
345
+ {
346
+ "epoch": 0.5628457481024057,
347
+ "eval_loss": 2.6303601264953613,
348
+ "eval_runtime": 2.5354,
349
+ "eval_samples_per_second": 986.046,
350
+ "eval_steps_per_second": 123.453,
351
+ "num_input_tokens_seen": 10443344,
352
+ "step": 17500
353
+ },
354
+ {
355
+ "epoch": 0.5789270551910459,
356
+ "grad_norm": 6.256689071655273,
357
+ "learning_rate": 2.1053647240447703e-05,
358
+ "loss": 2.8418,
359
+ "num_input_tokens_seen": 10744312,
360
+ "step": 18000
361
+ },
362
+ {
363
+ "epoch": 0.5950083622796861,
364
+ "grad_norm": 7.627821445465088,
365
+ "learning_rate": 2.0249581886015695e-05,
366
+ "loss": 2.8462,
367
+ "num_input_tokens_seen": 11048208,
368
+ "step": 18500
369
+ },
370
+ {
371
+ "epoch": 0.6110896693683262,
372
+ "grad_norm": 7.331953525543213,
373
+ "learning_rate": 1.9445516531583687e-05,
374
+ "loss": 2.8345,
375
+ "num_input_tokens_seen": 11347880,
376
+ "step": 19000
377
+ },
378
+ {
379
+ "epoch": 0.6271709764569664,
380
+ "grad_norm": 6.463207244873047,
381
+ "learning_rate": 1.864145117715168e-05,
382
+ "loss": 2.8531,
383
+ "num_input_tokens_seen": 11650144,
384
+ "step": 19500
385
+ },
386
+ {
387
+ "epoch": 0.6432522835456066,
388
+ "grad_norm": 7.423746109008789,
389
+ "learning_rate": 1.783738582271967e-05,
390
+ "loss": 2.8716,
391
+ "num_input_tokens_seen": 11951096,
392
+ "step": 20000
393
+ },
394
+ {
395
+ "epoch": 0.6432522835456066,
396
+ "eval_loss": 2.6024744510650635,
397
+ "eval_runtime": 2.5182,
398
+ "eval_samples_per_second": 992.755,
399
+ "eval_steps_per_second": 124.293,
400
+ "num_input_tokens_seen": 11951096,
401
+ "step": 20000
402
+ },
403
+ {
404
+ "epoch": 0.6593335906342468,
405
+ "grad_norm": 7.352589130401611,
406
+ "learning_rate": 1.7033320468287664e-05,
407
+ "loss": 2.8243,
408
+ "num_input_tokens_seen": 12252592,
409
+ "step": 20500
410
+ },
411
+ {
412
+ "epoch": 0.6754148977228869,
413
+ "grad_norm": 7.22981071472168,
414
+ "learning_rate": 1.6229255113855656e-05,
415
+ "loss": 2.8454,
416
+ "num_input_tokens_seen": 12546792,
417
+ "step": 21000
418
+ },
419
+ {
420
+ "epoch": 0.6914962048115271,
421
+ "grad_norm": 6.819567680358887,
422
+ "learning_rate": 1.5425189759423648e-05,
423
+ "loss": 2.8047,
424
+ "num_input_tokens_seen": 12838728,
425
+ "step": 21500
426
+ },
427
+ {
428
+ "epoch": 0.7075775119001673,
429
+ "grad_norm": 8.716426849365234,
430
+ "learning_rate": 1.4621124404991638e-05,
431
+ "loss": 2.8144,
432
+ "num_input_tokens_seen": 13137688,
433
+ "step": 22000
434
+ },
435
+ {
436
+ "epoch": 0.7236588189888075,
437
+ "grad_norm": 7.324875831604004,
438
+ "learning_rate": 1.381705905055963e-05,
439
+ "loss": 2.7989,
440
+ "num_input_tokens_seen": 13432464,
441
+ "step": 22500
442
+ },
443
+ {
444
+ "epoch": 0.7236588189888075,
445
+ "eval_loss": 2.5822224617004395,
446
+ "eval_runtime": 2.5158,
447
+ "eval_samples_per_second": 993.705,
448
+ "eval_steps_per_second": 124.412,
449
+ "num_input_tokens_seen": 13432464,
450
+ "step": 22500
451
+ },
452
+ {
453
+ "epoch": 0.7397401260774475,
454
+ "grad_norm": 7.962778568267822,
455
+ "learning_rate": 1.301299369612762e-05,
456
+ "loss": 2.7653,
457
+ "num_input_tokens_seen": 13730024,
458
+ "step": 23000
459
+ },
460
+ {
461
+ "epoch": 0.7558214331660877,
462
+ "grad_norm": 6.807019233703613,
463
+ "learning_rate": 1.2208928341695614e-05,
464
+ "loss": 2.7933,
465
+ "num_input_tokens_seen": 14026400,
466
+ "step": 23500
467
+ },
468
+ {
469
+ "epoch": 0.7719027402547279,
470
+ "grad_norm": 8.716556549072266,
471
+ "learning_rate": 1.1404862987263605e-05,
472
+ "loss": 2.7988,
473
+ "num_input_tokens_seen": 14326608,
474
+ "step": 24000
475
+ },
476
+ {
477
+ "epoch": 0.787984047343368,
478
+ "grad_norm": 7.388988018035889,
479
+ "learning_rate": 1.0600797632831597e-05,
480
+ "loss": 2.7928,
481
+ "num_input_tokens_seen": 14623864,
482
+ "step": 24500
483
+ },
484
+ {
485
+ "epoch": 0.8040653544320082,
486
+ "grad_norm": 7.011099815368652,
487
+ "learning_rate": 9.796732278399589e-06,
488
+ "loss": 2.7941,
489
+ "num_input_tokens_seen": 14919424,
490
+ "step": 25000
491
+ },
492
+ {
493
+ "epoch": 0.8040653544320082,
494
+ "eval_loss": 2.5630149841308594,
495
+ "eval_runtime": 2.5601,
496
+ "eval_samples_per_second": 976.534,
497
+ "eval_steps_per_second": 122.262,
498
+ "num_input_tokens_seen": 14919424,
499
+ "step": 25000
500
+ },
501
+ {
502
+ "epoch": 0.8201466615206484,
503
+ "grad_norm": 6.740393161773682,
504
+ "learning_rate": 8.992666923967581e-06,
505
+ "loss": 2.8089,
506
+ "num_input_tokens_seen": 15216136,
507
+ "step": 25500
508
+ },
509
+ {
510
+ "epoch": 0.8362279686092886,
511
+ "grad_norm": 7.124479293823242,
512
+ "learning_rate": 8.188601569535573e-06,
513
+ "loss": 2.7704,
514
+ "num_input_tokens_seen": 15515592,
515
+ "step": 26000
516
+ },
517
+ {
518
+ "epoch": 0.8523092756979287,
519
+ "grad_norm": 7.781102180480957,
520
+ "learning_rate": 7.384536215103564e-06,
521
+ "loss": 2.8022,
522
+ "num_input_tokens_seen": 15818560,
523
+ "step": 26500
524
+ },
525
+ {
526
+ "epoch": 0.8683905827865689,
527
+ "grad_norm": 6.861135005950928,
528
+ "learning_rate": 6.580470860671556e-06,
529
+ "loss": 2.7891,
530
+ "num_input_tokens_seen": 16114056,
531
+ "step": 27000
532
+ },
533
+ {
534
+ "epoch": 0.8844718898752091,
535
+ "grad_norm": 7.128973484039307,
536
+ "learning_rate": 5.776405506239547e-06,
537
+ "loss": 2.7692,
538
+ "num_input_tokens_seen": 16415080,
539
+ "step": 27500
540
+ },
541
+ {
542
+ "epoch": 0.8844718898752091,
543
+ "eval_loss": 2.5496785640716553,
544
+ "eval_runtime": 2.6422,
545
+ "eval_samples_per_second": 946.175,
546
+ "eval_steps_per_second": 118.461,
547
+ "num_input_tokens_seen": 16415080,
548
+ "step": 27500
549
+ },
550
+ {
551
+ "epoch": 0.9005531969638492,
552
+ "grad_norm": 8.560084342956543,
553
+ "learning_rate": 4.9723401518075395e-06,
554
+ "loss": 2.7627,
555
+ "num_input_tokens_seen": 16711136,
556
+ "step": 28000
557
+ },
558
+ {
559
+ "epoch": 0.9166345040524894,
560
+ "grad_norm": 7.5000224113464355,
561
+ "learning_rate": 4.168274797375531e-06,
562
+ "loss": 2.7687,
563
+ "num_input_tokens_seen": 17005880,
564
+ "step": 28500
565
+ },
566
+ {
567
+ "epoch": 0.9327158111411296,
568
+ "grad_norm": 6.699025630950928,
569
+ "learning_rate": 3.3642094429435228e-06,
570
+ "loss": 2.779,
571
+ "num_input_tokens_seen": 17307064,
572
+ "step": 29000
573
+ },
574
+ {
575
+ "epoch": 0.9487971182297698,
576
+ "grad_norm": 6.6417131423950195,
577
+ "learning_rate": 2.560144088511514e-06,
578
+ "loss": 2.7493,
579
+ "num_input_tokens_seen": 17602296,
580
+ "step": 29500
581
+ },
582
+ {
583
+ "epoch": 0.9648784253184098,
584
+ "grad_norm": 6.775792121887207,
585
+ "learning_rate": 1.756078734079506e-06,
586
+ "loss": 2.757,
587
+ "num_input_tokens_seen": 17897832,
588
+ "step": 30000
589
+ },
590
+ {
591
+ "epoch": 0.9648784253184098,
592
+ "eval_loss": 2.5388031005859375,
593
+ "eval_runtime": 2.6123,
594
+ "eval_samples_per_second": 957.018,
595
+ "eval_steps_per_second": 119.819,
596
+ "num_input_tokens_seen": 17897832,
597
+ "step": 30000
598
+ },
599
+ {
600
+ "epoch": 0.98095973240705,
601
+ "grad_norm": 7.619235038757324,
602
+ "learning_rate": 9.520133796474978e-07,
603
+ "loss": 2.7433,
604
+ "num_input_tokens_seen": 18195568,
605
+ "step": 30500
606
+ },
607
+ {
608
+ "epoch": 0.9970410394956902,
609
+ "grad_norm": 6.682379722595215,
610
+ "learning_rate": 1.479480252154895e-07,
611
+ "loss": 2.7266,
612
+ "num_input_tokens_seen": 18491904,
613
+ "step": 31000
614
+ },
615
+ {
616
+ "epoch": 1.0131223465843304,
617
+ "grad_norm": 6.5948309898376465,
618
+ "learning_rate": 3.311462755692783e-05,
619
+ "loss": 2.6645,
620
+ "num_input_tokens_seen": 18790628,
621
+ "step": 31500
622
+ },
623
+ {
624
+ "epoch": 1.0292036536729705,
625
+ "grad_norm": 6.920671463012695,
626
+ "learning_rate": 3.2846605772117164e-05,
627
+ "loss": 2.6881,
628
+ "num_input_tokens_seen": 19090780,
629
+ "step": 32000
630
+ },
631
+ {
632
+ "epoch": 1.0452849607616108,
633
+ "grad_norm": 6.296219348907471,
634
+ "learning_rate": 3.257858398730649e-05,
635
+ "loss": 2.7024,
636
+ "num_input_tokens_seen": 19384812,
637
+ "step": 32500
638
+ },
639
+ {
640
+ "epoch": 1.0452849607616108,
641
+ "eval_loss": 2.6005640029907227,
642
+ "eval_runtime": 2.5084,
643
+ "eval_samples_per_second": 996.636,
644
+ "eval_steps_per_second": 124.779,
645
+ "num_input_tokens_seen": 19384812,
646
+ "step": 32500
647
+ },
648
+ {
649
+ "epoch": 1.061366267850251,
650
+ "grad_norm": 7.068648815155029,
651
+ "learning_rate": 3.231056220249582e-05,
652
+ "loss": 2.6939,
653
+ "num_input_tokens_seen": 19683060,
654
+ "step": 33000
655
+ },
656
+ {
657
+ "epoch": 1.077447574938891,
658
+ "grad_norm": 5.753154754638672,
659
+ "learning_rate": 3.204254041768515e-05,
660
+ "loss": 2.6977,
661
+ "num_input_tokens_seen": 19979196,
662
+ "step": 33500
663
+ },
664
+ {
665
+ "epoch": 1.0935288820275313,
666
+ "grad_norm": 8.155505180358887,
667
+ "learning_rate": 3.1774518632874485e-05,
668
+ "loss": 2.7048,
669
+ "num_input_tokens_seen": 20278524,
670
+ "step": 34000
671
+ },
672
+ {
673
+ "epoch": 1.1096101891161714,
674
+ "grad_norm": 7.031659126281738,
675
+ "learning_rate": 3.150649684806381e-05,
676
+ "loss": 2.7237,
677
+ "num_input_tokens_seen": 20577572,
678
+ "step": 34500
679
+ },
680
+ {
681
+ "epoch": 1.1256914962048115,
682
+ "grad_norm": 7.90298318862915,
683
+ "learning_rate": 3.123847506325314e-05,
684
+ "loss": 2.7248,
685
+ "num_input_tokens_seen": 20876844,
686
+ "step": 35000
687
+ },
688
+ {
689
+ "epoch": 1.1256914962048115,
690
+ "eval_loss": 2.6041972637176514,
691
+ "eval_runtime": 2.7564,
692
+ "eval_samples_per_second": 906.969,
693
+ "eval_steps_per_second": 113.552,
694
+ "num_input_tokens_seen": 20876844,
695
+ "step": 35000
696
+ },
697
+ {
698
+ "epoch": 1.1417728032934518,
699
+ "grad_norm": 6.368433475494385,
700
+ "learning_rate": 3.0970453278442473e-05,
701
+ "loss": 2.7246,
702
+ "num_input_tokens_seen": 21179820,
703
+ "step": 35500
704
+ },
705
+ {
706
+ "epoch": 1.1578541103820919,
707
+ "grad_norm": 7.143220901489258,
708
+ "learning_rate": 3.0702431493631805e-05,
709
+ "loss": 2.7211,
710
+ "num_input_tokens_seen": 21476172,
711
+ "step": 36000
712
+ },
713
+ {
714
+ "epoch": 1.173935417470732,
715
+ "grad_norm": 7.216341972351074,
716
+ "learning_rate": 3.0434409708821134e-05,
717
+ "loss": 2.7088,
718
+ "num_input_tokens_seen": 21774292,
719
+ "step": 36500
720
+ },
721
+ {
722
+ "epoch": 1.1900167245593722,
723
+ "grad_norm": 6.958596706390381,
724
+ "learning_rate": 3.0166387924010465e-05,
725
+ "loss": 2.7166,
726
+ "num_input_tokens_seen": 22070908,
727
+ "step": 37000
728
+ },
729
+ {
730
+ "epoch": 1.2060980316480123,
731
+ "grad_norm": 7.161530494689941,
732
+ "learning_rate": 2.9898366139199797e-05,
733
+ "loss": 2.6764,
734
+ "num_input_tokens_seen": 22372340,
735
+ "step": 37500
736
+ },
737
+ {
738
+ "epoch": 1.2060980316480123,
739
+ "eval_loss": 2.5923423767089844,
740
+ "eval_runtime": 2.6849,
741
+ "eval_samples_per_second": 931.145,
742
+ "eval_steps_per_second": 116.579,
743
+ "num_input_tokens_seen": 22372340,
744
+ "step": 37500
745
+ },
746
+ {
747
+ "epoch": 1.2221793387366526,
748
+ "grad_norm": 7.448612213134766,
749
+ "learning_rate": 2.963034435438913e-05,
750
+ "loss": 2.7098,
751
+ "num_input_tokens_seen": 22672932,
752
+ "step": 38000
753
+ },
754
+ {
755
+ "epoch": 1.2382606458252927,
756
+ "grad_norm": 8.339189529418945,
757
+ "learning_rate": 2.9362322569578454e-05,
758
+ "loss": 2.702,
759
+ "num_input_tokens_seen": 22971844,
760
+ "step": 38500
761
+ },
762
+ {
763
+ "epoch": 1.2543419529139328,
764
+ "grad_norm": 6.795124053955078,
765
+ "learning_rate": 2.9094300784767786e-05,
766
+ "loss": 2.7007,
767
+ "num_input_tokens_seen": 23266964,
768
+ "step": 39000
769
+ },
770
+ {
771
+ "epoch": 1.2704232600025729,
772
+ "grad_norm": 6.3036298751831055,
773
+ "learning_rate": 2.8826278999957118e-05,
774
+ "loss": 2.71,
775
+ "num_input_tokens_seen": 23564068,
776
+ "step": 39500
777
+ },
778
+ {
779
+ "epoch": 1.2865045670912132,
780
+ "grad_norm": 8.75069808959961,
781
+ "learning_rate": 2.855825721514645e-05,
782
+ "loss": 2.6854,
783
+ "num_input_tokens_seen": 23866100,
784
+ "step": 40000
785
+ },
786
+ {
787
+ "epoch": 1.2865045670912132,
788
+ "eval_loss": 2.5792863368988037,
789
+ "eval_runtime": 2.5776,
790
+ "eval_samples_per_second": 969.878,
791
+ "eval_steps_per_second": 121.429,
792
+ "num_input_tokens_seen": 23866100,
793
+ "step": 40000
794
+ },
795
+ {
796
+ "epoch": 1.3025858741798533,
797
+ "grad_norm": 6.966170310974121,
798
+ "learning_rate": 2.8290235430335778e-05,
799
+ "loss": 2.697,
800
+ "num_input_tokens_seen": 24162356,
801
+ "step": 40500
802
+ },
803
+ {
804
+ "epoch": 1.3186671812684936,
805
+ "grad_norm": 7.854964733123779,
806
+ "learning_rate": 2.802221364552511e-05,
807
+ "loss": 2.6954,
808
+ "num_input_tokens_seen": 24458980,
809
+ "step": 41000
810
+ },
811
+ {
812
+ "epoch": 1.3347484883571337,
813
+ "grad_norm": 7.1461944580078125,
814
+ "learning_rate": 2.775419186071444e-05,
815
+ "loss": 2.6839,
816
+ "num_input_tokens_seen": 24757828,
817
+ "step": 41500
818
+ },
819
+ {
820
+ "epoch": 1.3508297954457738,
821
+ "grad_norm": 8.25295639038086,
822
+ "learning_rate": 2.7486170075903773e-05,
823
+ "loss": 2.7035,
824
+ "num_input_tokens_seen": 25052236,
825
+ "step": 42000
826
+ },
827
+ {
828
+ "epoch": 1.366911102534414,
829
+ "grad_norm": 6.336223602294922,
830
+ "learning_rate": 2.7218148291093105e-05,
831
+ "loss": 2.683,
832
+ "num_input_tokens_seen": 25348084,
833
+ "step": 42500
834
+ },
835
+ {
836
+ "epoch": 1.366911102534414,
837
+ "eval_loss": 2.5722219944000244,
838
+ "eval_runtime": 2.7384,
839
+ "eval_samples_per_second": 912.958,
840
+ "eval_steps_per_second": 114.302,
841
+ "num_input_tokens_seen": 25348084,
842
+ "step": 42500
843
+ },
844
+ {
845
+ "epoch": 1.3829924096230541,
846
+ "grad_norm": 9.477555274963379,
847
+ "learning_rate": 2.695012650628243e-05,
848
+ "loss": 2.6877,
849
+ "num_input_tokens_seen": 25642372,
850
+ "step": 43000
851
+ },
852
+ {
853
+ "epoch": 1.3990737167116944,
854
+ "grad_norm": 8.233431816101074,
855
+ "learning_rate": 2.6682104721471762e-05,
856
+ "loss": 2.6927,
857
+ "num_input_tokens_seen": 25939652,
858
+ "step": 43500
859
+ },
860
+ {
861
+ "epoch": 1.4151550238003345,
862
+ "grad_norm": 5.860446929931641,
863
+ "learning_rate": 2.6414082936661094e-05,
864
+ "loss": 2.6819,
865
+ "num_input_tokens_seen": 26248940,
866
+ "step": 44000
867
+ },
868
+ {
869
+ "epoch": 1.4312363308889746,
870
+ "grad_norm": 6.748124599456787,
871
+ "learning_rate": 2.6146061151850426e-05,
872
+ "loss": 2.6893,
873
+ "num_input_tokens_seen": 26552860,
874
+ "step": 44500
875
+ },
876
+ {
877
+ "epoch": 1.4473176379776147,
878
+ "grad_norm": 6.038182258605957,
879
+ "learning_rate": 2.587803936703975e-05,
880
+ "loss": 2.6871,
881
+ "num_input_tokens_seen": 26854100,
882
+ "step": 45000
883
+ },
884
+ {
885
+ "epoch": 1.4473176379776147,
886
+ "eval_loss": 2.5538456439971924,
887
+ "eval_runtime": 2.6078,
888
+ "eval_samples_per_second": 958.67,
889
+ "eval_steps_per_second": 120.026,
890
+ "num_input_tokens_seen": 26854100,
891
+ "step": 45000
892
+ },
893
+ {
894
+ "epoch": 1.463398945066255,
895
+ "grad_norm": 7.815784454345703,
896
+ "learning_rate": 2.5610017582229086e-05,
897
+ "loss": 2.6709,
898
+ "num_input_tokens_seen": 27148148,
899
+ "step": 45500
900
+ },
901
+ {
902
+ "epoch": 1.479480252154895,
903
+ "grad_norm": 7.8851094245910645,
904
+ "learning_rate": 2.5341995797418418e-05,
905
+ "loss": 2.6698,
906
+ "num_input_tokens_seen": 27445020,
907
+ "step": 46000
908
+ },
909
+ {
910
+ "epoch": 1.4955615592435354,
911
+ "grad_norm": 7.389246940612793,
912
+ "learning_rate": 2.507397401260775e-05,
913
+ "loss": 2.6787,
914
+ "num_input_tokens_seen": 27742908,
915
+ "step": 46500
916
+ },
917
+ {
918
+ "epoch": 1.5116428663321755,
919
+ "grad_norm": 7.621913909912109,
920
+ "learning_rate": 2.4805952227797078e-05,
921
+ "loss": 2.6713,
922
+ "num_input_tokens_seen": 28037284,
923
+ "step": 47000
924
+ },
925
+ {
926
+ "epoch": 1.5277241734208156,
927
+ "grad_norm": 7.889066219329834,
928
+ "learning_rate": 2.4537930442986407e-05,
929
+ "loss": 2.6551,
930
+ "num_input_tokens_seen": 28332612,
931
+ "step": 47500
932
+ },
933
+ {
934
+ "epoch": 1.5277241734208156,
935
+ "eval_loss": 2.5442593097686768,
936
+ "eval_runtime": 2.6341,
937
+ "eval_samples_per_second": 949.086,
938
+ "eval_steps_per_second": 118.826,
939
+ "num_input_tokens_seen": 28332612,
940
+ "step": 47500
941
+ },
942
+ {
943
+ "epoch": 1.5438054805094557,
944
+ "grad_norm": 7.912906646728516,
945
+ "learning_rate": 2.426990865817574e-05,
946
+ "loss": 2.6881,
947
+ "num_input_tokens_seen": 28630948,
948
+ "step": 48000
949
+ },
950
+ {
951
+ "epoch": 1.559886787598096,
952
+ "grad_norm": 6.370878219604492,
953
+ "learning_rate": 2.4001886873365067e-05,
954
+ "loss": 2.6424,
955
+ "num_input_tokens_seen": 28928732,
956
+ "step": 48500
957
+ },
958
+ {
959
+ "epoch": 1.5759680946867363,
960
+ "grad_norm": 7.0892653465271,
961
+ "learning_rate": 2.37338650885544e-05,
962
+ "loss": 2.6626,
963
+ "num_input_tokens_seen": 29224436,
964
+ "step": 49000
965
+ },
966
+ {
967
+ "epoch": 1.5920494017753763,
968
+ "grad_norm": 6.357864856719971,
969
+ "learning_rate": 2.3465843303743727e-05,
970
+ "loss": 2.6546,
971
+ "num_input_tokens_seen": 29520148,
972
+ "step": 49500
973
+ },
974
+ {
975
+ "epoch": 1.6081307088640164,
976
+ "grad_norm": 8.4866943359375,
977
+ "learning_rate": 2.319782151893306e-05,
978
+ "loss": 2.661,
979
+ "num_input_tokens_seen": 29822156,
980
+ "step": 50000
981
+ },
982
+ {
983
+ "epoch": 1.6081307088640164,
984
+ "eval_loss": 2.527804374694824,
985
+ "eval_runtime": 2.6771,
986
+ "eval_samples_per_second": 933.855,
987
+ "eval_steps_per_second": 116.919,
988
+ "num_input_tokens_seen": 29822156,
989
+ "step": 50000
990
+ },
991
+ {
992
+ "epoch": 1.6242120159526565,
993
+ "grad_norm": 6.843733787536621,
994
+ "learning_rate": 2.292979973412239e-05,
995
+ "loss": 2.6521,
996
+ "num_input_tokens_seen": 30122052,
997
+ "step": 50500
998
+ },
999
+ {
1000
+ "epoch": 1.6402933230412968,
1001
+ "grad_norm": 6.88835334777832,
1002
+ "learning_rate": 2.2661777949311722e-05,
1003
+ "loss": 2.6614,
1004
+ "num_input_tokens_seen": 30422196,
1005
+ "step": 51000
1006
+ },
1007
+ {
1008
+ "epoch": 1.656374630129937,
1009
+ "grad_norm": 5.855214595794678,
1010
+ "learning_rate": 2.239375616450105e-05,
1011
+ "loss": 2.6454,
1012
+ "num_input_tokens_seen": 30722660,
1013
+ "step": 51500
1014
+ },
1015
+ {
1016
+ "epoch": 1.6724559372185772,
1017
+ "grad_norm": 6.58035135269165,
1018
+ "learning_rate": 2.2125734379690383e-05,
1019
+ "loss": 2.6524,
1020
+ "num_input_tokens_seen": 31018220,
1021
+ "step": 52000
1022
+ },
1023
+ {
1024
+ "epoch": 1.6885372443072173,
1025
+ "grad_norm": 6.767495155334473,
1026
+ "learning_rate": 2.185771259487971e-05,
1027
+ "loss": 2.6497,
1028
+ "num_input_tokens_seen": 31319476,
1029
+ "step": 52500
1030
+ },
1031
+ {
1032
+ "epoch": 1.6885372443072173,
1033
+ "eval_loss": 2.526638984680176,
1034
+ "eval_runtime": 2.5213,
1035
+ "eval_samples_per_second": 991.545,
1036
+ "eval_steps_per_second": 124.141,
1037
+ "num_input_tokens_seen": 31319476,
1038
+ "step": 52500
1039
+ },
1040
+ {
1041
+ "epoch": 1.7046185513958574,
1042
+ "grad_norm": 7.022729873657227,
1043
+ "learning_rate": 2.1589690810069043e-05,
1044
+ "loss": 2.6437,
1045
+ "num_input_tokens_seen": 31621308,
1046
+ "step": 53000
1047
+ },
1048
+ {
1049
+ "epoch": 1.7206998584844975,
1050
+ "grad_norm": 6.241069793701172,
1051
+ "learning_rate": 2.132166902525837e-05,
1052
+ "loss": 2.6447,
1053
+ "num_input_tokens_seen": 31917460,
1054
+ "step": 53500
1055
+ },
1056
+ {
1057
+ "epoch": 1.7367811655731378,
1058
+ "grad_norm": 7.7204084396362305,
1059
+ "learning_rate": 2.1053647240447703e-05,
1060
+ "loss": 2.6448,
1061
+ "num_input_tokens_seen": 32217596,
1062
+ "step": 54000
1063
+ },
1064
+ {
1065
+ "epoch": 1.752862472661778,
1066
+ "grad_norm": 6.703210830688477,
1067
+ "learning_rate": 2.0785625455637035e-05,
1068
+ "loss": 2.6366,
1069
+ "num_input_tokens_seen": 32513884,
1070
+ "step": 54500
1071
+ },
1072
+ {
1073
+ "epoch": 1.7689437797504182,
1074
+ "grad_norm": 6.371466159820557,
1075
+ "learning_rate": 2.0517603670826367e-05,
1076
+ "loss": 2.6281,
1077
+ "num_input_tokens_seen": 32813220,
1078
+ "step": 55000
1079
+ },
1080
+ {
1081
+ "epoch": 1.7689437797504182,
1082
+ "eval_loss": 2.5115973949432373,
1083
+ "eval_runtime": 2.5216,
1084
+ "eval_samples_per_second": 991.443,
1085
+ "eval_steps_per_second": 124.129,
1086
+ "num_input_tokens_seen": 32813220,
1087
+ "step": 55000
1088
+ },
1089
+ {
1090
+ "epoch": 1.7850250868390583,
1091
+ "grad_norm": 7.277946949005127,
1092
+ "learning_rate": 2.0249581886015695e-05,
1093
+ "loss": 2.6536,
1094
+ "num_input_tokens_seen": 33110188,
1095
+ "step": 55500
1096
+ },
1097
+ {
1098
+ "epoch": 1.8011063939276983,
1099
+ "grad_norm": 7.93104887008667,
1100
+ "learning_rate": 1.9981560101205027e-05,
1101
+ "loss": 2.5981,
1102
+ "num_input_tokens_seen": 33405980,
1103
+ "step": 56000
1104
+ },
1105
+ {
1106
+ "epoch": 1.8171877010163386,
1107
+ "grad_norm": 7.782486438751221,
1108
+ "learning_rate": 1.971353831639436e-05,
1109
+ "loss": 2.635,
1110
+ "num_input_tokens_seen": 33700596,
1111
+ "step": 56500
1112
+ },
1113
+ {
1114
+ "epoch": 1.8332690081049787,
1115
+ "grad_norm": 8.59358024597168,
1116
+ "learning_rate": 1.9445516531583687e-05,
1117
+ "loss": 2.6269,
1118
+ "num_input_tokens_seen": 33997724,
1119
+ "step": 57000
1120
+ },
1121
+ {
1122
+ "epoch": 1.849350315193619,
1123
+ "grad_norm": 6.669950485229492,
1124
+ "learning_rate": 1.917749474677302e-05,
1125
+ "loss": 2.6067,
1126
+ "num_input_tokens_seen": 34298052,
1127
+ "step": 57500
1128
+ },
1129
+ {
1130
+ "epoch": 1.849350315193619,
1131
+ "eval_loss": 2.5047078132629395,
1132
+ "eval_runtime": 2.5169,
1133
+ "eval_samples_per_second": 993.288,
1134
+ "eval_steps_per_second": 124.36,
1135
+ "num_input_tokens_seen": 34298052,
1136
+ "step": 57500
1137
+ },
1138
+ {
1139
+ "epoch": 1.8654316222822591,
1140
+ "grad_norm": 6.265903949737549,
1141
+ "learning_rate": 1.8909472961962348e-05,
1142
+ "loss": 2.5966,
1143
+ "num_input_tokens_seen": 34593980,
1144
+ "step": 58000
1145
+ },
1146
+ {
1147
+ "epoch": 1.8815129293708992,
1148
+ "grad_norm": 7.943974018096924,
1149
+ "learning_rate": 1.864145117715168e-05,
1150
+ "loss": 2.6303,
1151
+ "num_input_tokens_seen": 34894428,
1152
+ "step": 58500
1153
+ },
1154
+ {
1155
+ "epoch": 1.8975942364595393,
1156
+ "grad_norm": 8.290629386901855,
1157
+ "learning_rate": 1.837342939234101e-05,
1158
+ "loss": 2.6303,
1159
+ "num_input_tokens_seen": 35193236,
1160
+ "step": 59000
1161
+ },
1162
+ {
1163
+ "epoch": 1.9136755435481796,
1164
+ "grad_norm": 7.974947929382324,
1165
+ "learning_rate": 1.8105407607530343e-05,
1166
+ "loss": 2.6272,
1167
+ "num_input_tokens_seen": 35486796,
1168
+ "step": 59500
1169
+ },
1170
+ {
1171
+ "epoch": 1.92975685063682,
1172
+ "grad_norm": 5.827637195587158,
1173
+ "learning_rate": 1.783738582271967e-05,
1174
+ "loss": 2.6112,
1175
+ "num_input_tokens_seen": 35783604,
1176
+ "step": 60000
1177
+ },
1178
+ {
1179
+ "epoch": 1.92975685063682,
1180
+ "eval_loss": 2.4935405254364014,
1181
+ "eval_runtime": 2.5107,
1182
+ "eval_samples_per_second": 995.741,
1183
+ "eval_steps_per_second": 124.667,
1184
+ "num_input_tokens_seen": 35783604,
1185
+ "step": 60000
1186
+ },
1187
+ {
1188
+ "epoch": 1.94583815772546,
1189
+ "grad_norm": 6.535378456115723,
1190
+ "learning_rate": 1.7569364037909003e-05,
1191
+ "loss": 2.6135,
1192
+ "num_input_tokens_seen": 36086620,
1193
+ "step": 60500
1194
+ },
1195
+ {
1196
+ "epoch": 1.9619194648141,
1197
+ "grad_norm": 6.398725986480713,
1198
+ "learning_rate": 1.7301342253098332e-05,
1199
+ "loss": 2.602,
1200
+ "num_input_tokens_seen": 36386140,
1201
+ "step": 61000
1202
+ },
1203
+ {
1204
+ "epoch": 1.9780007719027402,
1205
+ "grad_norm": 6.332113265991211,
1206
+ "learning_rate": 1.7033320468287664e-05,
1207
+ "loss": 2.6258,
1208
+ "num_input_tokens_seen": 36684308,
1209
+ "step": 61500
1210
+ },
1211
+ {
1212
+ "epoch": 1.9940820789913805,
1213
+ "grad_norm": 7.8002753257751465,
1214
+ "learning_rate": 1.6765298683476992e-05,
1215
+ "loss": 2.6226,
1216
+ "num_input_tokens_seen": 36984724,
1217
+ "step": 62000
1218
+ },
1219
+ {
1220
+ "epoch": 2.0101633860800208,
1221
+ "grad_norm": 6.9457011222839355,
1222
+ "learning_rate": 1.6497276898666324e-05,
1223
+ "loss": 2.5207,
1224
+ "num_input_tokens_seen": 37281092,
1225
+ "step": 62500
1226
+ },
1227
+ {
1228
+ "epoch": 2.0101633860800208,
1229
+ "eval_loss": 2.4945950508117676,
1230
+ "eval_runtime": 2.5094,
1231
+ "eval_samples_per_second": 996.26,
1232
+ "eval_steps_per_second": 124.732,
1233
+ "num_input_tokens_seen": 37281092,
1234
+ "step": 62500
1235
+ },
1236
+ {
1237
+ "epoch": 2.026244693168661,
1238
+ "grad_norm": 7.541498184204102,
1239
+ "learning_rate": 1.6229255113855656e-05,
1240
+ "loss": 2.4728,
1241
+ "num_input_tokens_seen": 37582300,
1242
+ "step": 63000
1243
+ },
1244
+ {
1245
+ "epoch": 2.042326000257301,
1246
+ "grad_norm": 6.7798027992248535,
1247
+ "learning_rate": 1.5961233329044987e-05,
1248
+ "loss": 2.4539,
1249
+ "num_input_tokens_seen": 37880828,
1250
+ "step": 63500
1251
+ },
1252
+ {
1253
+ "epoch": 2.058407307345941,
1254
+ "grad_norm": 7.033351898193359,
1255
+ "learning_rate": 1.5693211544234316e-05,
1256
+ "loss": 2.4467,
1257
+ "num_input_tokens_seen": 38181276,
1258
+ "step": 64000
1259
+ },
1260
+ {
1261
+ "epoch": 2.074488614434581,
1262
+ "grad_norm": 6.487890720367432,
1263
+ "learning_rate": 1.5425189759423648e-05,
1264
+ "loss": 2.4764,
1265
+ "num_input_tokens_seen": 38473348,
1266
+ "step": 64500
1267
+ },
1268
+ {
1269
+ "epoch": 2.0905699215232216,
1270
+ "grad_norm": 6.955127716064453,
1271
+ "learning_rate": 1.5157167974612976e-05,
1272
+ "loss": 2.4799,
1273
+ "num_input_tokens_seen": 38768588,
1274
+ "step": 65000
1275
+ },
1276
+ {
1277
+ "epoch": 2.0905699215232216,
1278
+ "eval_loss": 2.491555690765381,
1279
+ "eval_runtime": 2.5076,
1280
+ "eval_samples_per_second": 996.967,
1281
+ "eval_steps_per_second": 124.82,
1282
+ "num_input_tokens_seen": 38768588,
1283
+ "step": 65000
1284
+ },
1285
+ {
1286
+ "epoch": 2.1066512286118617,
1287
+ "grad_norm": 6.78762674331665,
1288
+ "learning_rate": 1.4889146189802308e-05,
1289
+ "loss": 2.4726,
1290
+ "num_input_tokens_seen": 39067460,
1291
+ "step": 65500
1292
+ },
1293
+ {
1294
+ "epoch": 2.122732535700502,
1295
+ "grad_norm": 7.199331283569336,
1296
+ "learning_rate": 1.4621124404991638e-05,
1297
+ "loss": 2.4562,
1298
+ "num_input_tokens_seen": 39360244,
1299
+ "step": 66000
1300
+ },
1301
+ {
1302
+ "epoch": 2.138813842789142,
1303
+ "grad_norm": 7.353775501251221,
1304
+ "learning_rate": 1.435310262018097e-05,
1305
+ "loss": 2.4629,
1306
+ "num_input_tokens_seen": 39660020,
1307
+ "step": 66500
1308
+ },
1309
+ {
1310
+ "epoch": 2.154895149877782,
1311
+ "grad_norm": 6.827337265014648,
1312
+ "learning_rate": 1.4085080835370298e-05,
1313
+ "loss": 2.4817,
1314
+ "num_input_tokens_seen": 39960476,
1315
+ "step": 67000
1316
+ },
1317
+ {
1318
+ "epoch": 2.170976456966422,
1319
+ "grad_norm": 6.532020092010498,
1320
+ "learning_rate": 1.381705905055963e-05,
1321
+ "loss": 2.4727,
1322
+ "num_input_tokens_seen": 40252972,
1323
+ "step": 67500
1324
+ },
1325
+ {
1326
+ "epoch": 2.170976456966422,
1327
+ "eval_loss": 2.4865615367889404,
1328
+ "eval_runtime": 2.5372,
1329
+ "eval_samples_per_second": 985.335,
1330
+ "eval_steps_per_second": 123.364,
1331
+ "num_input_tokens_seen": 40252972,
1332
+ "step": 67500
1333
+ },
1334
+ {
1335
+ "epoch": 2.1870577640550626,
1336
+ "grad_norm": 6.601158142089844,
1337
+ "learning_rate": 1.354903726574896e-05,
1338
+ "loss": 2.4666,
1339
+ "num_input_tokens_seen": 40553732,
1340
+ "step": 68000
1341
+ },
1342
+ {
1343
+ "epoch": 2.2031390711437027,
1344
+ "grad_norm": 7.200645446777344,
1345
+ "learning_rate": 1.3281015480938292e-05,
1346
+ "loss": 2.4657,
1347
+ "num_input_tokens_seen": 40851196,
1348
+ "step": 68500
1349
+ },
1350
+ {
1351
+ "epoch": 2.2192203782323427,
1352
+ "grad_norm": 8.067240715026855,
1353
+ "learning_rate": 1.301299369612762e-05,
1354
+ "loss": 2.4801,
1355
+ "num_input_tokens_seen": 41149276,
1356
+ "step": 69000
1357
+ },
1358
+ {
1359
+ "epoch": 2.235301685320983,
1360
+ "grad_norm": 7.724194526672363,
1361
+ "learning_rate": 1.2744971911316952e-05,
1362
+ "loss": 2.4766,
1363
+ "num_input_tokens_seen": 41448540,
1364
+ "step": 69500
1365
+ },
1366
+ {
1367
+ "epoch": 2.251382992409623,
1368
+ "grad_norm": 6.999200344085693,
1369
+ "learning_rate": 1.2476950126506282e-05,
1370
+ "loss": 2.4719,
1371
+ "num_input_tokens_seen": 41746300,
1372
+ "step": 70000
1373
+ },
1374
+ {
1375
+ "epoch": 2.251382992409623,
1376
+ "eval_loss": 2.476020097732544,
1377
+ "eval_runtime": 2.5166,
1378
+ "eval_samples_per_second": 993.407,
1379
+ "eval_steps_per_second": 124.375,
1380
+ "num_input_tokens_seen": 41746300,
1381
+ "step": 70000
1382
+ },
1383
+ {
1384
+ "epoch": 2.267464299498263,
1385
+ "grad_norm": 6.666884899139404,
1386
+ "learning_rate": 1.2208928341695614e-05,
1387
+ "loss": 2.4771,
1388
+ "num_input_tokens_seen": 42042532,
1389
+ "step": 70500
1390
+ },
1391
+ {
1392
+ "epoch": 2.2835456065869035,
1393
+ "grad_norm": 8.354509353637695,
1394
+ "learning_rate": 1.1940906556884944e-05,
1395
+ "loss": 2.4679,
1396
+ "num_input_tokens_seen": 42341628,
1397
+ "step": 71000
1398
+ },
1399
+ {
1400
+ "epoch": 2.2996269136755436,
1401
+ "grad_norm": 8.39284610748291,
1402
+ "learning_rate": 1.1672884772074275e-05,
1403
+ "loss": 2.4597,
1404
+ "num_input_tokens_seen": 42642948,
1405
+ "step": 71500
1406
+ },
1407
+ {
1408
+ "epoch": 2.3157082207641837,
1409
+ "grad_norm": 7.233700275421143,
1410
+ "learning_rate": 1.1404862987263605e-05,
1411
+ "loss": 2.4592,
1412
+ "num_input_tokens_seen": 42941820,
1413
+ "step": 72000
1414
+ },
1415
+ {
1416
+ "epoch": 2.331789527852824,
1417
+ "grad_norm": 7.843503475189209,
1418
+ "learning_rate": 1.1136841202452935e-05,
1419
+ "loss": 2.4738,
1420
+ "num_input_tokens_seen": 43241188,
1421
+ "step": 72500
1422
+ },
1423
+ {
1424
+ "epoch": 2.331789527852824,
1425
+ "eval_loss": 2.47127103805542,
1426
+ "eval_runtime": 2.5277,
1427
+ "eval_samples_per_second": 989.05,
1428
+ "eval_steps_per_second": 123.829,
1429
+ "num_input_tokens_seen": 43241188,
1430
+ "step": 72500
1431
+ },
1432
+ {
1433
+ "epoch": 2.347870834941464,
1434
+ "grad_norm": 6.37482213973999,
1435
+ "learning_rate": 1.0868819417642267e-05,
1436
+ "loss": 2.4576,
1437
+ "num_input_tokens_seen": 43535900,
1438
+ "step": 73000
1439
+ },
1440
+ {
1441
+ "epoch": 2.3639521420301044,
1442
+ "grad_norm": 6.642532825469971,
1443
+ "learning_rate": 1.0600797632831597e-05,
1444
+ "loss": 2.467,
1445
+ "num_input_tokens_seen": 43833516,
1446
+ "step": 73500
1447
+ },
1448
+ {
1449
+ "epoch": 2.3800334491187445,
1450
+ "grad_norm": 6.606197357177734,
1451
+ "learning_rate": 1.0332775848020927e-05,
1452
+ "loss": 2.4752,
1453
+ "num_input_tokens_seen": 44134084,
1454
+ "step": 74000
1455
+ },
1456
+ {
1457
+ "epoch": 2.3961147562073846,
1458
+ "grad_norm": 6.338978290557861,
1459
+ "learning_rate": 1.0064754063210257e-05,
1460
+ "loss": 2.4473,
1461
+ "num_input_tokens_seen": 44432540,
1462
+ "step": 74500
1463
+ },
1464
+ {
1465
+ "epoch": 2.4121960632960247,
1466
+ "grad_norm": 7.172792434692383,
1467
+ "learning_rate": 9.796732278399589e-06,
1468
+ "loss": 2.4629,
1469
+ "num_input_tokens_seen": 44730244,
1470
+ "step": 75000
1471
+ },
1472
+ {
1473
+ "epoch": 2.4121960632960247,
1474
+ "eval_loss": 2.4629955291748047,
1475
+ "eval_runtime": 2.521,
1476
+ "eval_samples_per_second": 991.68,
1477
+ "eval_steps_per_second": 124.158,
1478
+ "num_input_tokens_seen": 44730244,
1479
+ "step": 75000
1480
+ },
1481
+ {
1482
+ "epoch": 2.4282773703846647,
1483
+ "grad_norm": 6.65930700302124,
1484
+ "learning_rate": 9.528710493588919e-06,
1485
+ "loss": 2.4512,
1486
+ "num_input_tokens_seen": 45031884,
1487
+ "step": 75500
1488
+ },
1489
+ {
1490
+ "epoch": 2.4443586774733053,
1491
+ "grad_norm": 7.209745407104492,
1492
+ "learning_rate": 9.260688708778249e-06,
1493
+ "loss": 2.4557,
1494
+ "num_input_tokens_seen": 45334924,
1495
+ "step": 76000
1496
+ },
1497
+ {
1498
+ "epoch": 2.4604399845619453,
1499
+ "grad_norm": 6.847073078155518,
1500
+ "learning_rate": 8.992666923967581e-06,
1501
+ "loss": 2.4512,
1502
+ "num_input_tokens_seen": 45633532,
1503
+ "step": 76500
1504
+ },
1505
+ {
1506
+ "epoch": 2.4765212916505854,
1507
+ "grad_norm": 7.705162525177002,
1508
+ "learning_rate": 8.724645139156911e-06,
1509
+ "loss": 2.4568,
1510
+ "num_input_tokens_seen": 45933804,
1511
+ "step": 77000
1512
+ },
1513
+ {
1514
+ "epoch": 2.4926025987392255,
1515
+ "grad_norm": 7.5681962966918945,
1516
+ "learning_rate": 8.456623354346243e-06,
1517
+ "loss": 2.4524,
1518
+ "num_input_tokens_seen": 46231060,
1519
+ "step": 77500
1520
+ },
1521
+ {
1522
+ "epoch": 2.4926025987392255,
1523
+ "eval_loss": 2.457481861114502,
1524
+ "eval_runtime": 2.5921,
1525
+ "eval_samples_per_second": 964.467,
1526
+ "eval_steps_per_second": 120.751,
1527
+ "num_input_tokens_seen": 46231060,
1528
+ "step": 77500
1529
+ },
1530
+ {
1531
+ "epoch": 2.5086839058278656,
1532
+ "grad_norm": 6.8857269287109375,
1533
+ "learning_rate": 8.188601569535573e-06,
1534
+ "loss": 2.4622,
1535
+ "num_input_tokens_seen": 46525772,
1536
+ "step": 78000
1537
+ },
1538
+ {
1539
+ "epoch": 2.524765212916506,
1540
+ "grad_norm": 6.347681522369385,
1541
+ "learning_rate": 7.920579784724903e-06,
1542
+ "loss": 2.4528,
1543
+ "num_input_tokens_seen": 46822532,
1544
+ "step": 78500
1545
+ },
1546
+ {
1547
+ "epoch": 2.5408465200051458,
1548
+ "grad_norm": 6.935575008392334,
1549
+ "learning_rate": 7.652557999914233e-06,
1550
+ "loss": 2.4414,
1551
+ "num_input_tokens_seen": 47122964,
1552
+ "step": 79000
1553
+ },
1554
+ {
1555
+ "epoch": 2.5569278270937863,
1556
+ "grad_norm": 6.603360652923584,
1557
+ "learning_rate": 7.384536215103564e-06,
1558
+ "loss": 2.4655,
1559
+ "num_input_tokens_seen": 47423300,
1560
+ "step": 79500
1561
+ },
1562
+ {
1563
+ "epoch": 2.5730091341824264,
1564
+ "grad_norm": 7.182071208953857,
1565
+ "learning_rate": 7.116514430292895e-06,
1566
+ "loss": 2.435,
1567
+ "num_input_tokens_seen": 47718964,
1568
+ "step": 80000
1569
+ },
1570
+ {
1571
+ "epoch": 2.5730091341824264,
1572
+ "eval_loss": 2.455320358276367,
1573
+ "eval_runtime": 2.5065,
1574
+ "eval_samples_per_second": 997.4,
1575
+ "eval_steps_per_second": 124.874,
1576
+ "num_input_tokens_seen": 47718964,
1577
+ "step": 80000
1578
+ },
1579
+ {
1580
+ "epoch": 2.5890904412710665,
1581
+ "grad_norm": 7.3647260665893555,
1582
+ "learning_rate": 6.848492645482225e-06,
1583
+ "loss": 2.4356,
1584
+ "num_input_tokens_seen": 48015996,
1585
+ "step": 80500
1586
+ },
1587
+ {
1588
+ "epoch": 2.6051717483597066,
1589
+ "grad_norm": 7.950341701507568,
1590
+ "learning_rate": 6.580470860671556e-06,
1591
+ "loss": 2.4453,
1592
+ "num_input_tokens_seen": 48316420,
1593
+ "step": 81000
1594
+ },
1595
+ {
1596
+ "epoch": 2.6212530554483466,
1597
+ "grad_norm": 6.016787052154541,
1598
+ "learning_rate": 6.312449075860886e-06,
1599
+ "loss": 2.45,
1600
+ "num_input_tokens_seen": 48611452,
1601
+ "step": 81500
1602
+ },
1603
+ {
1604
+ "epoch": 2.637334362536987,
1605
+ "grad_norm": 7.281980514526367,
1606
+ "learning_rate": 6.044427291050217e-06,
1607
+ "loss": 2.4687,
1608
+ "num_input_tokens_seen": 48913668,
1609
+ "step": 82000
1610
+ },
1611
+ {
1612
+ "epoch": 2.6534156696256272,
1613
+ "grad_norm": 6.644787311553955,
1614
+ "learning_rate": 5.776405506239547e-06,
1615
+ "loss": 2.4621,
1616
+ "num_input_tokens_seen": 49209724,
1617
+ "step": 82500
1618
+ },
1619
+ {
1620
+ "epoch": 2.6534156696256272,
1621
+ "eval_loss": 2.4475488662719727,
1622
+ "eval_runtime": 2.517,
1623
+ "eval_samples_per_second": 993.239,
1624
+ "eval_steps_per_second": 124.354,
1625
+ "num_input_tokens_seen": 49209724,
1626
+ "step": 82500
1627
+ },
1628
+ {
1629
+ "epoch": 2.6694969767142673,
1630
+ "grad_norm": 6.181220054626465,
1631
+ "learning_rate": 5.508383721428878e-06,
1632
+ "loss": 2.4343,
1633
+ "num_input_tokens_seen": 49505772,
1634
+ "step": 83000
1635
+ },
1636
+ {
1637
+ "epoch": 2.6855782838029074,
1638
+ "grad_norm": 6.418393135070801,
1639
+ "learning_rate": 5.2403619366182085e-06,
1640
+ "loss": 2.4329,
1641
+ "num_input_tokens_seen": 49809956,
1642
+ "step": 83500
1643
+ },
1644
+ {
1645
+ "epoch": 2.7016595908915475,
1646
+ "grad_norm": 6.279716491699219,
1647
+ "learning_rate": 4.9723401518075395e-06,
1648
+ "loss": 2.4481,
1649
+ "num_input_tokens_seen": 50112060,
1650
+ "step": 84000
1651
+ },
1652
+ {
1653
+ "epoch": 2.717740897980188,
1654
+ "grad_norm": 6.502873420715332,
1655
+ "learning_rate": 4.70431836699687e-06,
1656
+ "loss": 2.4464,
1657
+ "num_input_tokens_seen": 50414356,
1658
+ "step": 84500
1659
+ },
1660
+ {
1661
+ "epoch": 2.733822205068828,
1662
+ "grad_norm": 6.15990686416626,
1663
+ "learning_rate": 4.4362965821862e-06,
1664
+ "loss": 2.4492,
1665
+ "num_input_tokens_seen": 50712980,
1666
+ "step": 85000
1667
+ },
1668
+ {
1669
+ "epoch": 2.733822205068828,
1670
+ "eval_loss": 2.4440150260925293,
1671
+ "eval_runtime": 2.529,
1672
+ "eval_samples_per_second": 988.544,
1673
+ "eval_steps_per_second": 123.766,
1674
+ "num_input_tokens_seen": 50712980,
1675
+ "step": 85000
1676
+ },
1677
+ {
1678
+ "epoch": 2.749903512157468,
1679
+ "grad_norm": 6.876352310180664,
1680
+ "learning_rate": 4.168274797375531e-06,
1681
+ "loss": 2.4514,
1682
+ "num_input_tokens_seen": 51012460,
1683
+ "step": 85500
1684
+ },
1685
+ {
1686
+ "epoch": 2.7659848192461083,
1687
+ "grad_norm": 7.305426597595215,
1688
+ "learning_rate": 3.900253012564861e-06,
1689
+ "loss": 2.4317,
1690
+ "num_input_tokens_seen": 51308524,
1691
+ "step": 86000
1692
+ },
1693
+ {
1694
+ "epoch": 2.7820661263347484,
1695
+ "grad_norm": 6.460892677307129,
1696
+ "learning_rate": 3.632231227754192e-06,
1697
+ "loss": 2.4559,
1698
+ "num_input_tokens_seen": 51610700,
1699
+ "step": 86500
1700
+ },
1701
+ {
1702
+ "epoch": 2.798147433423389,
1703
+ "grad_norm": 8.062651634216309,
1704
+ "learning_rate": 3.3642094429435228e-06,
1705
+ "loss": 2.4535,
1706
+ "num_input_tokens_seen": 51910236,
1707
+ "step": 87000
1708
+ },
1709
+ {
1710
+ "epoch": 2.814228740512029,
1711
+ "grad_norm": 7.140311241149902,
1712
+ "learning_rate": 3.0961876581328533e-06,
1713
+ "loss": 2.4536,
1714
+ "num_input_tokens_seen": 52204380,
1715
+ "step": 87500
1716
+ },
1717
+ {
1718
+ "epoch": 2.814228740512029,
1719
+ "eval_loss": 2.4393906593322754,
1720
+ "eval_runtime": 2.5312,
1721
+ "eval_samples_per_second": 987.685,
1722
+ "eval_steps_per_second": 123.658,
1723
+ "num_input_tokens_seen": 52204380,
1724
+ "step": 87500
1725
+ },
1726
+ {
1727
+ "epoch": 2.830310047600669,
1728
+ "grad_norm": 6.569787502288818,
1729
+ "learning_rate": 2.8281658733221834e-06,
1730
+ "loss": 2.4379,
1731
+ "num_input_tokens_seen": 52504668,
1732
+ "step": 88000
1733
+ },
1734
+ {
1735
+ "epoch": 2.846391354689309,
1736
+ "grad_norm": 7.735711097717285,
1737
+ "learning_rate": 2.560144088511514e-06,
1738
+ "loss": 2.4239,
1739
+ "num_input_tokens_seen": 52798740,
1740
+ "step": 88500
1741
+ },
1742
+ {
1743
+ "epoch": 2.8624726617779492,
1744
+ "grad_norm": 7.504124641418457,
1745
+ "learning_rate": 2.292122303700845e-06,
1746
+ "loss": 2.4427,
1747
+ "num_input_tokens_seen": 53097716,
1748
+ "step": 89000
1749
+ },
1750
+ {
1751
+ "epoch": 2.8785539688665893,
1752
+ "grad_norm": 6.647756099700928,
1753
+ "learning_rate": 2.0241005188901755e-06,
1754
+ "loss": 2.4682,
1755
+ "num_input_tokens_seen": 53397564,
1756
+ "step": 89500
1757
+ },
1758
+ {
1759
+ "epoch": 2.8946352759552294,
1760
+ "grad_norm": 6.640815734863281,
1761
+ "learning_rate": 1.756078734079506e-06,
1762
+ "loss": 2.4148,
1763
+ "num_input_tokens_seen": 53695620,
1764
+ "step": 90000
1765
+ },
1766
+ {
1767
+ "epoch": 2.8946352759552294,
1768
+ "eval_loss": 2.43597674369812,
1769
+ "eval_runtime": 2.5128,
1770
+ "eval_samples_per_second": 994.891,
1771
+ "eval_steps_per_second": 124.56,
1772
+ "num_input_tokens_seen": 53695620,
1773
+ "step": 90000
1774
+ },
1775
+ {
1776
+ "epoch": 2.91071658304387,
1777
+ "grad_norm": 7.346447467803955,
1778
+ "learning_rate": 1.4880569492688366e-06,
1779
+ "loss": 2.4352,
1780
+ "num_input_tokens_seen": 53991180,
1781
+ "step": 90500
1782
+ },
1783
+ {
1784
+ "epoch": 2.92679789013251,
1785
+ "grad_norm": 6.777767658233643,
1786
+ "learning_rate": 1.2200351644581672e-06,
1787
+ "loss": 2.4664,
1788
+ "num_input_tokens_seen": 54288348,
1789
+ "step": 91000
1790
+ },
1791
+ {
1792
+ "epoch": 2.94287919722115,
1793
+ "grad_norm": 6.908254623413086,
1794
+ "learning_rate": 9.520133796474978e-07,
1795
+ "loss": 2.4474,
1796
+ "num_input_tokens_seen": 54590740,
1797
+ "step": 91500
1798
+ },
1799
+ {
1800
+ "epoch": 2.95896050430979,
1801
+ "grad_norm": 7.04544734954834,
1802
+ "learning_rate": 6.839915948368284e-07,
1803
+ "loss": 2.4554,
1804
+ "num_input_tokens_seen": 54889220,
1805
+ "step": 92000
1806
+ },
1807
+ {
1808
+ "epoch": 2.9750418113984303,
1809
+ "grad_norm": 9.98161792755127,
1810
+ "learning_rate": 4.159698100261589e-07,
1811
+ "loss": 2.4243,
1812
+ "num_input_tokens_seen": 55190020,
1813
+ "step": 92500
1814
+ },
1815
+ {
1816
+ "epoch": 2.9750418113984303,
1817
+ "eval_loss": 2.435030460357666,
1818
+ "eval_runtime": 2.5128,
1819
+ "eval_samples_per_second": 994.908,
1820
+ "eval_steps_per_second": 124.563,
1821
+ "num_input_tokens_seen": 55190020,
1822
+ "step": 92500
1823
+ },
1824
+ {
1825
+ "epoch": 2.991123118487071,
1826
+ "grad_norm": 6.586206912994385,
1827
+ "learning_rate": 1.479480252154895e-07,
1828
+ "loss": 2.44,
1829
+ "num_input_tokens_seen": 55490868,
1830
+ "step": 93000
1831
+ },
1832
+ {
1833
+ "epoch": 3.0,
1834
+ "num_input_tokens_seen": 55653732,
1835
+ "step": 93276,
1836
+ "total_flos": 1.4738832163602432e+16,
1837
+ "train_loss": 1.7073542784139526,
1838
+ "train_runtime": 2504.3889,
1839
+ "train_samples_per_second": 297.955,
1840
+ "train_steps_per_second": 37.245,
1841
+ "train_tokens_per_second": 22216.164
1842
+ }
1843
+ ],
1844
+ "logging_steps": 500,
1845
+ "max_steps": 93276,
1846
+ "num_input_tokens_seen": 55653732,
1847
+ "num_train_epochs": 3,
1848
+ "save_steps": 2500,
1849
+ "stateful_callbacks": {
1850
+ "TrainerControl": {
1851
+ "args": {
1852
+ "should_epoch_stop": false,
1853
+ "should_evaluate": false,
1854
+ "should_log": false,
1855
+ "should_save": true,
1856
+ "should_training_stop": true
1857
+ },
1858
+ "attributes": {}
1859
+ }
1860
+ },
1861
+ "total_flos": 1.4738832163602432e+16,
1862
+ "train_batch_size": 8,
1863
+ "trial_name": null,
1864
+ "trial_params": null
1865
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b6b0e3aff2a6ecffae867eef3409d7416d0e3a25aa9423ea38026d8d618f479
3
+ size 5432
vocab.json ADDED
The diff for this file is too large to render. See raw diff