Training in progress, step 66, checkpoint
- last-checkpoint/adapter_config.json +3 -3
- last-checkpoint/adapter_model.safetensors +1 -1
- last-checkpoint/global_step66/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1 -1
- last-checkpoint/global_step66/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1 -1
- last-checkpoint/global_step66/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +1 -1
- last-checkpoint/global_step66/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +1 -1
- last-checkpoint/global_step66/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1 -1
- last-checkpoint/global_step66/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1 -1
- last-checkpoint/global_step66/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +1 -1
- last-checkpoint/global_step66/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1 -1
- last-checkpoint/global_step66/zero_pp_rank_0_mp_rank_00_model_states.pt +1 -1
- last-checkpoint/global_step66/zero_pp_rank_1_mp_rank_00_model_states.pt +1 -1
- last-checkpoint/global_step66/zero_pp_rank_2_mp_rank_00_model_states.pt +1 -1
- last-checkpoint/global_step66/zero_pp_rank_3_mp_rank_00_model_states.pt +1 -1
- last-checkpoint/global_step66/zero_pp_rank_4_mp_rank_00_model_states.pt +1 -1
- last-checkpoint/global_step66/zero_pp_rank_5_mp_rank_00_model_states.pt +1 -1
- last-checkpoint/global_step66/zero_pp_rank_6_mp_rank_00_model_states.pt +1 -1
- last-checkpoint/global_step66/zero_pp_rank_7_mp_rank_00_model_states.pt +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/rng_state_4.pth +1 -1
- last-checkpoint/rng_state_5.pth +1 -1
- last-checkpoint/rng_state_6.pth +1 -1
- last-checkpoint/rng_state_7.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +203 -203
- last-checkpoint/training_args.bin +1 -1
last-checkpoint/adapter_config.json
CHANGED
@@ -20,13 +20,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "up_proj",
     "down_proj",
-    "q_proj",
     "k_proj",
     "gate_proj",
     "o_proj",
-    "v_proj"
+    "v_proj",
+    "up_proj",
+    "q_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
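The reordering of target_modules above is cosmetic: the same seven projection matrices are adapted, PEFT simply serialized the set in a different order when this checkpoint was written. For reference, a minimal sketch (Python, using the peft library) of how a config with these target modules is typically built; the rank, alpha, and dropout values below are assumptions, since the diff does not show them:

from peft import LoraConfig

# r / lora_alpha / lora_dropout are hypothetical values; only target_modules,
# task_type, and use_dora appear in the diff above.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["down_proj", "k_proj", "gate_proj", "o_proj", "v_proj", "up_proj", "q_proj"],
    use_dora=False,
)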
last-checkpoint/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:651204470d974333e74132ca634e50c46cab4f71d2b3bef1ed0dec3eb6aba04d
 size 763470136
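Each binary file in this commit is tracked with Git LFS, so the diff shows only the three-line pointer (spec version, SHA-256 object id, size in bytes) rather than the weights themselves; the sizes stay constant across the update, only the hashes change. A small sketch, assuming the actual file has been fetched with git lfs pull, of verifying a local copy against the oid in the pointer above:

import hashlib
from pathlib import Path

def sha256_of(path, chunk_size=1 << 20):
    # Stream the file and return its hex SHA-256 digest.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()

# oid copied from the pointer file shown above
expected = "651204470d974333e74132ca634e50c46cab4f71d2b3bef1ed0dec3eb6aba04d"
local = Path("last-checkpoint/adapter_model.safetensors")
assert sha256_of(local) == expected, "adapter_model.safetensors does not match its LFS pointer"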
last-checkpoint/global_step66/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:856881d6095b0839d3bd7514110d2cdcdc0559f6fc8cb267bc5141b3bb8fb130
 size 289064656
last-checkpoint/global_step66/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:e3ec5485157f503118c0e48f554d1f5520735c3097bc76d41a8443b455963ffb
 size 289064656
last-checkpoint/global_step66/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:5c5675f6c03826d269461d7b487eeb925e2dd98d3705817b40437ec232b0b7a1
 size 289064656
last-checkpoint/global_step66/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:f588dd44a055da72c47094f640c1b63e398913c5459f8a5ee48af60cb02399e2
 size 289064656
last-checkpoint/global_step66/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:24c8a76833729b7e642158d26cd8ecc63eb5c89c0149c1072a38619d99b3ad10
 size 289064656
last-checkpoint/global_step66/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:53431f389a44260329ba5449b3c3cdd854a928d7cffcabb54e25b959e1ed251e
 size 289064656
last-checkpoint/global_step66/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:6ad56bb2d798741728bf4f2e6df097e85c333f37686dd375d9f8ef96f29a457d
 size 289064656
last-checkpoint/global_step66/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:9cde8be6f46a02f5fa7d6a09a30b5cc4236445fcb495af50fd87624b38a4d7b4
 size 289064656
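The eight bf16_zero_pp_rank_*_optim_states.pt shards above (and the eight zero_pp_rank_*_model_states.pt shards that follow) are what DeepSpeed ZeRO writes when the run is sharded across 8 ranks; no single shard is usable on its own. A sketch, assuming DeepSpeed is installed and using its standard zero_to_fp32 helper, of consolidating the shards into one full-precision state dict:

# get_fp32_state_dict_from_zero_checkpoint merges the per-rank ZeRO shards
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

# "last-checkpoint" is the checkpoint directory in this commit; the tag matches
# the global_step66 shard folder.
state_dict = get_fp32_state_dict_from_zero_checkpoint("last-checkpoint", tag="global_step66")
print(f"consolidated {len(state_dict)} tensors")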
last-checkpoint/global_step66/zero_pp_rank_0_mp_rank_00_model_states.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:b30b4af07705758f540ab408f607841c632d191549b988a598bf975c09b0e7cb
 size 348711830
last-checkpoint/global_step66/zero_pp_rank_1_mp_rank_00_model_states.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:2e540c5d8fb1796d999684280ae93deb6127870750006bf833d2521eb66fd4a4
 size 348711830
last-checkpoint/global_step66/zero_pp_rank_2_mp_rank_00_model_states.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:c4a10c1d91e1d751287fe76ac8e3146648d57be5da2137b55a4ab42cfb058cbb
 size 348711830
last-checkpoint/global_step66/zero_pp_rank_3_mp_rank_00_model_states.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:b73ff2f17ef507ddb68bf70c6029a92d54306396de7264db85dc76e11e5547f0
 size 348711830
last-checkpoint/global_step66/zero_pp_rank_4_mp_rank_00_model_states.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:76f6a128d4879301eda5788ac3c67d3842710cad6a60e1bcf72f5ad00638ae73
 size 348711830
last-checkpoint/global_step66/zero_pp_rank_5_mp_rank_00_model_states.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:ea7fa0a2d9924154f3707dd356f769bb1bc7c5e4722445670e7e904b08704000
 size 348711830
last-checkpoint/global_step66/zero_pp_rank_6_mp_rank_00_model_states.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:af970f914ad002cb5356956219f0cc30a0646717400a1ee56afa250626413828
 size 348711830
last-checkpoint/global_step66/zero_pp_rank_7_mp_rank_00_model_states.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:219f44c121df9d7f087bf509c1e60e443f8a17c8d25503e8e7f545185d1841e7
 size 348711830
last-checkpoint/rng_state_0.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:6f92646c5a2fa7121ebc27f21b41b150cf9055bfe20103354daf6932bc493b7c
 size 15920
last-checkpoint/rng_state_1.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:13e4bba7e58c6a0dcffa575bfc4f4a34515b83e8aaa1510b610980ff57eb0cce
 size 15920
last-checkpoint/rng_state_2.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:e7b17e78ab4ecfdfe23d8a98330499e2780c1777f23428c0e1ae30c2c65dead5
 size 15920
last-checkpoint/rng_state_3.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:3cabf58e5a6c7207c39a270554cbc1a122d69f2acb3a6524c23884ae131c30b4
 size 15920
last-checkpoint/rng_state_4.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:e7a2488f153c409c51dffbb327bcdf64f4d17302a658281b1f239ac084c1c80e
 size 15920
last-checkpoint/rng_state_5.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:1eae6522e11d0a1769e0fd4347da083b199a859689a217969c16dbc22713e5ec
 size 15920
last-checkpoint/rng_state_6.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:d61936499fcf57780309d38cc47b82c866dcadab1dba74812948364a7175461a
 size 15920
last-checkpoint/rng_state_7.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:5d80b25072a00168e3d05af00108eeff37fe60f0755470a62f6ae6672dc4ad8e
 size 15920
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:3e9ce871037e8d378408842390a351a4bb8856d71d37389bd1055187b26f84d4
 size 1064
last-checkpoint/trainer_state.json
CHANGED
@@ -10,480 +10,480 @@
   "log_history": [
     {
       "epoch": 0.003067484662576687,
-      "grad_norm": 0.
-      "learning_rate":
+      "grad_norm": 0.9516617278813834,
+      "learning_rate": 1.25e-05,
       "loss": 1.9557,
       "step": 1
     },
     {
       "epoch": 0.003067484662576687,
       "eval_loss": 2.6437082290649414,
-      "eval_runtime": 55.
-      "eval_samples_per_second": 1.
-      "eval_steps_per_second": 0.
+      "eval_runtime": 55.5495,
+      "eval_samples_per_second": 1.8,
+      "eval_steps_per_second": 0.126,
       "step": 1
     },
     {
       "epoch": 0.006134969325153374,
-      "grad_norm": 0.
-      "learning_rate": 5e-
+      "grad_norm": 0.515521728634264,
+      "learning_rate": 2.5e-05,
       "loss": 1.9268,
       "step": 2
     },
     {
       "epoch": 0.009202453987730062,
-      "grad_norm":
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 1.0602168628533477,
+      "learning_rate": 3.75e-05,
+      "loss": 1.9644,
       "step": 3
     },
     {
       "epoch": 0.012269938650306749,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.5232804296238467,
+      "learning_rate": 5e-05,
+      "loss": 1.9174,
       "step": 4
     },
     {
       "epoch": 0.015337423312883436,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.6049728735982117,
+      "learning_rate": 6.25e-05,
+      "loss": 1.9183,
       "step": 5
     },
     {
       "epoch": 0.018404907975460124,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.44617735370287787,
+      "learning_rate": 7.5e-05,
+      "loss": 1.9016,
       "step": 6
     },
     {
       "epoch": 0.02147239263803681,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.5041842596415366,
+      "learning_rate": 8.75e-05,
+      "loss": 1.9706,
       "step": 7
     },
     {
       "epoch": 0.024539877300613498,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.5697227180606876,
+      "learning_rate": 0.0001,
+      "loss": 1.9105,
       "step": 8
     },
     {
       "epoch": 0.027607361963190184,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.3797683389810269,
+      "learning_rate": 0.00011250000000000001,
+      "loss": 1.9351,
       "step": 9
     },
     {
       "epoch": 0.03067484662576687,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.3464113535012369,
+      "learning_rate": 0.000125,
+      "loss": 1.9347,
       "step": 10
     },
     {
       "epoch": 0.03374233128834356,
-      "grad_norm":
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 1.038453745480312,
+      "learning_rate": 0.0001375,
+      "loss": 1.9008,
       "step": 11
     },
     {
       "epoch": 0.03680981595092025,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.5222824963828644,
+      "learning_rate": 0.00015,
+      "loss": 1.9251,
       "step": 12
     },
     {
       "epoch": 0.03987730061349693,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.5129473208257509,
+      "learning_rate": 0.00016250000000000002,
+      "loss": 1.8613,
       "step": 13
     },
     {
       "epoch": 0.04294478527607362,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.7292233670769845,
+      "learning_rate": 0.000175,
+      "loss": 1.9507,
       "step": 14
     },
     {
       "epoch": 0.046012269938650305,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.6360368446619434,
+      "learning_rate": 0.0001875,
+      "loss": 1.9512,
       "step": 15
     },
     {
       "epoch": 0.049079754601226995,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.48214017101050627,
+      "learning_rate": 0.0002,
+      "loss": 1.961,
       "step": 16
     },
     {
       "epoch": 0.05214723926380368,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.4394229337647846,
+      "learning_rate": 0.0002125,
+      "loss": 1.9704,
       "step": 17
     },
     {
       "epoch": 0.05521472392638037,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.3796994442046945,
+      "learning_rate": 0.00022500000000000002,
+      "loss": 1.8925,
       "step": 18
     },
     {
       "epoch": 0.05828220858895705,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.3188673935343497,
+      "learning_rate": 0.0002375,
+      "loss": 1.969,
       "step": 19
     },
     {
       "epoch": 0.06134969325153374,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.9883905241335006,
+      "learning_rate": 0.00025,
+      "loss": 1.9734,
       "step": 20
     },
     {
       "epoch": 0.06441717791411043,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.42956410678121015,
+      "learning_rate": 0.000249994071079807,
+      "loss": 1.9632,
       "step": 21
     },
     {
       "epoch": 0.06748466257668712,
-      "grad_norm":
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.5580696830715027,
+      "learning_rate": 0.00024997628494415405,
+      "loss": 1.8911,
       "step": 22
     },
     {
       "epoch": 0.0705521472392638,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.4247455273508192,
+      "learning_rate": 0.00024994664346775366,
+      "loss": 1.9549,
       "step": 23
     },
     {
       "epoch": 0.0736196319018405,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.5638089571797716,
+      "learning_rate": 0.0002499051497749072,
+      "loss": 1.8903,
       "step": 24
     },
     {
       "epoch": 0.07668711656441718,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.3337856270380794,
+      "learning_rate": 0.00024985180823917534,
+      "loss": 1.9817,
       "step": 25
     },
     {
       "epoch": 0.07975460122699386,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 2.
+      "grad_norm": 0.5964071002925826,
+      "learning_rate": 0.00024978662448291747,
+      "loss": 2.0113,
       "step": 26
     },
     {
       "epoch": 0.08282208588957055,
-      "grad_norm":
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 1.3108802906417165,
+      "learning_rate": 0.0002497096053766986,
+      "loss": 1.9136,
       "step": 27
     },
     {
       "epoch": 0.08588957055214724,
-      "grad_norm":
-      "learning_rate":
-      "loss":
+      "grad_norm": 1.9953168113527813,
+      "learning_rate": 0.0002496207590385656,
+      "loss": 2.0042,
       "step": 28
     },
     {
       "epoch": 0.08895705521472393,
-      "grad_norm":
-      "learning_rate":
-      "loss":
+      "grad_norm": 4.7288777305801615,
+      "learning_rate": 0.00024952009483319136,
+      "loss": 2.0138,
       "step": 29
     },
     {
       "epoch": 0.09202453987730061,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss":
+      "grad_norm": 0.9466987524434748,
+      "learning_rate": 0.0002494076233708877,
+      "loss": 2.027,
       "step": 30
     },
     {
       "epoch": 0.0950920245398773,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.5335701207102423,
+      "learning_rate": 0.000249283356506487,
+      "loss": 1.9497,
       "step": 31
     },
     {
       "epoch": 0.09815950920245399,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.4530090911036831,
+      "learning_rate": 0.0002491473073380928,
+      "loss": 1.8991,
       "step": 32
     },
     {
       "epoch": 0.10122699386503067,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 2.
+      "grad_norm": 0.43545874771481075,
+      "learning_rate": 0.000248999490205699,
+      "loss": 2.0384,
       "step": 33
     },
     {
       "epoch": 0.10429447852760736,
-      "grad_norm":
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 3.1727083160562874,
+      "learning_rate": 0.00024883992068967873,
+      "loss": 1.9743,
       "step": 34
     },
     {
       "epoch": 0.10736196319018405,
-      "grad_norm":
-      "learning_rate":
-      "loss":
+      "grad_norm": 5.481030996815809,
+      "learning_rate": 0.0002486686156091417,
+      "loss": 2.0054,
       "step": 35
     },
     {
       "epoch": 0.11042944785276074,
-      "grad_norm":
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 9.756683051815624,
+      "learning_rate": 0.0002484855930201617,
+      "loss": 1.9805,
       "step": 36
     },
     {
       "epoch": 0.11349693251533742,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.5694221348977583,
+      "learning_rate": 0.0002482908722138734,
+      "loss": 1.9495,
       "step": 37
     },
     {
       "epoch": 0.1165644171779141,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss":
+      "grad_norm": 0.4781718005749317,
+      "learning_rate": 0.00024808447371443896,
+      "loss": 2.0154,
       "step": 38
     },
     {
       "epoch": 0.1196319018404908,
-      "grad_norm":
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 2.070517297643313,
+      "learning_rate": 0.00024786641927688466,
+      "loss": 1.9294,
       "step": 39
     },
     {
       "epoch": 0.12269938650306748,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.4269552409103539,
+      "learning_rate": 0.000247636731884808,
+      "loss": 1.9768,
       "step": 40
     },
     {
       "epoch": 0.12576687116564417,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.39633691656297887,
+      "learning_rate": 0.0002473954357479551,
+      "loss": 1.9978,
       "step": 41
     },
     {
       "epoch": 0.12883435582822086,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.5628682021190763,
+      "learning_rate": 0.0002471425562996688,
+      "loss": 1.9877,
       "step": 42
     },
     {
       "epoch": 0.13190184049079753,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.4235604267637786,
+      "learning_rate": 0.00024687812019420806,
+      "loss": 1.9601,
       "step": 43
     },
     {
       "epoch": 0.13496932515337423,
-      "grad_norm":
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 1.955262598542115,
+      "learning_rate": 0.0002466021553039386,
+      "loss": 1.9665,
       "step": 44
     },
     {
       "epoch": 0.13803680981595093,
-      "grad_norm":
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.5343967332691423,
+      "learning_rate": 0.0002463146907163947,
+      "loss": 1.9132,
       "step": 45
     },
     {
       "epoch": 0.1411042944785276,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.35886735373161066,
+      "learning_rate": 0.0002460157567312137,
+      "loss": 1.9353,
       "step": 46
     },
     {
       "epoch": 0.1441717791411043,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.49035062436723287,
+      "learning_rate": 0.00024570538485694214,
+      "loss": 1.9721,
       "step": 47
     },
     {
       "epoch": 0.147239263803681,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.3404214165006091,
+      "learning_rate": 0.00024538360780771465,
+      "loss": 1.9382,
       "step": 48
     },
     {
       "epoch": 0.15030674846625766,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.5345047082277987,
+      "learning_rate": 0.00024505045949980574,
+      "loss": 1.9566,
       "step": 49
     },
     {
       "epoch": 0.15337423312883436,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.33138829718017737,
+      "learning_rate": 0.00024470597504805516,
+      "loss": 1.9025,
       "step": 50
     },
     {
       "epoch": 0.15644171779141106,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.3960289244574568,
+      "learning_rate": 0.00024435019076216627,
+      "loss": 1.9338,
       "step": 51
     },
     {
       "epoch": 0.15950920245398773,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.44538606572029693,
+      "learning_rate": 0.00024398314414287938,
+      "loss": 1.9495,
       "step": 52
     },
     {
       "epoch": 0.16257668711656442,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.38091673390175385,
+      "learning_rate": 0.00024360487387801872,
+      "loss": 1.9579,
       "step": 53
     },
     {
       "epoch": 0.1656441717791411,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.3786713587133258,
+      "learning_rate": 0.00024321541983841468,
+      "loss": 1.9606,
       "step": 54
     },
     {
       "epoch": 0.1687116564417178,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.34787683708853046,
+      "learning_rate": 0.00024281482307370142,
+      "loss": 1.9642,
       "step": 55
     },
     {
       "epoch": 0.17177914110429449,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.7739678290668914,
+      "learning_rate": 0.00024240312580799,
+      "loss": 1.9082,
       "step": 56
     },
     {
       "epoch": 0.17484662576687116,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.8090200041147584,
+      "learning_rate": 0.00024198037143541792,
+      "loss": 1.9458,
       "step": 57
     },
     {
       "epoch": 0.17791411042944785,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.38965067919011226,
+      "learning_rate": 0.00024154660451557508,
+      "loss": 1.9724,
       "step": 58
     },
     {
       "epoch": 0.18098159509202455,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.525233423512868,
+      "learning_rate": 0.0002411018707688073,
+      "loss": 1.9726,
       "step": 59
     },
     {
       "epoch": 0.18404907975460122,
-      "grad_norm":
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.6309847144158074,
+      "learning_rate": 0.00024064621707139708,
+      "loss": 1.8999,
       "step": 60
     },
     {
       "epoch": 0.18711656441717792,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.8241404186554419,
+      "learning_rate": 0.00024017969145062278,
+      "loss": 1.927,
       "step": 61
     },
     {
       "epoch": 0.1901840490797546,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.3936537378135966,
+      "learning_rate": 0.0002397023430796964,
+      "loss": 1.9457,
       "step": 62
     },
     {
       "epoch": 0.19325153374233128,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.5030215425538933,
+      "learning_rate": 0.0002392142222725805,
+      "loss": 1.9413,
       "step": 63
     },
     {
       "epoch": 0.19631901840490798,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.82199867849235,
+      "learning_rate": 0.00023871538047868512,
+      "loss": 1.8935,
       "step": 64
     },
     {
       "epoch": 0.19938650306748465,
-      "grad_norm":
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.36522090025587745,
+      "learning_rate": 0.00023820587027744452,
+      "loss": 1.8778,
       "step": 65
     },
     {
       "epoch": 0.20245398773006135,
-      "grad_norm": 0.
-      "learning_rate":
-      "loss": 1.
+      "grad_norm": 0.44631812034158336,
+      "learning_rate": 0.00023768574537277558,
+      "loss": 1.8862,
       "step": 66
     },
     {
       "epoch": 0.20245398773006135,
-      "eval_loss": 2.
-      "eval_runtime": 55.
-      "eval_samples_per_second": 1.
-      "eval_steps_per_second": 0.
+      "eval_loss": 2.6580638885498047,
+      "eval_runtime": 55.7526,
+      "eval_samples_per_second": 1.794,
+      "eval_steps_per_second": 0.126,
       "step": 66
     }
   ],
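The new log_history values trace a linear warmup of 1.25e-05 per step up to 2.5e-04 at step 20, followed by a slow cosine-style decay (down to roughly 2.38e-04 by step 66). A minimal sketch of reproducing that shape with the transformers scheduler helper; the total step count of 342 is inferred from the logged values and is an assumption, not something stated in this diff:

import torch
from transformers import get_cosine_schedule_with_warmup

# Dummy parameter and optimizer just to drive the scheduler.
param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.AdamW([param], lr=2.5e-4)  # peak LR reached at step 20

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=20,    # matches the 20-step linear ramp in the log
    num_training_steps=342  # assumed total; only 66 steps are logged here
)

for step in range(1, 67):
    optimizer.step()
    scheduler.step()
    if step in (1, 20, 66):
        print(step, scheduler.get_last_lr()[0])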
last-checkpoint/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:46d6cb0eb1e7ca6e84cff1f8ec963246766ca8b78e905f9a2825914974167129
 size 8120