SergCorsi commited on
Commit
f7a1453
1 Parent(s): d718766

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. adapter_config.json +3 -3
  2. adapter_model.safetensors +1 -1
  3. checkpoint-10/adapter_config.json +3 -3
  4. checkpoint-10/adapter_model.safetensors +1 -1
  5. checkpoint-10/optimizer.pt +1 -1
  6. checkpoint-10/scheduler.pt +1 -1
  7. checkpoint-10/trainer_state.json +10 -10
  8. checkpoint-10/training_args.bin +1 -1
  9. checkpoint-100/README.md +1 -1
  10. checkpoint-100/adapter_config.json +3 -3
  11. checkpoint-100/adapter_model.safetensors +1 -1
  12. checkpoint-100/optimizer.pt +1 -1
  13. checkpoint-100/scheduler.pt +1 -1
  14. checkpoint-100/trainer_state.json +73 -73
  15. checkpoint-100/training_args.bin +1 -1
  16. checkpoint-110/README.md +1 -1
  17. checkpoint-110/adapter_config.json +3 -3
  18. checkpoint-110/adapter_model.safetensors +1 -1
  19. checkpoint-110/optimizer.pt +1 -1
  20. checkpoint-110/scheduler.pt +1 -1
  21. checkpoint-110/trainer_state.json +79 -79
  22. checkpoint-110/training_args.bin +1 -1
  23. checkpoint-120/README.md +1 -1
  24. checkpoint-120/adapter_config.json +3 -3
  25. checkpoint-120/adapter_model.safetensors +1 -1
  26. checkpoint-120/optimizer.pt +1 -1
  27. checkpoint-120/scheduler.pt +1 -1
  28. checkpoint-120/trainer_state.json +87 -87
  29. checkpoint-120/training_args.bin +1 -1
  30. checkpoint-20/adapter_config.json +3 -3
  31. checkpoint-20/adapter_model.safetensors +1 -1
  32. checkpoint-20/optimizer.pt +1 -1
  33. checkpoint-20/scheduler.pt +1 -1
  34. checkpoint-20/trainer_state.json +17 -17
  35. checkpoint-20/training_args.bin +1 -1
  36. checkpoint-30/adapter_config.json +3 -3
  37. checkpoint-30/adapter_model.safetensors +1 -1
  38. checkpoint-30/optimizer.pt +1 -1
  39. checkpoint-30/scheduler.pt +1 -1
  40. checkpoint-30/trainer_state.json +23 -23
  41. checkpoint-30/training_args.bin +1 -1
  42. checkpoint-40/adapter_config.json +3 -3
  43. checkpoint-40/adapter_model.safetensors +1 -1
  44. checkpoint-40/optimizer.pt +1 -1
  45. checkpoint-40/scheduler.pt +1 -1
  46. checkpoint-40/trainer_state.json +29 -29
  47. checkpoint-40/training_args.bin +1 -1
  48. checkpoint-50/adapter_config.json +3 -3
  49. checkpoint-50/adapter_model.safetensors +1 -1
  50. checkpoint-50/optimizer.pt +1 -1
adapter_config.json CHANGED
@@ -20,10 +20,10 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "k_proj",
24
  "q_proj",
25
- "o_proj",
26
- "v_proj"
 
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "q_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "o_proj"
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6bdfd134cf3b5e167c3aa127bf57024a3e8ff71b6b0ea16d5493a51a01d7e317
3
  size 67143296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f90c1d3ed853f5e7e29d9c0d39bdab0cc26bd4d4ea5fbb602291f4c783b23d04
3
  size 67143296
checkpoint-10/adapter_config.json CHANGED
@@ -20,10 +20,10 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "k_proj",
24
  "q_proj",
25
- "o_proj",
26
- "v_proj"
 
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "q_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "o_proj"
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
checkpoint-10/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e16adf919d93d6441c2583be16d89fc2157635291e0c18a1835380e4dd25668
3
  size 67143296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98e20323c453a19bc80c8d7d2e78dd31c294b2902e1e5c93d6fc3bb60807b9a9
3
  size 67143296
checkpoint-10/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:90d3b34bfbd9c3f0886fa09e483a0a1fa8853028f68e4ce50843d14911e15412
3
  size 134433530
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e8ca82274d47b8f0c29a9ec38dc99330e73ae5a77ba65837b1561b4da10e245
3
  size 134433530
checkpoint-10/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04d2341737bca7648a4cdb3a55768450f9758f2298ef492fe1db7f093eaa1902
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47a859c720f996d82c8b4e6126df0e86212eb2bb6933303af0eacc71bf5de32f
3
  size 1064
checkpoint-10/trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 1.737181544303894,
3
  "best_model_checkpoint": "/kaggle/working/checkpoint-10",
4
  "epoch": 1.1111111111111112,
5
  "eval_steps": 10,
@@ -10,24 +10,24 @@
10
  "log_history": [
11
  {
12
  "epoch": 1.1111111111111112,
13
- "grad_norm": 0.02217627689242363,
14
- "learning_rate": 0.00017777777777777779,
15
- "loss": 2.0442,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 1.1111111111111112,
20
- "eval_loss": 1.737181544303894,
21
- "eval_runtime": 35.1318,
22
- "eval_samples_per_second": 1.025,
23
- "eval_steps_per_second": 0.142,
24
  "step": 10
25
  }
26
  ],
27
  "logging_steps": 10,
28
- "max_steps": 90,
29
  "num_input_tokens_seen": 0,
30
- "num_train_epochs": 10,
31
  "save_steps": 10,
32
  "stateful_callbacks": {
33
  "EarlyStoppingCallback": {
 
1
  {
2
+ "best_metric": 1.729261875152588,
3
  "best_model_checkpoint": "/kaggle/working/checkpoint-10",
4
  "epoch": 1.1111111111111112,
5
  "eval_steps": 10,
 
10
  "log_history": [
11
  {
12
  "epoch": 1.1111111111111112,
13
+ "grad_norm": 0.022457197308540344,
14
+ "learning_rate": 0.0001925925925925926,
15
+ "loss": 2.0406,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 1.1111111111111112,
20
+ "eval_loss": 1.729261875152588,
21
+ "eval_runtime": 34.8953,
22
+ "eval_samples_per_second": 1.032,
23
+ "eval_steps_per_second": 0.143,
24
  "step": 10
25
  }
26
  ],
27
  "logging_steps": 10,
28
+ "max_steps": 270,
29
  "num_input_tokens_seen": 0,
30
+ "num_train_epochs": 30,
31
  "save_steps": 10,
32
  "stateful_callbacks": {
33
  "EarlyStoppingCallback": {
checkpoint-10/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f0053334cab1bfd2838c507b240d05093bc205b1d5fbe6dc54a022fef5dcaa7
3
  size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e
3
  size 5112
checkpoint-100/README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
- library_name: peft
3
  base_model: TheBloke/Llama-2-7B-fp16
 
4
  ---
5
 
6
  # Model Card for Model ID
 
1
  ---
 
2
  base_model: TheBloke/Llama-2-7B-fp16
3
+ library_name: peft
4
  ---
5
 
6
  # Model Card for Model ID
checkpoint-100/adapter_config.json CHANGED
@@ -20,10 +20,10 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "k_proj",
24
  "v_proj",
25
- "o_proj",
26
- "q_proj"
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "q_proj",
24
  "v_proj",
25
+ "k_proj",
26
+ "o_proj"
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
checkpoint-100/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac3b59826a91a331332b5850491ffec38f48afde058dead68205fb9903924aac
3
  size 67143296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d0047624e79262540578984e129869e2d6934ddd722dbc7dfc5f942e628b000
3
  size 67143296
checkpoint-100/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49ff3c81928bee92eb73c79e2f6088612cc35f2fab427d0f73ba21269e3c8085
3
  size 134433530
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df94a2231d5455b24c3ce8db2477572330be1aa99e33c02b4ac96351f08c9fc8
3
  size 134433530
checkpoint-100/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a67071c9831c7625e547eac0c0538006ee7fe06d1b1052844fd1cdb5172b8b9f
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:127e093ac25e89499f96e10a77287e7041566fb667c0634628ee414b8d0443ea
3
  size 1064
checkpoint-100/trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 1.2115424871444702,
3
  "best_model_checkpoint": "/kaggle/working/checkpoint-90",
4
  "epoch": 11.11111111111111,
5
  "eval_steps": 10,
@@ -10,159 +10,159 @@
10
  "log_history": [
11
  {
12
  "epoch": 1.1111111111111112,
13
- "grad_norm": 0.022282764315605164,
14
- "learning_rate": 0.0001851851851851852,
15
- "loss": 2.0424,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 1.1111111111111112,
20
- "eval_loss": 1.733155369758606,
21
- "eval_runtime": 34.5543,
22
- "eval_samples_per_second": 1.042,
23
- "eval_steps_per_second": 0.145,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 2.2222222222222223,
28
- "grad_norm": 0.018981408327817917,
29
- "learning_rate": 0.00017037037037037037,
30
- "loss": 1.6072,
31
  "step": 20
32
  },
33
  {
34
  "epoch": 2.2222222222222223,
35
- "eval_loss": 1.5428930521011353,
36
- "eval_runtime": 34.6485,
37
- "eval_samples_per_second": 1.039,
38
- "eval_steps_per_second": 0.144,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 3.3333333333333335,
43
- "grad_norm": 0.023157037794589996,
44
- "learning_rate": 0.00015555555555555556,
45
- "loss": 1.4025,
46
  "step": 30
47
  },
48
  {
49
  "epoch": 3.3333333333333335,
50
- "eval_loss": 1.4176721572875977,
51
- "eval_runtime": 34.5433,
52
- "eval_samples_per_second": 1.042,
53
- "eval_steps_per_second": 0.145,
54
  "step": 30
55
  },
56
  {
57
  "epoch": 4.444444444444445,
58
- "grad_norm": 0.021338749676942825,
59
- "learning_rate": 0.00014074074074074076,
60
- "loss": 1.285,
61
  "step": 40
62
  },
63
  {
64
  "epoch": 4.444444444444445,
65
- "eval_loss": 1.3449772596359253,
66
- "eval_runtime": 34.5594,
67
- "eval_samples_per_second": 1.042,
68
- "eval_steps_per_second": 0.145,
69
  "step": 40
70
  },
71
  {
72
  "epoch": 5.555555555555555,
73
- "grad_norm": 0.02489505708217621,
74
- "learning_rate": 0.00012592592592592592,
75
- "loss": 1.1687,
76
  "step": 50
77
  },
78
  {
79
  "epoch": 5.555555555555555,
80
- "eval_loss": 1.2951068878173828,
81
- "eval_runtime": 34.5896,
82
- "eval_samples_per_second": 1.041,
83
- "eval_steps_per_second": 0.145,
84
  "step": 50
85
  },
86
  {
87
  "epoch": 6.666666666666667,
88
- "grad_norm": 0.028962766751646996,
89
- "learning_rate": 0.00011111111111111112,
90
- "loss": 1.0521,
91
  "step": 60
92
  },
93
  {
94
  "epoch": 6.666666666666667,
95
- "eval_loss": 1.2674343585968018,
96
- "eval_runtime": 34.5586,
97
- "eval_samples_per_second": 1.042,
98
- "eval_steps_per_second": 0.145,
99
  "step": 60
100
  },
101
  {
102
  "epoch": 7.777777777777778,
103
- "grad_norm": 0.033917125314474106,
104
- "learning_rate": 9.62962962962963e-05,
105
- "loss": 0.9885,
106
  "step": 70
107
  },
108
  {
109
  "epoch": 7.777777777777778,
110
- "eval_loss": 1.2424466609954834,
111
- "eval_runtime": 34.5412,
112
- "eval_samples_per_second": 1.042,
113
- "eval_steps_per_second": 0.145,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 8.88888888888889,
118
- "grad_norm": 0.03393130004405975,
119
- "learning_rate": 8.148148148148148e-05,
120
- "loss": 0.8784,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 8.88888888888889,
125
- "eval_loss": 1.2252851724624634,
126
- "eval_runtime": 34.58,
127
- "eval_samples_per_second": 1.041,
128
- "eval_steps_per_second": 0.145,
129
  "step": 80
130
  },
131
  {
132
  "epoch": 10.0,
133
- "grad_norm": 0.04081139340996742,
134
- "learning_rate": 6.666666666666667e-05,
135
- "loss": 0.8154,
136
  "step": 90
137
  },
138
  {
139
  "epoch": 10.0,
140
- "eval_loss": 1.2115424871444702,
141
- "eval_runtime": 34.5784,
142
- "eval_samples_per_second": 1.041,
143
- "eval_steps_per_second": 0.145,
144
  "step": 90
145
  },
146
  {
147
  "epoch": 11.11111111111111,
148
- "grad_norm": 0.04114004969596863,
149
- "learning_rate": 5.185185185185185e-05,
150
- "loss": 0.7376,
151
  "step": 100
152
  },
153
  {
154
  "epoch": 11.11111111111111,
155
- "eval_loss": 1.2147088050842285,
156
- "eval_runtime": 34.595,
157
- "eval_samples_per_second": 1.041,
158
- "eval_steps_per_second": 0.145,
159
  "step": 100
160
  }
161
  ],
162
  "logging_steps": 10,
163
- "max_steps": 135,
164
  "num_input_tokens_seen": 0,
165
- "num_train_epochs": 15,
166
  "save_steps": 10,
167
  "stateful_callbacks": {
168
  "EarlyStoppingCallback": {
 
1
  {
2
+ "best_metric": 1.173593521118164,
3
  "best_model_checkpoint": "/kaggle/working/checkpoint-90",
4
  "epoch": 11.11111111111111,
5
  "eval_steps": 10,
 
10
  "log_history": [
11
  {
12
  "epoch": 1.1111111111111112,
13
+ "grad_norm": 0.022457197308540344,
14
+ "learning_rate": 0.0001925925925925926,
15
+ "loss": 2.0406,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 1.1111111111111112,
20
+ "eval_loss": 1.729261875152588,
21
+ "eval_runtime": 34.8953,
22
+ "eval_samples_per_second": 1.032,
23
+ "eval_steps_per_second": 0.143,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 2.2222222222222223,
28
+ "grad_norm": 0.018787898123264313,
29
+ "learning_rate": 0.0001851851851851852,
30
+ "loss": 1.6016,
31
  "step": 20
32
  },
33
  {
34
  "epoch": 2.2222222222222223,
35
+ "eval_loss": 1.5362553596496582,
36
+ "eval_runtime": 34.8752,
37
+ "eval_samples_per_second": 1.032,
38
+ "eval_steps_per_second": 0.143,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 3.3333333333333335,
43
+ "grad_norm": 0.021070128306746483,
44
+ "learning_rate": 0.00017777777777777779,
45
+ "loss": 1.3937,
46
  "step": 30
47
  },
48
  {
49
  "epoch": 3.3333333333333335,
50
+ "eval_loss": 1.4144253730773926,
51
+ "eval_runtime": 34.9429,
52
+ "eval_samples_per_second": 1.03,
53
+ "eval_steps_per_second": 0.143,
54
  "step": 30
55
  },
56
  {
57
  "epoch": 4.444444444444445,
58
+ "grad_norm": 0.037991978228092194,
59
+ "learning_rate": 0.00017037037037037037,
60
+ "loss": 1.2721,
61
  "step": 40
62
  },
63
  {
64
  "epoch": 4.444444444444445,
65
+ "eval_loss": 1.3360365629196167,
66
+ "eval_runtime": 34.8947,
67
+ "eval_samples_per_second": 1.032,
68
+ "eval_steps_per_second": 0.143,
69
  "step": 40
70
  },
71
  {
72
  "epoch": 5.555555555555555,
73
+ "grad_norm": 0.029117526486516,
74
+ "learning_rate": 0.00016296296296296295,
75
+ "loss": 1.1384,
76
  "step": 50
77
  },
78
  {
79
  "epoch": 5.555555555555555,
80
+ "eval_loss": 1.2785382270812988,
81
+ "eval_runtime": 34.8447,
82
+ "eval_samples_per_second": 1.033,
83
+ "eval_steps_per_second": 0.143,
84
  "step": 50
85
  },
86
  {
87
  "epoch": 6.666666666666667,
88
+ "grad_norm": 0.0317281112074852,
89
+ "learning_rate": 0.00015555555555555556,
90
+ "loss": 1.0023,
91
  "step": 60
92
  },
93
  {
94
  "epoch": 6.666666666666667,
95
+ "eval_loss": 1.2417998313903809,
96
+ "eval_runtime": 34.8141,
97
+ "eval_samples_per_second": 1.034,
98
+ "eval_steps_per_second": 0.144,
99
  "step": 60
100
  },
101
  {
102
  "epoch": 7.777777777777778,
103
+ "grad_norm": 0.034914035350084305,
104
+ "learning_rate": 0.00014814814814814815,
105
+ "loss": 0.9166,
106
  "step": 70
107
  },
108
  {
109
  "epoch": 7.777777777777778,
110
+ "eval_loss": 1.2166908979415894,
111
+ "eval_runtime": 34.8956,
112
+ "eval_samples_per_second": 1.032,
113
+ "eval_steps_per_second": 0.143,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 8.88888888888889,
118
+ "grad_norm": 0.04872061312198639,
119
+ "learning_rate": 0.00014074074074074076,
120
+ "loss": 0.7726,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 8.88888888888889,
125
+ "eval_loss": 1.19890296459198,
126
+ "eval_runtime": 34.8433,
127
+ "eval_samples_per_second": 1.033,
128
+ "eval_steps_per_second": 0.143,
129
  "step": 80
130
  },
131
  {
132
  "epoch": 10.0,
133
+ "grad_norm": 0.04901803284883499,
134
+ "learning_rate": 0.00013333333333333334,
135
+ "loss": 0.676,
136
  "step": 90
137
  },
138
  {
139
  "epoch": 10.0,
140
+ "eval_loss": 1.173593521118164,
141
+ "eval_runtime": 34.7999,
142
+ "eval_samples_per_second": 1.034,
143
+ "eval_steps_per_second": 0.144,
144
  "step": 90
145
  },
146
  {
147
  "epoch": 11.11111111111111,
148
+ "grad_norm": 0.055481575429439545,
149
+ "learning_rate": 0.00012592592592592592,
150
+ "loss": 0.56,
151
  "step": 100
152
  },
153
  {
154
  "epoch": 11.11111111111111,
155
+ "eval_loss": 1.2059063911437988,
156
+ "eval_runtime": 34.8432,
157
+ "eval_samples_per_second": 1.033,
158
+ "eval_steps_per_second": 0.143,
159
  "step": 100
160
  }
161
  ],
162
  "logging_steps": 10,
163
+ "max_steps": 270,
164
  "num_input_tokens_seen": 0,
165
+ "num_train_epochs": 30,
166
  "save_steps": 10,
167
  "stateful_callbacks": {
168
  "EarlyStoppingCallback": {
checkpoint-100/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:79c753ff1ba946038f620bad3e42a35ce583c9e8ed52b49fd22fb6614fea0f43
3
  size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e
3
  size 5112
checkpoint-110/README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
- library_name: peft
3
  base_model: TheBloke/Llama-2-7B-fp16
 
4
  ---
5
 
6
  # Model Card for Model ID
 
1
  ---
 
2
  base_model: TheBloke/Llama-2-7B-fp16
3
+ library_name: peft
4
  ---
5
 
6
  # Model Card for Model ID
checkpoint-110/adapter_config.json CHANGED
@@ -20,10 +20,10 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "k_proj",
24
  "v_proj",
25
- "o_proj",
26
- "q_proj"
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "q_proj",
24
  "v_proj",
25
+ "k_proj",
26
+ "o_proj"
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
checkpoint-110/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be7b95474c7a25c6961db7fe4913e88e0e78819b321a21a383b179782c22ef6c
3
  size 67143296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:144bb4cb915061effe137c30fcd2897134d3bad9790d3265733214a882cd96fa
3
  size 67143296
checkpoint-110/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:567bf2cdba466fb3da7301567fc6eee0fd77e99808ab9402911468a94017eb0a
3
  size 134433530
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e75cb352b366f1f8dcb73f6cbcd9937088a39b25a9a826d16c8594e055eea58
3
  size 134433530
checkpoint-110/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c7403172deaf546b51410d54cab636a5c53264be17aa6a439e5934523944587
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ed11929090acc3d040cabb379d312daf9924e0e46fa0a1c8884a63973944e92
3
  size 1064
checkpoint-110/trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 1.2115424871444702,
3
  "best_model_checkpoint": "/kaggle/working/checkpoint-90",
4
  "epoch": 12.222222222222221,
5
  "eval_steps": 10,
@@ -10,174 +10,174 @@
10
  "log_history": [
11
  {
12
  "epoch": 1.1111111111111112,
13
- "grad_norm": 0.022282764315605164,
14
- "learning_rate": 0.0001851851851851852,
15
- "loss": 2.0424,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 1.1111111111111112,
20
- "eval_loss": 1.733155369758606,
21
- "eval_runtime": 34.5543,
22
- "eval_samples_per_second": 1.042,
23
- "eval_steps_per_second": 0.145,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 2.2222222222222223,
28
- "grad_norm": 0.018981408327817917,
29
- "learning_rate": 0.00017037037037037037,
30
- "loss": 1.6072,
31
  "step": 20
32
  },
33
  {
34
  "epoch": 2.2222222222222223,
35
- "eval_loss": 1.5428930521011353,
36
- "eval_runtime": 34.6485,
37
- "eval_samples_per_second": 1.039,
38
- "eval_steps_per_second": 0.144,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 3.3333333333333335,
43
- "grad_norm": 0.023157037794589996,
44
- "learning_rate": 0.00015555555555555556,
45
- "loss": 1.4025,
46
  "step": 30
47
  },
48
  {
49
  "epoch": 3.3333333333333335,
50
- "eval_loss": 1.4176721572875977,
51
- "eval_runtime": 34.5433,
52
- "eval_samples_per_second": 1.042,
53
- "eval_steps_per_second": 0.145,
54
  "step": 30
55
  },
56
  {
57
  "epoch": 4.444444444444445,
58
- "grad_norm": 0.021338749676942825,
59
- "learning_rate": 0.00014074074074074076,
60
- "loss": 1.285,
61
  "step": 40
62
  },
63
  {
64
  "epoch": 4.444444444444445,
65
- "eval_loss": 1.3449772596359253,
66
- "eval_runtime": 34.5594,
67
- "eval_samples_per_second": 1.042,
68
- "eval_steps_per_second": 0.145,
69
  "step": 40
70
  },
71
  {
72
  "epoch": 5.555555555555555,
73
- "grad_norm": 0.02489505708217621,
74
- "learning_rate": 0.00012592592592592592,
75
- "loss": 1.1687,
76
  "step": 50
77
  },
78
  {
79
  "epoch": 5.555555555555555,
80
- "eval_loss": 1.2951068878173828,
81
- "eval_runtime": 34.5896,
82
- "eval_samples_per_second": 1.041,
83
- "eval_steps_per_second": 0.145,
84
  "step": 50
85
  },
86
  {
87
  "epoch": 6.666666666666667,
88
- "grad_norm": 0.028962766751646996,
89
- "learning_rate": 0.00011111111111111112,
90
- "loss": 1.0521,
91
  "step": 60
92
  },
93
  {
94
  "epoch": 6.666666666666667,
95
- "eval_loss": 1.2674343585968018,
96
- "eval_runtime": 34.5586,
97
- "eval_samples_per_second": 1.042,
98
- "eval_steps_per_second": 0.145,
99
  "step": 60
100
  },
101
  {
102
  "epoch": 7.777777777777778,
103
- "grad_norm": 0.033917125314474106,
104
- "learning_rate": 9.62962962962963e-05,
105
- "loss": 0.9885,
106
  "step": 70
107
  },
108
  {
109
  "epoch": 7.777777777777778,
110
- "eval_loss": 1.2424466609954834,
111
- "eval_runtime": 34.5412,
112
- "eval_samples_per_second": 1.042,
113
- "eval_steps_per_second": 0.145,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 8.88888888888889,
118
- "grad_norm": 0.03393130004405975,
119
- "learning_rate": 8.148148148148148e-05,
120
- "loss": 0.8784,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 8.88888888888889,
125
- "eval_loss": 1.2252851724624634,
126
- "eval_runtime": 34.58,
127
- "eval_samples_per_second": 1.041,
128
- "eval_steps_per_second": 0.145,
129
  "step": 80
130
  },
131
  {
132
  "epoch": 10.0,
133
- "grad_norm": 0.04081139340996742,
134
- "learning_rate": 6.666666666666667e-05,
135
- "loss": 0.8154,
136
  "step": 90
137
  },
138
  {
139
  "epoch": 10.0,
140
- "eval_loss": 1.2115424871444702,
141
- "eval_runtime": 34.5784,
142
- "eval_samples_per_second": 1.041,
143
- "eval_steps_per_second": 0.145,
144
  "step": 90
145
  },
146
  {
147
  "epoch": 11.11111111111111,
148
- "grad_norm": 0.04114004969596863,
149
- "learning_rate": 5.185185185185185e-05,
150
- "loss": 0.7376,
151
  "step": 100
152
  },
153
  {
154
  "epoch": 11.11111111111111,
155
- "eval_loss": 1.2147088050842285,
156
- "eval_runtime": 34.595,
157
- "eval_samples_per_second": 1.041,
158
- "eval_steps_per_second": 0.145,
159
  "step": 100
160
  },
161
  {
162
  "epoch": 12.222222222222221,
163
- "grad_norm": 0.04217207431793213,
164
- "learning_rate": 3.7037037037037037e-05,
165
- "loss": 0.6642,
166
  "step": 110
167
  },
168
  {
169
  "epoch": 12.222222222222221,
170
- "eval_loss": 1.2141155004501343,
171
- "eval_runtime": 34.6053,
172
- "eval_samples_per_second": 1.04,
173
  "eval_steps_per_second": 0.144,
174
  "step": 110
175
  }
176
  ],
177
  "logging_steps": 10,
178
- "max_steps": 135,
179
  "num_input_tokens_seen": 0,
180
- "num_train_epochs": 15,
181
  "save_steps": 10,
182
  "stateful_callbacks": {
183
  "EarlyStoppingCallback": {
 
1
  {
2
+ "best_metric": 1.173593521118164,
3
  "best_model_checkpoint": "/kaggle/working/checkpoint-90",
4
  "epoch": 12.222222222222221,
5
  "eval_steps": 10,
 
10
  "log_history": [
11
  {
12
  "epoch": 1.1111111111111112,
13
+ "grad_norm": 0.022457197308540344,
14
+ "learning_rate": 0.0001925925925925926,
15
+ "loss": 2.0406,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 1.1111111111111112,
20
+ "eval_loss": 1.729261875152588,
21
+ "eval_runtime": 34.8953,
22
+ "eval_samples_per_second": 1.032,
23
+ "eval_steps_per_second": 0.143,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 2.2222222222222223,
28
+ "grad_norm": 0.018787898123264313,
29
+ "learning_rate": 0.0001851851851851852,
30
+ "loss": 1.6016,
31
  "step": 20
32
  },
33
  {
34
  "epoch": 2.2222222222222223,
35
+ "eval_loss": 1.5362553596496582,
36
+ "eval_runtime": 34.8752,
37
+ "eval_samples_per_second": 1.032,
38
+ "eval_steps_per_second": 0.143,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 3.3333333333333335,
43
+ "grad_norm": 0.021070128306746483,
44
+ "learning_rate": 0.00017777777777777779,
45
+ "loss": 1.3937,
46
  "step": 30
47
  },
48
  {
49
  "epoch": 3.3333333333333335,
50
+ "eval_loss": 1.4144253730773926,
51
+ "eval_runtime": 34.9429,
52
+ "eval_samples_per_second": 1.03,
53
+ "eval_steps_per_second": 0.143,
54
  "step": 30
55
  },
56
  {
57
  "epoch": 4.444444444444445,
58
+ "grad_norm": 0.037991978228092194,
59
+ "learning_rate": 0.00017037037037037037,
60
+ "loss": 1.2721,
61
  "step": 40
62
  },
63
  {
64
  "epoch": 4.444444444444445,
65
+ "eval_loss": 1.3360365629196167,
66
+ "eval_runtime": 34.8947,
67
+ "eval_samples_per_second": 1.032,
68
+ "eval_steps_per_second": 0.143,
69
  "step": 40
70
  },
71
  {
72
  "epoch": 5.555555555555555,
73
+ "grad_norm": 0.029117526486516,
74
+ "learning_rate": 0.00016296296296296295,
75
+ "loss": 1.1384,
76
  "step": 50
77
  },
78
  {
79
  "epoch": 5.555555555555555,
80
+ "eval_loss": 1.2785382270812988,
81
+ "eval_runtime": 34.8447,
82
+ "eval_samples_per_second": 1.033,
83
+ "eval_steps_per_second": 0.143,
84
  "step": 50
85
  },
86
  {
87
  "epoch": 6.666666666666667,
88
+ "grad_norm": 0.0317281112074852,
89
+ "learning_rate": 0.00015555555555555556,
90
+ "loss": 1.0023,
91
  "step": 60
92
  },
93
  {
94
  "epoch": 6.666666666666667,
95
+ "eval_loss": 1.2417998313903809,
96
+ "eval_runtime": 34.8141,
97
+ "eval_samples_per_second": 1.034,
98
+ "eval_steps_per_second": 0.144,
99
  "step": 60
100
  },
101
  {
102
  "epoch": 7.777777777777778,
103
+ "grad_norm": 0.034914035350084305,
104
+ "learning_rate": 0.00014814814814814815,
105
+ "loss": 0.9166,
106
  "step": 70
107
  },
108
  {
109
  "epoch": 7.777777777777778,
110
+ "eval_loss": 1.2166908979415894,
111
+ "eval_runtime": 34.8956,
112
+ "eval_samples_per_second": 1.032,
113
+ "eval_steps_per_second": 0.143,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 8.88888888888889,
118
+ "grad_norm": 0.04872061312198639,
119
+ "learning_rate": 0.00014074074074074076,
120
+ "loss": 0.7726,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 8.88888888888889,
125
+ "eval_loss": 1.19890296459198,
126
+ "eval_runtime": 34.8433,
127
+ "eval_samples_per_second": 1.033,
128
+ "eval_steps_per_second": 0.143,
129
  "step": 80
130
  },
131
  {
132
  "epoch": 10.0,
133
+ "grad_norm": 0.04901803284883499,
134
+ "learning_rate": 0.00013333333333333334,
135
+ "loss": 0.676,
136
  "step": 90
137
  },
138
  {
139
  "epoch": 10.0,
140
+ "eval_loss": 1.173593521118164,
141
+ "eval_runtime": 34.7999,
142
+ "eval_samples_per_second": 1.034,
143
+ "eval_steps_per_second": 0.144,
144
  "step": 90
145
  },
146
  {
147
  "epoch": 11.11111111111111,
148
+ "grad_norm": 0.055481575429439545,
149
+ "learning_rate": 0.00012592592592592592,
150
+ "loss": 0.56,
151
  "step": 100
152
  },
153
  {
154
  "epoch": 11.11111111111111,
155
+ "eval_loss": 1.2059063911437988,
156
+ "eval_runtime": 34.8432,
157
+ "eval_samples_per_second": 1.033,
158
+ "eval_steps_per_second": 0.143,
159
  "step": 100
160
  },
161
  {
162
  "epoch": 12.222222222222221,
163
+ "grad_norm": 0.0524757020175457,
164
+ "learning_rate": 0.00011851851851851852,
165
+ "loss": 0.4567,
166
  "step": 110
167
  },
168
  {
169
  "epoch": 12.222222222222221,
170
+ "eval_loss": 1.2077444791793823,
171
+ "eval_runtime": 34.7989,
172
+ "eval_samples_per_second": 1.035,
173
  "eval_steps_per_second": 0.144,
174
  "step": 110
175
  }
176
  ],
177
  "logging_steps": 10,
178
+ "max_steps": 270,
179
  "num_input_tokens_seen": 0,
180
+ "num_train_epochs": 30,
181
  "save_steps": 10,
182
  "stateful_callbacks": {
183
  "EarlyStoppingCallback": {
checkpoint-110/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:79c753ff1ba946038f620bad3e42a35ce583c9e8ed52b49fd22fb6614fea0f43
3
  size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e
3
  size 5112
checkpoint-120/README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
- library_name: peft
3
  base_model: TheBloke/Llama-2-7B-fp16
 
4
  ---
5
 
6
  # Model Card for Model ID
 
1
  ---
 
2
  base_model: TheBloke/Llama-2-7B-fp16
3
+ library_name: peft
4
  ---
5
 
6
  # Model Card for Model ID
checkpoint-120/adapter_config.json CHANGED
@@ -20,10 +20,10 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "k_proj",
24
  "v_proj",
25
- "o_proj",
26
- "q_proj"
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "q_proj",
24
  "v_proj",
25
+ "k_proj",
26
+ "o_proj"
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
checkpoint-120/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb1690155a5e4a4452a0e292d686003b08ac457e37c20099c948aae55ca8e453
3
  size 67143296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:173890a91bffdb964ce5f909803d70349ae54ba2275b81eaf33d7e10b02d2a18
3
  size 67143296
checkpoint-120/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:14cc791678f8ce58b5f8b40f05113973aa7f94325c356b74d4df44ff8c1a956a
3
  size 134433530
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50d2389c285ac068205e38a94cd027c8b55b17736442e923e49875d92296c9dd
3
  size 134433530
checkpoint-120/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f7846da062f26e398f1065295f9c7cbaf4768d3aa6b5518863ce89b7eb9d328e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60da1fb7525eb9f93843e0f6cf6e45c012533f0f97597344050ff835287f782f
3
  size 1064
checkpoint-120/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": 1.20501708984375,
3
- "best_model_checkpoint": "/kaggle/working/checkpoint-120",
4
  "epoch": 13.333333333333334,
5
  "eval_steps": 10,
6
  "global_step": 120,
@@ -10,189 +10,189 @@
10
  "log_history": [
11
  {
12
  "epoch": 1.1111111111111112,
13
- "grad_norm": 0.022282764315605164,
14
- "learning_rate": 0.0001851851851851852,
15
- "loss": 2.0424,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 1.1111111111111112,
20
- "eval_loss": 1.733155369758606,
21
- "eval_runtime": 34.5543,
22
- "eval_samples_per_second": 1.042,
23
- "eval_steps_per_second": 0.145,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 2.2222222222222223,
28
- "grad_norm": 0.018981408327817917,
29
- "learning_rate": 0.00017037037037037037,
30
- "loss": 1.6072,
31
  "step": 20
32
  },
33
  {
34
  "epoch": 2.2222222222222223,
35
- "eval_loss": 1.5428930521011353,
36
- "eval_runtime": 34.6485,
37
- "eval_samples_per_second": 1.039,
38
- "eval_steps_per_second": 0.144,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 3.3333333333333335,
43
- "grad_norm": 0.023157037794589996,
44
- "learning_rate": 0.00015555555555555556,
45
- "loss": 1.4025,
46
  "step": 30
47
  },
48
  {
49
  "epoch": 3.3333333333333335,
50
- "eval_loss": 1.4176721572875977,
51
- "eval_runtime": 34.5433,
52
- "eval_samples_per_second": 1.042,
53
- "eval_steps_per_second": 0.145,
54
  "step": 30
55
  },
56
  {
57
  "epoch": 4.444444444444445,
58
- "grad_norm": 0.021338749676942825,
59
- "learning_rate": 0.00014074074074074076,
60
- "loss": 1.285,
61
  "step": 40
62
  },
63
  {
64
  "epoch": 4.444444444444445,
65
- "eval_loss": 1.3449772596359253,
66
- "eval_runtime": 34.5594,
67
- "eval_samples_per_second": 1.042,
68
- "eval_steps_per_second": 0.145,
69
  "step": 40
70
  },
71
  {
72
  "epoch": 5.555555555555555,
73
- "grad_norm": 0.02489505708217621,
74
- "learning_rate": 0.00012592592592592592,
75
- "loss": 1.1687,
76
  "step": 50
77
  },
78
  {
79
  "epoch": 5.555555555555555,
80
- "eval_loss": 1.2951068878173828,
81
- "eval_runtime": 34.5896,
82
- "eval_samples_per_second": 1.041,
83
- "eval_steps_per_second": 0.145,
84
  "step": 50
85
  },
86
  {
87
  "epoch": 6.666666666666667,
88
- "grad_norm": 0.028962766751646996,
89
- "learning_rate": 0.00011111111111111112,
90
- "loss": 1.0521,
91
  "step": 60
92
  },
93
  {
94
  "epoch": 6.666666666666667,
95
- "eval_loss": 1.2674343585968018,
96
- "eval_runtime": 34.5586,
97
- "eval_samples_per_second": 1.042,
98
- "eval_steps_per_second": 0.145,
99
  "step": 60
100
  },
101
  {
102
  "epoch": 7.777777777777778,
103
- "grad_norm": 0.033917125314474106,
104
- "learning_rate": 9.62962962962963e-05,
105
- "loss": 0.9885,
106
  "step": 70
107
  },
108
  {
109
  "epoch": 7.777777777777778,
110
- "eval_loss": 1.2424466609954834,
111
- "eval_runtime": 34.5412,
112
- "eval_samples_per_second": 1.042,
113
- "eval_steps_per_second": 0.145,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 8.88888888888889,
118
- "grad_norm": 0.03393130004405975,
119
- "learning_rate": 8.148148148148148e-05,
120
- "loss": 0.8784,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 8.88888888888889,
125
- "eval_loss": 1.2252851724624634,
126
- "eval_runtime": 34.58,
127
- "eval_samples_per_second": 1.041,
128
- "eval_steps_per_second": 0.145,
129
  "step": 80
130
  },
131
  {
132
  "epoch": 10.0,
133
- "grad_norm": 0.04081139340996742,
134
- "learning_rate": 6.666666666666667e-05,
135
- "loss": 0.8154,
136
  "step": 90
137
  },
138
  {
139
  "epoch": 10.0,
140
- "eval_loss": 1.2115424871444702,
141
- "eval_runtime": 34.5784,
142
- "eval_samples_per_second": 1.041,
143
- "eval_steps_per_second": 0.145,
144
  "step": 90
145
  },
146
  {
147
  "epoch": 11.11111111111111,
148
- "grad_norm": 0.04114004969596863,
149
- "learning_rate": 5.185185185185185e-05,
150
- "loss": 0.7376,
151
  "step": 100
152
  },
153
  {
154
  "epoch": 11.11111111111111,
155
- "eval_loss": 1.2147088050842285,
156
- "eval_runtime": 34.595,
157
- "eval_samples_per_second": 1.041,
158
- "eval_steps_per_second": 0.145,
159
  "step": 100
160
  },
161
  {
162
  "epoch": 12.222222222222221,
163
- "grad_norm": 0.04217207431793213,
164
- "learning_rate": 3.7037037037037037e-05,
165
- "loss": 0.6642,
166
  "step": 110
167
  },
168
  {
169
  "epoch": 12.222222222222221,
170
- "eval_loss": 1.2141155004501343,
171
- "eval_runtime": 34.6053,
172
- "eval_samples_per_second": 1.04,
173
  "eval_steps_per_second": 0.144,
174
  "step": 110
175
  },
176
  {
177
  "epoch": 13.333333333333334,
178
- "grad_norm": 0.04223904013633728,
179
- "learning_rate": 2.2222222222222223e-05,
180
- "loss": 0.6353,
181
  "step": 120
182
  },
183
  {
184
  "epoch": 13.333333333333334,
185
- "eval_loss": 1.20501708984375,
186
- "eval_runtime": 34.6447,
187
- "eval_samples_per_second": 1.039,
188
  "eval_steps_per_second": 0.144,
189
  "step": 120
190
  }
191
  ],
192
  "logging_steps": 10,
193
- "max_steps": 135,
194
  "num_input_tokens_seen": 0,
195
- "num_train_epochs": 15,
196
  "save_steps": 10,
197
  "stateful_callbacks": {
198
  "EarlyStoppingCallback": {
@@ -210,7 +210,7 @@
210
  "should_evaluate": false,
211
  "should_log": false,
212
  "should_save": true,
213
- "should_training_stop": false
214
  },
215
  "attributes": {}
216
  }
 
1
  {
2
+ "best_metric": 1.173593521118164,
3
+ "best_model_checkpoint": "/kaggle/working/checkpoint-90",
4
  "epoch": 13.333333333333334,
5
  "eval_steps": 10,
6
  "global_step": 120,
 
10
  "log_history": [
11
  {
12
  "epoch": 1.1111111111111112,
13
+ "grad_norm": 0.022457197308540344,
14
+ "learning_rate": 0.0001925925925925926,
15
+ "loss": 2.0406,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 1.1111111111111112,
20
+ "eval_loss": 1.729261875152588,
21
+ "eval_runtime": 34.8953,
22
+ "eval_samples_per_second": 1.032,
23
+ "eval_steps_per_second": 0.143,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 2.2222222222222223,
28
+ "grad_norm": 0.018787898123264313,
29
+ "learning_rate": 0.0001851851851851852,
30
+ "loss": 1.6016,
31
  "step": 20
32
  },
33
  {
34
  "epoch": 2.2222222222222223,
35
+ "eval_loss": 1.5362553596496582,
36
+ "eval_runtime": 34.8752,
37
+ "eval_samples_per_second": 1.032,
38
+ "eval_steps_per_second": 0.143,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 3.3333333333333335,
43
+ "grad_norm": 0.021070128306746483,
44
+ "learning_rate": 0.00017777777777777779,
45
+ "loss": 1.3937,
46
  "step": 30
47
  },
48
  {
49
  "epoch": 3.3333333333333335,
50
+ "eval_loss": 1.4144253730773926,
51
+ "eval_runtime": 34.9429,
52
+ "eval_samples_per_second": 1.03,
53
+ "eval_steps_per_second": 0.143,
54
  "step": 30
55
  },
56
  {
57
  "epoch": 4.444444444444445,
58
+ "grad_norm": 0.037991978228092194,
59
+ "learning_rate": 0.00017037037037037037,
60
+ "loss": 1.2721,
61
  "step": 40
62
  },
63
  {
64
  "epoch": 4.444444444444445,
65
+ "eval_loss": 1.3360365629196167,
66
+ "eval_runtime": 34.8947,
67
+ "eval_samples_per_second": 1.032,
68
+ "eval_steps_per_second": 0.143,
69
  "step": 40
70
  },
71
  {
72
  "epoch": 5.555555555555555,
73
+ "grad_norm": 0.029117526486516,
74
+ "learning_rate": 0.00016296296296296295,
75
+ "loss": 1.1384,
76
  "step": 50
77
  },
78
  {
79
  "epoch": 5.555555555555555,
80
+ "eval_loss": 1.2785382270812988,
81
+ "eval_runtime": 34.8447,
82
+ "eval_samples_per_second": 1.033,
83
+ "eval_steps_per_second": 0.143,
84
  "step": 50
85
  },
86
  {
87
  "epoch": 6.666666666666667,
88
+ "grad_norm": 0.0317281112074852,
89
+ "learning_rate": 0.00015555555555555556,
90
+ "loss": 1.0023,
91
  "step": 60
92
  },
93
  {
94
  "epoch": 6.666666666666667,
95
+ "eval_loss": 1.2417998313903809,
96
+ "eval_runtime": 34.8141,
97
+ "eval_samples_per_second": 1.034,
98
+ "eval_steps_per_second": 0.144,
99
  "step": 60
100
  },
101
  {
102
  "epoch": 7.777777777777778,
103
+ "grad_norm": 0.034914035350084305,
104
+ "learning_rate": 0.00014814814814814815,
105
+ "loss": 0.9166,
106
  "step": 70
107
  },
108
  {
109
  "epoch": 7.777777777777778,
110
+ "eval_loss": 1.2166908979415894,
111
+ "eval_runtime": 34.8956,
112
+ "eval_samples_per_second": 1.032,
113
+ "eval_steps_per_second": 0.143,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 8.88888888888889,
118
+ "grad_norm": 0.04872061312198639,
119
+ "learning_rate": 0.00014074074074074076,
120
+ "loss": 0.7726,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 8.88888888888889,
125
+ "eval_loss": 1.19890296459198,
126
+ "eval_runtime": 34.8433,
127
+ "eval_samples_per_second": 1.033,
128
+ "eval_steps_per_second": 0.143,
129
  "step": 80
130
  },
131
  {
132
  "epoch": 10.0,
133
+ "grad_norm": 0.04901803284883499,
134
+ "learning_rate": 0.00013333333333333334,
135
+ "loss": 0.676,
136
  "step": 90
137
  },
138
  {
139
  "epoch": 10.0,
140
+ "eval_loss": 1.173593521118164,
141
+ "eval_runtime": 34.7999,
142
+ "eval_samples_per_second": 1.034,
143
+ "eval_steps_per_second": 0.144,
144
  "step": 90
145
  },
146
  {
147
  "epoch": 11.11111111111111,
148
+ "grad_norm": 0.055481575429439545,
149
+ "learning_rate": 0.00012592592592592592,
150
+ "loss": 0.56,
151
  "step": 100
152
  },
153
  {
154
  "epoch": 11.11111111111111,
155
+ "eval_loss": 1.2059063911437988,
156
+ "eval_runtime": 34.8432,
157
+ "eval_samples_per_second": 1.033,
158
+ "eval_steps_per_second": 0.143,
159
  "step": 100
160
  },
161
  {
162
  "epoch": 12.222222222222221,
163
+ "grad_norm": 0.0524757020175457,
164
+ "learning_rate": 0.00011851851851851852,
165
+ "loss": 0.4567,
166
  "step": 110
167
  },
168
  {
169
  "epoch": 12.222222222222221,
170
+ "eval_loss": 1.2077444791793823,
171
+ "eval_runtime": 34.7989,
172
+ "eval_samples_per_second": 1.035,
173
  "eval_steps_per_second": 0.144,
174
  "step": 110
175
  },
176
  {
177
  "epoch": 13.333333333333334,
178
+ "grad_norm": 0.053020887076854706,
179
+ "learning_rate": 0.00011111111111111112,
180
+ "loss": 0.3915,
181
  "step": 120
182
  },
183
  {
184
  "epoch": 13.333333333333334,
185
+ "eval_loss": 1.2036480903625488,
186
+ "eval_runtime": 34.802,
187
+ "eval_samples_per_second": 1.034,
188
  "eval_steps_per_second": 0.144,
189
  "step": 120
190
  }
191
  ],
192
  "logging_steps": 10,
193
+ "max_steps": 270,
194
  "num_input_tokens_seen": 0,
195
+ "num_train_epochs": 30,
196
  "save_steps": 10,
197
  "stateful_callbacks": {
198
  "EarlyStoppingCallback": {
 
210
  "should_evaluate": false,
211
  "should_log": false,
212
  "should_save": true,
213
+ "should_training_stop": true
214
  },
215
  "attributes": {}
216
  }
checkpoint-120/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:79c753ff1ba946038f620bad3e42a35ce583c9e8ed52b49fd22fb6614fea0f43
3
  size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e
3
  size 5112
checkpoint-20/adapter_config.json CHANGED
@@ -20,10 +20,10 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "k_proj",
24
  "q_proj",
25
- "o_proj",
26
- "v_proj"
 
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "q_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "o_proj"
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
checkpoint-20/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5ab93af4ed23c52b82729ff3b3f871c19b732c90f1094f90d5a9f4ade1ccfac
3
  size 67143296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e350a043d6e188be3930ae109597f7418b1c57332d8722c377acf61b839280db
3
  size 67143296
checkpoint-20/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a07f3020a10b2a9d3e215c9651b159e8c3b297ab1db69b013b8c7817d5f52a7c
3
  size 134433530
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53aaa85798d6640d73ad6607ea99ffb3eee0b87eb9130f5f653e9d52f119e393
3
  size 134433530
checkpoint-20/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f9e7e75183c7081ca7f8f52ddfd0d5f4b8e8dbcf7f7bcd495fc6e0cfff80e3a2
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c05a2d1065158cc7891c3a0806de9d2368277087e1ac23c872a14cc5ce6a082
3
  size 1064
checkpoint-20/trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 1.5489343404769897,
3
  "best_model_checkpoint": "/kaggle/working/checkpoint-20",
4
  "epoch": 2.2222222222222223,
5
  "eval_steps": 10,
@@ -10,39 +10,39 @@
10
  "log_history": [
11
  {
12
  "epoch": 1.1111111111111112,
13
- "grad_norm": 0.02217627689242363,
14
- "learning_rate": 0.00017777777777777779,
15
- "loss": 2.0442,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 1.1111111111111112,
20
- "eval_loss": 1.737181544303894,
21
- "eval_runtime": 35.1318,
22
- "eval_samples_per_second": 1.025,
23
- "eval_steps_per_second": 0.142,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 2.2222222222222223,
28
- "grad_norm": 0.0346713550388813,
29
- "learning_rate": 0.00015555555555555556,
30
- "loss": 1.6131,
31
  "step": 20
32
  },
33
  {
34
  "epoch": 2.2222222222222223,
35
- "eval_loss": 1.5489343404769897,
36
- "eval_runtime": 34.8402,
37
- "eval_samples_per_second": 1.033,
38
- "eval_steps_per_second": 0.144,
39
  "step": 20
40
  }
41
  ],
42
  "logging_steps": 10,
43
- "max_steps": 90,
44
  "num_input_tokens_seen": 0,
45
- "num_train_epochs": 10,
46
  "save_steps": 10,
47
  "stateful_callbacks": {
48
  "EarlyStoppingCallback": {
 
1
  {
2
+ "best_metric": 1.5362553596496582,
3
  "best_model_checkpoint": "/kaggle/working/checkpoint-20",
4
  "epoch": 2.2222222222222223,
5
  "eval_steps": 10,
 
10
  "log_history": [
11
  {
12
  "epoch": 1.1111111111111112,
13
+ "grad_norm": 0.022457197308540344,
14
+ "learning_rate": 0.0001925925925925926,
15
+ "loss": 2.0406,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 1.1111111111111112,
20
+ "eval_loss": 1.729261875152588,
21
+ "eval_runtime": 34.8953,
22
+ "eval_samples_per_second": 1.032,
23
+ "eval_steps_per_second": 0.143,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 2.2222222222222223,
28
+ "grad_norm": 0.018787898123264313,
29
+ "learning_rate": 0.0001851851851851852,
30
+ "loss": 1.6016,
31
  "step": 20
32
  },
33
  {
34
  "epoch": 2.2222222222222223,
35
+ "eval_loss": 1.5362553596496582,
36
+ "eval_runtime": 34.8752,
37
+ "eval_samples_per_second": 1.032,
38
+ "eval_steps_per_second": 0.143,
39
  "step": 20
40
  }
41
  ],
42
  "logging_steps": 10,
43
+ "max_steps": 270,
44
  "num_input_tokens_seen": 0,
45
+ "num_train_epochs": 30,
46
  "save_steps": 10,
47
  "stateful_callbacks": {
48
  "EarlyStoppingCallback": {
checkpoint-20/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f0053334cab1bfd2838c507b240d05093bc205b1d5fbe6dc54a022fef5dcaa7
3
  size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e
3
  size 5112
checkpoint-30/adapter_config.json CHANGED
@@ -20,10 +20,10 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "k_proj",
24
  "q_proj",
25
- "o_proj",
26
- "v_proj"
 
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "q_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "o_proj"
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
checkpoint-30/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d8576250f42c32085cdb174e306461292b115ea33d910d0a59d062fcad935bf0
3
  size 67143296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5afb85b43418de4f387e019cd5ff83db304f24c87600e2deb9b497bc225833e
3
  size 67143296
checkpoint-30/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16613c572dcb0ccca606ca4a382a4476b3f69ed3cf64a7095e7f852e897c8426
3
  size 134433530
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9d88f9eafcc9de7708775af55f4954ceecfc02ad0285772a7592b7f07336a6c
3
  size 134433530
checkpoint-30/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:14d970dabadfb95eaf7812b80cb7816a58d7911bb09df450b100b1c052b74a02
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78882a59797d983394328068e814e7aad08e194b72ebc7003618cfb9ff129ecf
3
  size 1064
checkpoint-30/trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 1.4295110702514648,
3
  "best_model_checkpoint": "/kaggle/working/checkpoint-30",
4
  "epoch": 3.3333333333333335,
5
  "eval_steps": 10,
@@ -10,54 +10,54 @@
10
  "log_history": [
11
  {
12
  "epoch": 1.1111111111111112,
13
- "grad_norm": 0.02217627689242363,
14
- "learning_rate": 0.00017777777777777779,
15
- "loss": 2.0442,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 1.1111111111111112,
20
- "eval_loss": 1.737181544303894,
21
- "eval_runtime": 35.1318,
22
- "eval_samples_per_second": 1.025,
23
- "eval_steps_per_second": 0.142,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 2.2222222222222223,
28
- "grad_norm": 0.0346713550388813,
29
- "learning_rate": 0.00015555555555555556,
30
- "loss": 1.6131,
31
  "step": 20
32
  },
33
  {
34
  "epoch": 2.2222222222222223,
35
- "eval_loss": 1.5489343404769897,
36
- "eval_runtime": 34.8402,
37
- "eval_samples_per_second": 1.033,
38
- "eval_steps_per_second": 0.144,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 3.3333333333333335,
43
- "grad_norm": 0.02501535415649414,
44
- "learning_rate": 0.00013333333333333334,
45
- "loss": 1.4152,
46
  "step": 30
47
  },
48
  {
49
  "epoch": 3.3333333333333335,
50
- "eval_loss": 1.4295110702514648,
51
- "eval_runtime": 34.8537,
52
- "eval_samples_per_second": 1.033,
53
  "eval_steps_per_second": 0.143,
54
  "step": 30
55
  }
56
  ],
57
  "logging_steps": 10,
58
- "max_steps": 90,
59
  "num_input_tokens_seen": 0,
60
- "num_train_epochs": 10,
61
  "save_steps": 10,
62
  "stateful_callbacks": {
63
  "EarlyStoppingCallback": {
 
1
  {
2
+ "best_metric": 1.4144253730773926,
3
  "best_model_checkpoint": "/kaggle/working/checkpoint-30",
4
  "epoch": 3.3333333333333335,
5
  "eval_steps": 10,
 
10
  "log_history": [
11
  {
12
  "epoch": 1.1111111111111112,
13
+ "grad_norm": 0.022457197308540344,
14
+ "learning_rate": 0.0001925925925925926,
15
+ "loss": 2.0406,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 1.1111111111111112,
20
+ "eval_loss": 1.729261875152588,
21
+ "eval_runtime": 34.8953,
22
+ "eval_samples_per_second": 1.032,
23
+ "eval_steps_per_second": 0.143,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 2.2222222222222223,
28
+ "grad_norm": 0.018787898123264313,
29
+ "learning_rate": 0.0001851851851851852,
30
+ "loss": 1.6016,
31
  "step": 20
32
  },
33
  {
34
  "epoch": 2.2222222222222223,
35
+ "eval_loss": 1.5362553596496582,
36
+ "eval_runtime": 34.8752,
37
+ "eval_samples_per_second": 1.032,
38
+ "eval_steps_per_second": 0.143,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 3.3333333333333335,
43
+ "grad_norm": 0.021070128306746483,
44
+ "learning_rate": 0.00017777777777777779,
45
+ "loss": 1.3937,
46
  "step": 30
47
  },
48
  {
49
  "epoch": 3.3333333333333335,
50
+ "eval_loss": 1.4144253730773926,
51
+ "eval_runtime": 34.9429,
52
+ "eval_samples_per_second": 1.03,
53
  "eval_steps_per_second": 0.143,
54
  "step": 30
55
  }
56
  ],
57
  "logging_steps": 10,
58
+ "max_steps": 270,
59
  "num_input_tokens_seen": 0,
60
+ "num_train_epochs": 30,
61
  "save_steps": 10,
62
  "stateful_callbacks": {
63
  "EarlyStoppingCallback": {
checkpoint-30/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f0053334cab1bfd2838c507b240d05093bc205b1d5fbe6dc54a022fef5dcaa7
3
  size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e
3
  size 5112
checkpoint-40/adapter_config.json CHANGED
@@ -20,10 +20,10 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "k_proj",
24
  "q_proj",
25
- "o_proj",
26
- "v_proj"
 
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "q_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "o_proj"
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
checkpoint-40/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab38361a67b61947cafd5230ca79626082a1d26b72f5440faf199b3216bc6704
3
  size 67143296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c9e0b673563a407b5292005048d7f9e55e28f761356c6b7d865a6f14dbd4d1f
3
  size 67143296
checkpoint-40/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a38dd3cb56490e5a9b4d6a05ea97f3a761cd71841c3d9f7f129c1e4c0b4730f
3
  size 134433530
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7af07c81b6d2394145239563c1fcabf8a96f4c073bcbe06adbe0e38de3e745d4
3
  size 134433530
checkpoint-40/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc5423f1af1182c2163f569e8f44b9ee18e1849c11acaaa76a185745ad274c02
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:441aaa824ef89ae7e5156933dc6dbe413f7c295a974e1ca6e7641ce94bd233fa
3
  size 1064
checkpoint-40/trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 1.3598744869232178,
3
  "best_model_checkpoint": "/kaggle/working/checkpoint-40",
4
  "epoch": 4.444444444444445,
5
  "eval_steps": 10,
@@ -10,69 +10,69 @@
10
  "log_history": [
11
  {
12
  "epoch": 1.1111111111111112,
13
- "grad_norm": 0.02217627689242363,
14
- "learning_rate": 0.00017777777777777779,
15
- "loss": 2.0442,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 1.1111111111111112,
20
- "eval_loss": 1.737181544303894,
21
- "eval_runtime": 35.1318,
22
- "eval_samples_per_second": 1.025,
23
- "eval_steps_per_second": 0.142,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 2.2222222222222223,
28
- "grad_norm": 0.0346713550388813,
29
- "learning_rate": 0.00015555555555555556,
30
- "loss": 1.6131,
31
  "step": 20
32
  },
33
  {
34
  "epoch": 2.2222222222222223,
35
- "eval_loss": 1.5489343404769897,
36
- "eval_runtime": 34.8402,
37
- "eval_samples_per_second": 1.033,
38
- "eval_steps_per_second": 0.144,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 3.3333333333333335,
43
- "grad_norm": 0.02501535415649414,
44
- "learning_rate": 0.00013333333333333334,
45
- "loss": 1.4152,
46
  "step": 30
47
  },
48
  {
49
  "epoch": 3.3333333333333335,
50
- "eval_loss": 1.4295110702514648,
51
- "eval_runtime": 34.8537,
52
- "eval_samples_per_second": 1.033,
53
  "eval_steps_per_second": 0.143,
54
  "step": 30
55
  },
56
  {
57
  "epoch": 4.444444444444445,
58
- "grad_norm": 0.02104916237294674,
59
- "learning_rate": 0.00011111111111111112,
60
- "loss": 1.3068,
61
  "step": 40
62
  },
63
  {
64
  "epoch": 4.444444444444445,
65
- "eval_loss": 1.3598744869232178,
66
- "eval_runtime": 35.0281,
67
- "eval_samples_per_second": 1.028,
68
  "eval_steps_per_second": 0.143,
69
  "step": 40
70
  }
71
  ],
72
  "logging_steps": 10,
73
- "max_steps": 90,
74
  "num_input_tokens_seen": 0,
75
- "num_train_epochs": 10,
76
  "save_steps": 10,
77
  "stateful_callbacks": {
78
  "EarlyStoppingCallback": {
 
1
  {
2
+ "best_metric": 1.3360365629196167,
3
  "best_model_checkpoint": "/kaggle/working/checkpoint-40",
4
  "epoch": 4.444444444444445,
5
  "eval_steps": 10,
 
10
  "log_history": [
11
  {
12
  "epoch": 1.1111111111111112,
13
+ "grad_norm": 0.022457197308540344,
14
+ "learning_rate": 0.0001925925925925926,
15
+ "loss": 2.0406,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 1.1111111111111112,
20
+ "eval_loss": 1.729261875152588,
21
+ "eval_runtime": 34.8953,
22
+ "eval_samples_per_second": 1.032,
23
+ "eval_steps_per_second": 0.143,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 2.2222222222222223,
28
+ "grad_norm": 0.018787898123264313,
29
+ "learning_rate": 0.0001851851851851852,
30
+ "loss": 1.6016,
31
  "step": 20
32
  },
33
  {
34
  "epoch": 2.2222222222222223,
35
+ "eval_loss": 1.5362553596496582,
36
+ "eval_runtime": 34.8752,
37
+ "eval_samples_per_second": 1.032,
38
+ "eval_steps_per_second": 0.143,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 3.3333333333333335,
43
+ "grad_norm": 0.021070128306746483,
44
+ "learning_rate": 0.00017777777777777779,
45
+ "loss": 1.3937,
46
  "step": 30
47
  },
48
  {
49
  "epoch": 3.3333333333333335,
50
+ "eval_loss": 1.4144253730773926,
51
+ "eval_runtime": 34.9429,
52
+ "eval_samples_per_second": 1.03,
53
  "eval_steps_per_second": 0.143,
54
  "step": 30
55
  },
56
  {
57
  "epoch": 4.444444444444445,
58
+ "grad_norm": 0.037991978228092194,
59
+ "learning_rate": 0.00017037037037037037,
60
+ "loss": 1.2721,
61
  "step": 40
62
  },
63
  {
64
  "epoch": 4.444444444444445,
65
+ "eval_loss": 1.3360365629196167,
66
+ "eval_runtime": 34.8947,
67
+ "eval_samples_per_second": 1.032,
68
  "eval_steps_per_second": 0.143,
69
  "step": 40
70
  }
71
  ],
72
  "logging_steps": 10,
73
+ "max_steps": 270,
74
  "num_input_tokens_seen": 0,
75
+ "num_train_epochs": 30,
76
  "save_steps": 10,
77
  "stateful_callbacks": {
78
  "EarlyStoppingCallback": {
checkpoint-40/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f0053334cab1bfd2838c507b240d05093bc205b1d5fbe6dc54a022fef5dcaa7
3
  size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e
3
  size 5112
checkpoint-50/adapter_config.json CHANGED
@@ -20,10 +20,10 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "k_proj",
24
  "q_proj",
25
- "o_proj",
26
- "v_proj"
 
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "q_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "o_proj"
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
checkpoint-50/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b692c5f45a06d84947aef0a222d424aecd480e40aabcd9ca87aa5d3007aa46e8
3
  size 67143296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff24760e35eba65f38905cbc2d2b23ce73feb2da6a3bb98fa083cbd3cc564571
3
  size 67143296
checkpoint-50/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:44ab022dad22b0f149a3b1fb04e9cd79842aad48780ac055c542631a6fc57822
3
  size 134433530
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98ab5b6b7c2997d594b9c7e48d2cd958c58dcdf4eaf60e9a5fb6497764869314
3
  size 134433530