diff --git a/adapter_config.json b/adapter_config.json index 539f4c41b2550fc30b9c2d0726f51adfa8e4b1e5..072f1947edd0ba30c6174f22b28e07cf76e90ee3 100644 --- a/adapter_config.json +++ b/adapter_config.json @@ -20,10 +20,10 @@ "rank_pattern": {}, "revision": null, "target_modules": [ - "k_proj", "q_proj", - "o_proj", - "v_proj" + "v_proj", + "k_proj", + "o_proj" ], "task_type": "CAUSAL_LM", "use_dora": false, diff --git a/adapter_model.safetensors b/adapter_model.safetensors index aad60b0963bb0817ddff00ad978492cc2241a8a2..0f832e284c815c65e6a6d1276b6ea980e4569404 100644 --- a/adapter_model.safetensors +++ b/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6bdfd134cf3b5e167c3aa127bf57024a3e8ff71b6b0ea16d5493a51a01d7e317 +oid sha256:f90c1d3ed853f5e7e29d9c0d39bdab0cc26bd4d4ea5fbb602291f4c783b23d04 size 67143296 diff --git a/checkpoint-10/adapter_config.json b/checkpoint-10/adapter_config.json index 539f4c41b2550fc30b9c2d0726f51adfa8e4b1e5..072f1947edd0ba30c6174f22b28e07cf76e90ee3 100644 --- a/checkpoint-10/adapter_config.json +++ b/checkpoint-10/adapter_config.json @@ -20,10 +20,10 @@ "rank_pattern": {}, "revision": null, "target_modules": [ - "k_proj", "q_proj", - "o_proj", - "v_proj" + "v_proj", + "k_proj", + "o_proj" ], "task_type": "CAUSAL_LM", "use_dora": false, diff --git a/checkpoint-10/adapter_model.safetensors b/checkpoint-10/adapter_model.safetensors index 70f0dc5b3797e2589528793537c490c67c49b590..81f2bf4d8a7b5d6bd332921001a39ddbae233326 100644 --- a/checkpoint-10/adapter_model.safetensors +++ b/checkpoint-10/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e16adf919d93d6441c2583be16d89fc2157635291e0c18a1835380e4dd25668 +oid sha256:98e20323c453a19bc80c8d7d2e78dd31c294b2902e1e5c93d6fc3bb60807b9a9 size 67143296 diff --git a/checkpoint-10/optimizer.pt b/checkpoint-10/optimizer.pt index 2ea9b0e82ac9df07c8070110f0e9f7f510d13dbd..f4df365e5cbfd9cf8bbb3ef73f1b45eb34a70c2f 100644 --- a/checkpoint-10/optimizer.pt +++ b/checkpoint-10/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:90d3b34bfbd9c3f0886fa09e483a0a1fa8853028f68e4ce50843d14911e15412 +oid sha256:6e8ca82274d47b8f0c29a9ec38dc99330e73ae5a77ba65837b1561b4da10e245 size 134433530 diff --git a/checkpoint-10/scheduler.pt b/checkpoint-10/scheduler.pt index 9b8b9b162c3bc0952535c6a06e25390cb24b5d5a..0f20a71cb5a363c483eba636e2d33117c55249a7 100644 --- a/checkpoint-10/scheduler.pt +++ b/checkpoint-10/scheduler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:04d2341737bca7648a4cdb3a55768450f9758f2298ef492fe1db7f093eaa1902 +oid sha256:47a859c720f996d82c8b4e6126df0e86212eb2bb6933303af0eacc71bf5de32f size 1064 diff --git a/checkpoint-10/trainer_state.json b/checkpoint-10/trainer_state.json index c4d5e43fa9b77154cc4db4d15de86439adf67490..c6d68386651a9d64947671f412c0f4d5bdaded12 100644 --- a/checkpoint-10/trainer_state.json +++ b/checkpoint-10/trainer_state.json @@ -1,5 +1,5 @@ { - "best_metric": 1.737181544303894, + "best_metric": 1.729261875152588, "best_model_checkpoint": "/kaggle/working/checkpoint-10", "epoch": 1.1111111111111112, "eval_steps": 10, @@ -10,24 +10,24 @@ "log_history": [ { "epoch": 1.1111111111111112, - "grad_norm": 0.02217627689242363, - "learning_rate": 0.00017777777777777779, - "loss": 2.0442, + "grad_norm": 0.022457197308540344, + "learning_rate": 0.0001925925925925926, + "loss": 2.0406, "step": 10 }, { "epoch": 1.1111111111111112, - "eval_loss": 1.737181544303894, - "eval_runtime": 35.1318, - "eval_samples_per_second": 1.025, - "eval_steps_per_second": 0.142, + "eval_loss": 1.729261875152588, + "eval_runtime": 34.8953, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 10 } ], "logging_steps": 10, - "max_steps": 90, + "max_steps": 270, "num_input_tokens_seen": 0, - "num_train_epochs": 10, + "num_train_epochs": 30, "save_steps": 10, "stateful_callbacks": { "EarlyStoppingCallback": { diff --git a/checkpoint-10/training_args.bin b/checkpoint-10/training_args.bin index 992e364d4b54f32a399ec3cd5f5f54c212ea0588..89187d2cbb60b941ac70a59387377bf262ba730c 100644 --- a/checkpoint-10/training_args.bin +++ b/checkpoint-10/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f0053334cab1bfd2838c507b240d05093bc205b1d5fbe6dc54a022fef5dcaa7 +oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e size 5112 diff --git a/checkpoint-100/README.md b/checkpoint-100/README.md index 2d1596ffe16e4d5bdcdf0e1d4322e6667af95962..d94fa0fb36d5aa76962cf3fda3ca0bfe4c2fa517 100644 --- a/checkpoint-100/README.md +++ b/checkpoint-100/README.md @@ -1,6 +1,6 @@ --- -library_name: peft base_model: TheBloke/Llama-2-7B-fp16 +library_name: peft --- # Model Card for Model ID diff --git a/checkpoint-100/adapter_config.json b/checkpoint-100/adapter_config.json index cbf93f2809e43fe18fd6ad23406293a68e7f5c98..072f1947edd0ba30c6174f22b28e07cf76e90ee3 100644 --- a/checkpoint-100/adapter_config.json +++ b/checkpoint-100/adapter_config.json @@ -20,10 +20,10 @@ "rank_pattern": {}, "revision": null, "target_modules": [ - "k_proj", + "q_proj", "v_proj", - "o_proj", - "q_proj" + "k_proj", + "o_proj" ], "task_type": "CAUSAL_LM", "use_dora": false, diff --git a/checkpoint-100/adapter_model.safetensors b/checkpoint-100/adapter_model.safetensors index 5474456dc9191482da68a63e25fc4b57741ddb66..f7bcbdd0fa735f91b9a9170ba28a7d6e5d9bbb1d 100644 --- a/checkpoint-100/adapter_model.safetensors +++ b/checkpoint-100/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac3b59826a91a331332b5850491ffec38f48afde058dead68205fb9903924aac +oid sha256:1d0047624e79262540578984e129869e2d6934ddd722dbc7dfc5f942e628b000 size 67143296 diff --git a/checkpoint-100/optimizer.pt b/checkpoint-100/optimizer.pt index ed26cd0b52ff12b00beef892c62747154ff73280..4802380930a8f60bf6b1c4df02a58d48598c9196 100644 --- a/checkpoint-100/optimizer.pt +++ b/checkpoint-100/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49ff3c81928bee92eb73c79e2f6088612cc35f2fab427d0f73ba21269e3c8085 +oid sha256:df94a2231d5455b24c3ce8db2477572330be1aa99e33c02b4ac96351f08c9fc8 size 134433530 diff --git a/checkpoint-100/scheduler.pt b/checkpoint-100/scheduler.pt index 4edc88d9af1e5877cdf4a912cba5fd5dd30760f3..ead5712211d0b25d6379fc8d96939bc766111808 100644 --- a/checkpoint-100/scheduler.pt +++ b/checkpoint-100/scheduler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a67071c9831c7625e547eac0c0538006ee7fe06d1b1052844fd1cdb5172b8b9f +oid sha256:127e093ac25e89499f96e10a77287e7041566fb667c0634628ee414b8d0443ea size 1064 diff --git a/checkpoint-100/trainer_state.json b/checkpoint-100/trainer_state.json index 7c2f7daa1437992d332458338001a59717ed5348..a626e073a3d50d962a6cc65f008002f6559af069 100644 --- a/checkpoint-100/trainer_state.json +++ b/checkpoint-100/trainer_state.json @@ -1,5 +1,5 @@ { - "best_metric": 1.2115424871444702, + "best_metric": 1.173593521118164, "best_model_checkpoint": "/kaggle/working/checkpoint-90", "epoch": 11.11111111111111, "eval_steps": 10, @@ -10,159 +10,159 @@ "log_history": [ { "epoch": 1.1111111111111112, - "grad_norm": 0.022282764315605164, - "learning_rate": 0.0001851851851851852, - "loss": 2.0424, + "grad_norm": 0.022457197308540344, + "learning_rate": 0.0001925925925925926, + "loss": 2.0406, "step": 10 }, { "epoch": 1.1111111111111112, - "eval_loss": 1.733155369758606, - "eval_runtime": 34.5543, - "eval_samples_per_second": 1.042, - "eval_steps_per_second": 0.145, + "eval_loss": 1.729261875152588, + "eval_runtime": 34.8953, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 10 }, { "epoch": 2.2222222222222223, - "grad_norm": 0.018981408327817917, - "learning_rate": 0.00017037037037037037, - "loss": 1.6072, + "grad_norm": 0.018787898123264313, + "learning_rate": 0.0001851851851851852, + "loss": 1.6016, "step": 20 }, { "epoch": 2.2222222222222223, - "eval_loss": 1.5428930521011353, - "eval_runtime": 34.6485, - "eval_samples_per_second": 1.039, - "eval_steps_per_second": 0.144, + "eval_loss": 1.5362553596496582, + "eval_runtime": 34.8752, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 20 }, { "epoch": 3.3333333333333335, - "grad_norm": 0.023157037794589996, - "learning_rate": 0.00015555555555555556, - "loss": 1.4025, + "grad_norm": 0.021070128306746483, + "learning_rate": 0.00017777777777777779, + "loss": 1.3937, "step": 30 }, { "epoch": 3.3333333333333335, - "eval_loss": 1.4176721572875977, - "eval_runtime": 34.5433, - "eval_samples_per_second": 1.042, - "eval_steps_per_second": 0.145, + "eval_loss": 1.4144253730773926, + "eval_runtime": 34.9429, + "eval_samples_per_second": 1.03, + "eval_steps_per_second": 0.143, "step": 30 }, { "epoch": 4.444444444444445, - "grad_norm": 0.021338749676942825, - "learning_rate": 0.00014074074074074076, - "loss": 1.285, + "grad_norm": 0.037991978228092194, + "learning_rate": 0.00017037037037037037, + "loss": 1.2721, "step": 40 }, { "epoch": 4.444444444444445, - "eval_loss": 1.3449772596359253, - "eval_runtime": 34.5594, - "eval_samples_per_second": 1.042, - "eval_steps_per_second": 0.145, + "eval_loss": 1.3360365629196167, + "eval_runtime": 34.8947, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 40 }, { "epoch": 5.555555555555555, - "grad_norm": 0.02489505708217621, - "learning_rate": 0.00012592592592592592, - "loss": 1.1687, + "grad_norm": 0.029117526486516, + "learning_rate": 0.00016296296296296295, + "loss": 1.1384, "step": 50 }, { "epoch": 5.555555555555555, - "eval_loss": 1.2951068878173828, - "eval_runtime": 34.5896, - "eval_samples_per_second": 1.041, - "eval_steps_per_second": 0.145, + "eval_loss": 1.2785382270812988, + "eval_runtime": 34.8447, + "eval_samples_per_second": 1.033, + "eval_steps_per_second": 0.143, "step": 50 }, { "epoch": 6.666666666666667, - "grad_norm": 0.028962766751646996, - "learning_rate": 0.00011111111111111112, - "loss": 1.0521, + "grad_norm": 0.0317281112074852, + "learning_rate": 0.00015555555555555556, + "loss": 1.0023, "step": 60 }, { "epoch": 6.666666666666667, - "eval_loss": 1.2674343585968018, - "eval_runtime": 34.5586, - "eval_samples_per_second": 1.042, - "eval_steps_per_second": 0.145, + "eval_loss": 1.2417998313903809, + "eval_runtime": 34.8141, + "eval_samples_per_second": 1.034, + "eval_steps_per_second": 0.144, "step": 60 }, { "epoch": 7.777777777777778, - "grad_norm": 0.033917125314474106, - "learning_rate": 9.62962962962963e-05, - "loss": 0.9885, + "grad_norm": 0.034914035350084305, + "learning_rate": 0.00014814814814814815, + "loss": 0.9166, "step": 70 }, { "epoch": 7.777777777777778, - "eval_loss": 1.2424466609954834, - "eval_runtime": 34.5412, - "eval_samples_per_second": 1.042, - "eval_steps_per_second": 0.145, + "eval_loss": 1.2166908979415894, + "eval_runtime": 34.8956, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 70 }, { "epoch": 8.88888888888889, - "grad_norm": 0.03393130004405975, - "learning_rate": 8.148148148148148e-05, - "loss": 0.8784, + "grad_norm": 0.04872061312198639, + "learning_rate": 0.00014074074074074076, + "loss": 0.7726, "step": 80 }, { "epoch": 8.88888888888889, - "eval_loss": 1.2252851724624634, - "eval_runtime": 34.58, - "eval_samples_per_second": 1.041, - "eval_steps_per_second": 0.145, + "eval_loss": 1.19890296459198, + "eval_runtime": 34.8433, + "eval_samples_per_second": 1.033, + "eval_steps_per_second": 0.143, "step": 80 }, { "epoch": 10.0, - "grad_norm": 0.04081139340996742, - "learning_rate": 6.666666666666667e-05, - "loss": 0.8154, + "grad_norm": 0.04901803284883499, + "learning_rate": 0.00013333333333333334, + "loss": 0.676, "step": 90 }, { "epoch": 10.0, - "eval_loss": 1.2115424871444702, - "eval_runtime": 34.5784, - "eval_samples_per_second": 1.041, - "eval_steps_per_second": 0.145, + "eval_loss": 1.173593521118164, + "eval_runtime": 34.7999, + "eval_samples_per_second": 1.034, + "eval_steps_per_second": 0.144, "step": 90 }, { "epoch": 11.11111111111111, - "grad_norm": 0.04114004969596863, - "learning_rate": 5.185185185185185e-05, - "loss": 0.7376, + "grad_norm": 0.055481575429439545, + "learning_rate": 0.00012592592592592592, + "loss": 0.56, "step": 100 }, { "epoch": 11.11111111111111, - "eval_loss": 1.2147088050842285, - "eval_runtime": 34.595, - "eval_samples_per_second": 1.041, - "eval_steps_per_second": 0.145, + "eval_loss": 1.2059063911437988, + "eval_runtime": 34.8432, + "eval_samples_per_second": 1.033, + "eval_steps_per_second": 0.143, "step": 100 } ], "logging_steps": 10, - "max_steps": 135, + "max_steps": 270, "num_input_tokens_seen": 0, - "num_train_epochs": 15, + "num_train_epochs": 30, "save_steps": 10, "stateful_callbacks": { "EarlyStoppingCallback": { diff --git a/checkpoint-100/training_args.bin b/checkpoint-100/training_args.bin index db8c5d32a4f54d0e9ab4e7a985fbaee5d6701ecc..89187d2cbb60b941ac70a59387377bf262ba730c 100644 --- a/checkpoint-100/training_args.bin +++ b/checkpoint-100/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:79c753ff1ba946038f620bad3e42a35ce583c9e8ed52b49fd22fb6614fea0f43 +oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e size 5112 diff --git a/checkpoint-110/README.md b/checkpoint-110/README.md index 2d1596ffe16e4d5bdcdf0e1d4322e6667af95962..d94fa0fb36d5aa76962cf3fda3ca0bfe4c2fa517 100644 --- a/checkpoint-110/README.md +++ b/checkpoint-110/README.md @@ -1,6 +1,6 @@ --- -library_name: peft base_model: TheBloke/Llama-2-7B-fp16 +library_name: peft --- # Model Card for Model ID diff --git a/checkpoint-110/adapter_config.json b/checkpoint-110/adapter_config.json index cbf93f2809e43fe18fd6ad23406293a68e7f5c98..072f1947edd0ba30c6174f22b28e07cf76e90ee3 100644 --- a/checkpoint-110/adapter_config.json +++ b/checkpoint-110/adapter_config.json @@ -20,10 +20,10 @@ "rank_pattern": {}, "revision": null, "target_modules": [ - "k_proj", + "q_proj", "v_proj", - "o_proj", - "q_proj" + "k_proj", + "o_proj" ], "task_type": "CAUSAL_LM", "use_dora": false, diff --git a/checkpoint-110/adapter_model.safetensors b/checkpoint-110/adapter_model.safetensors index 7f605b66a87175ab6628e4ed30bb65e58cfd30b5..f52c19d604f51615eb0a5bb2bb66f41242a9c4e3 100644 --- a/checkpoint-110/adapter_model.safetensors +++ b/checkpoint-110/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be7b95474c7a25c6961db7fe4913e88e0e78819b321a21a383b179782c22ef6c +oid sha256:144bb4cb915061effe137c30fcd2897134d3bad9790d3265733214a882cd96fa size 67143296 diff --git a/checkpoint-110/optimizer.pt b/checkpoint-110/optimizer.pt index 128e5885bb3a9ff0f3a9e5377049be1f310e384b..6213aabdfbc53fe82f9189b8cfd363d2e51279e1 100644 --- a/checkpoint-110/optimizer.pt +++ b/checkpoint-110/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:567bf2cdba466fb3da7301567fc6eee0fd77e99808ab9402911468a94017eb0a +oid sha256:8e75cb352b366f1f8dcb73f6cbcd9937088a39b25a9a826d16c8594e055eea58 size 134433530 diff --git a/checkpoint-110/scheduler.pt b/checkpoint-110/scheduler.pt index 70cd77658255ba7e2dd9a0fdcb8ac3766d0b1df3..ddf5bccd5085995cdc0a3329e726ce9a7e93f2ec 100644 --- a/checkpoint-110/scheduler.pt +++ b/checkpoint-110/scheduler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3c7403172deaf546b51410d54cab636a5c53264be17aa6a439e5934523944587 +oid sha256:2ed11929090acc3d040cabb379d312daf9924e0e46fa0a1c8884a63973944e92 size 1064 diff --git a/checkpoint-110/trainer_state.json b/checkpoint-110/trainer_state.json index 3cfcc9fcbe02f92fe34d4c23ddc019a88d9a5400..32c1e05c26cef871144324e00b642cae634c68ca 100644 --- a/checkpoint-110/trainer_state.json +++ b/checkpoint-110/trainer_state.json @@ -1,5 +1,5 @@ { - "best_metric": 1.2115424871444702, + "best_metric": 1.173593521118164, "best_model_checkpoint": "/kaggle/working/checkpoint-90", "epoch": 12.222222222222221, "eval_steps": 10, @@ -10,174 +10,174 @@ "log_history": [ { "epoch": 1.1111111111111112, - "grad_norm": 0.022282764315605164, - "learning_rate": 0.0001851851851851852, - "loss": 2.0424, + "grad_norm": 0.022457197308540344, + "learning_rate": 0.0001925925925925926, + "loss": 2.0406, "step": 10 }, { "epoch": 1.1111111111111112, - "eval_loss": 1.733155369758606, - "eval_runtime": 34.5543, - "eval_samples_per_second": 1.042, - "eval_steps_per_second": 0.145, + "eval_loss": 1.729261875152588, + "eval_runtime": 34.8953, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 10 }, { "epoch": 2.2222222222222223, - "grad_norm": 0.018981408327817917, - "learning_rate": 0.00017037037037037037, - "loss": 1.6072, + "grad_norm": 0.018787898123264313, + "learning_rate": 0.0001851851851851852, + "loss": 1.6016, "step": 20 }, { "epoch": 2.2222222222222223, - "eval_loss": 1.5428930521011353, - "eval_runtime": 34.6485, - "eval_samples_per_second": 1.039, - "eval_steps_per_second": 0.144, + "eval_loss": 1.5362553596496582, + "eval_runtime": 34.8752, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 20 }, { "epoch": 3.3333333333333335, - "grad_norm": 0.023157037794589996, - "learning_rate": 0.00015555555555555556, - "loss": 1.4025, + "grad_norm": 0.021070128306746483, + "learning_rate": 0.00017777777777777779, + "loss": 1.3937, "step": 30 }, { "epoch": 3.3333333333333335, - "eval_loss": 1.4176721572875977, - "eval_runtime": 34.5433, - "eval_samples_per_second": 1.042, - "eval_steps_per_second": 0.145, + "eval_loss": 1.4144253730773926, + "eval_runtime": 34.9429, + "eval_samples_per_second": 1.03, + "eval_steps_per_second": 0.143, "step": 30 }, { "epoch": 4.444444444444445, - "grad_norm": 0.021338749676942825, - "learning_rate": 0.00014074074074074076, - "loss": 1.285, + "grad_norm": 0.037991978228092194, + "learning_rate": 0.00017037037037037037, + "loss": 1.2721, "step": 40 }, { "epoch": 4.444444444444445, - "eval_loss": 1.3449772596359253, - "eval_runtime": 34.5594, - "eval_samples_per_second": 1.042, - "eval_steps_per_second": 0.145, + "eval_loss": 1.3360365629196167, + "eval_runtime": 34.8947, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 40 }, { "epoch": 5.555555555555555, - "grad_norm": 0.02489505708217621, - "learning_rate": 0.00012592592592592592, - "loss": 1.1687, + "grad_norm": 0.029117526486516, + "learning_rate": 0.00016296296296296295, + "loss": 1.1384, "step": 50 }, { "epoch": 5.555555555555555, - "eval_loss": 1.2951068878173828, - "eval_runtime": 34.5896, - "eval_samples_per_second": 1.041, - "eval_steps_per_second": 0.145, + "eval_loss": 1.2785382270812988, + "eval_runtime": 34.8447, + "eval_samples_per_second": 1.033, + "eval_steps_per_second": 0.143, "step": 50 }, { "epoch": 6.666666666666667, - "grad_norm": 0.028962766751646996, - "learning_rate": 0.00011111111111111112, - "loss": 1.0521, + "grad_norm": 0.0317281112074852, + "learning_rate": 0.00015555555555555556, + "loss": 1.0023, "step": 60 }, { "epoch": 6.666666666666667, - "eval_loss": 1.2674343585968018, - "eval_runtime": 34.5586, - "eval_samples_per_second": 1.042, - "eval_steps_per_second": 0.145, + "eval_loss": 1.2417998313903809, + "eval_runtime": 34.8141, + "eval_samples_per_second": 1.034, + "eval_steps_per_second": 0.144, "step": 60 }, { "epoch": 7.777777777777778, - "grad_norm": 0.033917125314474106, - "learning_rate": 9.62962962962963e-05, - "loss": 0.9885, + "grad_norm": 0.034914035350084305, + "learning_rate": 0.00014814814814814815, + "loss": 0.9166, "step": 70 }, { "epoch": 7.777777777777778, - "eval_loss": 1.2424466609954834, - "eval_runtime": 34.5412, - "eval_samples_per_second": 1.042, - "eval_steps_per_second": 0.145, + "eval_loss": 1.2166908979415894, + "eval_runtime": 34.8956, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 70 }, { "epoch": 8.88888888888889, - "grad_norm": 0.03393130004405975, - "learning_rate": 8.148148148148148e-05, - "loss": 0.8784, + "grad_norm": 0.04872061312198639, + "learning_rate": 0.00014074074074074076, + "loss": 0.7726, "step": 80 }, { "epoch": 8.88888888888889, - "eval_loss": 1.2252851724624634, - "eval_runtime": 34.58, - "eval_samples_per_second": 1.041, - "eval_steps_per_second": 0.145, + "eval_loss": 1.19890296459198, + "eval_runtime": 34.8433, + "eval_samples_per_second": 1.033, + "eval_steps_per_second": 0.143, "step": 80 }, { "epoch": 10.0, - "grad_norm": 0.04081139340996742, - "learning_rate": 6.666666666666667e-05, - "loss": 0.8154, + "grad_norm": 0.04901803284883499, + "learning_rate": 0.00013333333333333334, + "loss": 0.676, "step": 90 }, { "epoch": 10.0, - "eval_loss": 1.2115424871444702, - "eval_runtime": 34.5784, - "eval_samples_per_second": 1.041, - "eval_steps_per_second": 0.145, + "eval_loss": 1.173593521118164, + "eval_runtime": 34.7999, + "eval_samples_per_second": 1.034, + "eval_steps_per_second": 0.144, "step": 90 }, { "epoch": 11.11111111111111, - "grad_norm": 0.04114004969596863, - "learning_rate": 5.185185185185185e-05, - "loss": 0.7376, + "grad_norm": 0.055481575429439545, + "learning_rate": 0.00012592592592592592, + "loss": 0.56, "step": 100 }, { "epoch": 11.11111111111111, - "eval_loss": 1.2147088050842285, - "eval_runtime": 34.595, - "eval_samples_per_second": 1.041, - "eval_steps_per_second": 0.145, + "eval_loss": 1.2059063911437988, + "eval_runtime": 34.8432, + "eval_samples_per_second": 1.033, + "eval_steps_per_second": 0.143, "step": 100 }, { "epoch": 12.222222222222221, - "grad_norm": 0.04217207431793213, - "learning_rate": 3.7037037037037037e-05, - "loss": 0.6642, + "grad_norm": 0.0524757020175457, + "learning_rate": 0.00011851851851851852, + "loss": 0.4567, "step": 110 }, { "epoch": 12.222222222222221, - "eval_loss": 1.2141155004501343, - "eval_runtime": 34.6053, - "eval_samples_per_second": 1.04, + "eval_loss": 1.2077444791793823, + "eval_runtime": 34.7989, + "eval_samples_per_second": 1.035, "eval_steps_per_second": 0.144, "step": 110 } ], "logging_steps": 10, - "max_steps": 135, + "max_steps": 270, "num_input_tokens_seen": 0, - "num_train_epochs": 15, + "num_train_epochs": 30, "save_steps": 10, "stateful_callbacks": { "EarlyStoppingCallback": { diff --git a/checkpoint-110/training_args.bin b/checkpoint-110/training_args.bin index db8c5d32a4f54d0e9ab4e7a985fbaee5d6701ecc..89187d2cbb60b941ac70a59387377bf262ba730c 100644 --- a/checkpoint-110/training_args.bin +++ b/checkpoint-110/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:79c753ff1ba946038f620bad3e42a35ce583c9e8ed52b49fd22fb6614fea0f43 +oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e size 5112 diff --git a/checkpoint-120/README.md b/checkpoint-120/README.md index 2d1596ffe16e4d5bdcdf0e1d4322e6667af95962..d94fa0fb36d5aa76962cf3fda3ca0bfe4c2fa517 100644 --- a/checkpoint-120/README.md +++ b/checkpoint-120/README.md @@ -1,6 +1,6 @@ --- -library_name: peft base_model: TheBloke/Llama-2-7B-fp16 +library_name: peft --- # Model Card for Model ID diff --git a/checkpoint-120/adapter_config.json b/checkpoint-120/adapter_config.json index cbf93f2809e43fe18fd6ad23406293a68e7f5c98..072f1947edd0ba30c6174f22b28e07cf76e90ee3 100644 --- a/checkpoint-120/adapter_config.json +++ b/checkpoint-120/adapter_config.json @@ -20,10 +20,10 @@ "rank_pattern": {}, "revision": null, "target_modules": [ - "k_proj", + "q_proj", "v_proj", - "o_proj", - "q_proj" + "k_proj", + "o_proj" ], "task_type": "CAUSAL_LM", "use_dora": false, diff --git a/checkpoint-120/adapter_model.safetensors b/checkpoint-120/adapter_model.safetensors index b0331d3326f9fe8e6101bed734cda4120704beb5..e99918ef15b52428a0521a090d402314214bf65a 100644 --- a/checkpoint-120/adapter_model.safetensors +++ b/checkpoint-120/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb1690155a5e4a4452a0e292d686003b08ac457e37c20099c948aae55ca8e453 +oid sha256:173890a91bffdb964ce5f909803d70349ae54ba2275b81eaf33d7e10b02d2a18 size 67143296 diff --git a/checkpoint-120/optimizer.pt b/checkpoint-120/optimizer.pt index 92ef0684e214370f01a6b0138c929044159067b4..bec1477ece9212a5964f204a5f7a62a2b4e4e2cc 100644 --- a/checkpoint-120/optimizer.pt +++ b/checkpoint-120/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14cc791678f8ce58b5f8b40f05113973aa7f94325c356b74d4df44ff8c1a956a +oid sha256:50d2389c285ac068205e38a94cd027c8b55b17736442e923e49875d92296c9dd size 134433530 diff --git a/checkpoint-120/scheduler.pt b/checkpoint-120/scheduler.pt index 1d230859237cac17ad8b06f8e53289cfacddd780..080b81898403330d87e51aba2ecc6d8d29004486 100644 --- a/checkpoint-120/scheduler.pt +++ b/checkpoint-120/scheduler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f7846da062f26e398f1065295f9c7cbaf4768d3aa6b5518863ce89b7eb9d328e +oid sha256:60da1fb7525eb9f93843e0f6cf6e45c012533f0f97597344050ff835287f782f size 1064 diff --git a/checkpoint-120/trainer_state.json b/checkpoint-120/trainer_state.json index 1d0d62df39a8ef027f9eb6cb16e365592f41f1b1..5a356db11742fef8ce1d231410c531870e226d13 100644 --- a/checkpoint-120/trainer_state.json +++ b/checkpoint-120/trainer_state.json @@ -1,6 +1,6 @@ { - "best_metric": 1.20501708984375, - "best_model_checkpoint": "/kaggle/working/checkpoint-120", + "best_metric": 1.173593521118164, + "best_model_checkpoint": "/kaggle/working/checkpoint-90", "epoch": 13.333333333333334, "eval_steps": 10, "global_step": 120, @@ -10,189 +10,189 @@ "log_history": [ { "epoch": 1.1111111111111112, - "grad_norm": 0.022282764315605164, - "learning_rate": 0.0001851851851851852, - "loss": 2.0424, + "grad_norm": 0.022457197308540344, + "learning_rate": 0.0001925925925925926, + "loss": 2.0406, "step": 10 }, { "epoch": 1.1111111111111112, - "eval_loss": 1.733155369758606, - "eval_runtime": 34.5543, - "eval_samples_per_second": 1.042, - "eval_steps_per_second": 0.145, + "eval_loss": 1.729261875152588, + "eval_runtime": 34.8953, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 10 }, { "epoch": 2.2222222222222223, - "grad_norm": 0.018981408327817917, - "learning_rate": 0.00017037037037037037, - "loss": 1.6072, + "grad_norm": 0.018787898123264313, + "learning_rate": 0.0001851851851851852, + "loss": 1.6016, "step": 20 }, { "epoch": 2.2222222222222223, - "eval_loss": 1.5428930521011353, - "eval_runtime": 34.6485, - "eval_samples_per_second": 1.039, - "eval_steps_per_second": 0.144, + "eval_loss": 1.5362553596496582, + "eval_runtime": 34.8752, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 20 }, { "epoch": 3.3333333333333335, - "grad_norm": 0.023157037794589996, - "learning_rate": 0.00015555555555555556, - "loss": 1.4025, + "grad_norm": 0.021070128306746483, + "learning_rate": 0.00017777777777777779, + "loss": 1.3937, "step": 30 }, { "epoch": 3.3333333333333335, - "eval_loss": 1.4176721572875977, - "eval_runtime": 34.5433, - "eval_samples_per_second": 1.042, - "eval_steps_per_second": 0.145, + "eval_loss": 1.4144253730773926, + "eval_runtime": 34.9429, + "eval_samples_per_second": 1.03, + "eval_steps_per_second": 0.143, "step": 30 }, { "epoch": 4.444444444444445, - "grad_norm": 0.021338749676942825, - "learning_rate": 0.00014074074074074076, - "loss": 1.285, + "grad_norm": 0.037991978228092194, + "learning_rate": 0.00017037037037037037, + "loss": 1.2721, "step": 40 }, { "epoch": 4.444444444444445, - "eval_loss": 1.3449772596359253, - "eval_runtime": 34.5594, - "eval_samples_per_second": 1.042, - "eval_steps_per_second": 0.145, + "eval_loss": 1.3360365629196167, + "eval_runtime": 34.8947, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 40 }, { "epoch": 5.555555555555555, - "grad_norm": 0.02489505708217621, - "learning_rate": 0.00012592592592592592, - "loss": 1.1687, + "grad_norm": 0.029117526486516, + "learning_rate": 0.00016296296296296295, + "loss": 1.1384, "step": 50 }, { "epoch": 5.555555555555555, - "eval_loss": 1.2951068878173828, - "eval_runtime": 34.5896, - "eval_samples_per_second": 1.041, - "eval_steps_per_second": 0.145, + "eval_loss": 1.2785382270812988, + "eval_runtime": 34.8447, + "eval_samples_per_second": 1.033, + "eval_steps_per_second": 0.143, "step": 50 }, { "epoch": 6.666666666666667, - "grad_norm": 0.028962766751646996, - "learning_rate": 0.00011111111111111112, - "loss": 1.0521, + "grad_norm": 0.0317281112074852, + "learning_rate": 0.00015555555555555556, + "loss": 1.0023, "step": 60 }, { "epoch": 6.666666666666667, - "eval_loss": 1.2674343585968018, - "eval_runtime": 34.5586, - "eval_samples_per_second": 1.042, - "eval_steps_per_second": 0.145, + "eval_loss": 1.2417998313903809, + "eval_runtime": 34.8141, + "eval_samples_per_second": 1.034, + "eval_steps_per_second": 0.144, "step": 60 }, { "epoch": 7.777777777777778, - "grad_norm": 0.033917125314474106, - "learning_rate": 9.62962962962963e-05, - "loss": 0.9885, + "grad_norm": 0.034914035350084305, + "learning_rate": 0.00014814814814814815, + "loss": 0.9166, "step": 70 }, { "epoch": 7.777777777777778, - "eval_loss": 1.2424466609954834, - "eval_runtime": 34.5412, - "eval_samples_per_second": 1.042, - "eval_steps_per_second": 0.145, + "eval_loss": 1.2166908979415894, + "eval_runtime": 34.8956, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 70 }, { "epoch": 8.88888888888889, - "grad_norm": 0.03393130004405975, - "learning_rate": 8.148148148148148e-05, - "loss": 0.8784, + "grad_norm": 0.04872061312198639, + "learning_rate": 0.00014074074074074076, + "loss": 0.7726, "step": 80 }, { "epoch": 8.88888888888889, - "eval_loss": 1.2252851724624634, - "eval_runtime": 34.58, - "eval_samples_per_second": 1.041, - "eval_steps_per_second": 0.145, + "eval_loss": 1.19890296459198, + "eval_runtime": 34.8433, + "eval_samples_per_second": 1.033, + "eval_steps_per_second": 0.143, "step": 80 }, { "epoch": 10.0, - "grad_norm": 0.04081139340996742, - "learning_rate": 6.666666666666667e-05, - "loss": 0.8154, + "grad_norm": 0.04901803284883499, + "learning_rate": 0.00013333333333333334, + "loss": 0.676, "step": 90 }, { "epoch": 10.0, - "eval_loss": 1.2115424871444702, - "eval_runtime": 34.5784, - "eval_samples_per_second": 1.041, - "eval_steps_per_second": 0.145, + "eval_loss": 1.173593521118164, + "eval_runtime": 34.7999, + "eval_samples_per_second": 1.034, + "eval_steps_per_second": 0.144, "step": 90 }, { "epoch": 11.11111111111111, - "grad_norm": 0.04114004969596863, - "learning_rate": 5.185185185185185e-05, - "loss": 0.7376, + "grad_norm": 0.055481575429439545, + "learning_rate": 0.00012592592592592592, + "loss": 0.56, "step": 100 }, { "epoch": 11.11111111111111, - "eval_loss": 1.2147088050842285, - "eval_runtime": 34.595, - "eval_samples_per_second": 1.041, - "eval_steps_per_second": 0.145, + "eval_loss": 1.2059063911437988, + "eval_runtime": 34.8432, + "eval_samples_per_second": 1.033, + "eval_steps_per_second": 0.143, "step": 100 }, { "epoch": 12.222222222222221, - "grad_norm": 0.04217207431793213, - "learning_rate": 3.7037037037037037e-05, - "loss": 0.6642, + "grad_norm": 0.0524757020175457, + "learning_rate": 0.00011851851851851852, + "loss": 0.4567, "step": 110 }, { "epoch": 12.222222222222221, - "eval_loss": 1.2141155004501343, - "eval_runtime": 34.6053, - "eval_samples_per_second": 1.04, + "eval_loss": 1.2077444791793823, + "eval_runtime": 34.7989, + "eval_samples_per_second": 1.035, "eval_steps_per_second": 0.144, "step": 110 }, { "epoch": 13.333333333333334, - "grad_norm": 0.04223904013633728, - "learning_rate": 2.2222222222222223e-05, - "loss": 0.6353, + "grad_norm": 0.053020887076854706, + "learning_rate": 0.00011111111111111112, + "loss": 0.3915, "step": 120 }, { "epoch": 13.333333333333334, - "eval_loss": 1.20501708984375, - "eval_runtime": 34.6447, - "eval_samples_per_second": 1.039, + "eval_loss": 1.2036480903625488, + "eval_runtime": 34.802, + "eval_samples_per_second": 1.034, "eval_steps_per_second": 0.144, "step": 120 } ], "logging_steps": 10, - "max_steps": 135, + "max_steps": 270, "num_input_tokens_seen": 0, - "num_train_epochs": 15, + "num_train_epochs": 30, "save_steps": 10, "stateful_callbacks": { "EarlyStoppingCallback": { @@ -210,7 +210,7 @@ "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": false + "should_training_stop": true }, "attributes": {} } diff --git a/checkpoint-120/training_args.bin b/checkpoint-120/training_args.bin index db8c5d32a4f54d0e9ab4e7a985fbaee5d6701ecc..89187d2cbb60b941ac70a59387377bf262ba730c 100644 --- a/checkpoint-120/training_args.bin +++ b/checkpoint-120/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:79c753ff1ba946038f620bad3e42a35ce583c9e8ed52b49fd22fb6614fea0f43 +oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e size 5112 diff --git a/checkpoint-20/adapter_config.json b/checkpoint-20/adapter_config.json index 539f4c41b2550fc30b9c2d0726f51adfa8e4b1e5..072f1947edd0ba30c6174f22b28e07cf76e90ee3 100644 --- a/checkpoint-20/adapter_config.json +++ b/checkpoint-20/adapter_config.json @@ -20,10 +20,10 @@ "rank_pattern": {}, "revision": null, "target_modules": [ - "k_proj", "q_proj", - "o_proj", - "v_proj" + "v_proj", + "k_proj", + "o_proj" ], "task_type": "CAUSAL_LM", "use_dora": false, diff --git a/checkpoint-20/adapter_model.safetensors b/checkpoint-20/adapter_model.safetensors index 13aa561884e1ba48ec5d29e6a7bed5a31855eed2..9c6b58a799bedae716e487e094796c0fc831c9ca 100644 --- a/checkpoint-20/adapter_model.safetensors +++ b/checkpoint-20/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5ab93af4ed23c52b82729ff3b3f871c19b732c90f1094f90d5a9f4ade1ccfac +oid sha256:e350a043d6e188be3930ae109597f7418b1c57332d8722c377acf61b839280db size 67143296 diff --git a/checkpoint-20/optimizer.pt b/checkpoint-20/optimizer.pt index 0c81c05ac0b1ac814e5ec818a8b5f47ab91bcf72..c35f74b8d68706df39db85f70fc1f6015f3fb297 100644 --- a/checkpoint-20/optimizer.pt +++ b/checkpoint-20/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a07f3020a10b2a9d3e215c9651b159e8c3b297ab1db69b013b8c7817d5f52a7c +oid sha256:53aaa85798d6640d73ad6607ea99ffb3eee0b87eb9130f5f653e9d52f119e393 size 134433530 diff --git a/checkpoint-20/scheduler.pt b/checkpoint-20/scheduler.pt index 2c4b1ba5f9c4137961a8e1182dbd5a6a94845c4c..afc0a3892b54f4a3e3431867996dea7bcb0b1195 100644 --- a/checkpoint-20/scheduler.pt +++ b/checkpoint-20/scheduler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f9e7e75183c7081ca7f8f52ddfd0d5f4b8e8dbcf7f7bcd495fc6e0cfff80e3a2 +oid sha256:1c05a2d1065158cc7891c3a0806de9d2368277087e1ac23c872a14cc5ce6a082 size 1064 diff --git a/checkpoint-20/trainer_state.json b/checkpoint-20/trainer_state.json index 27994450f65cba3cf56c1c4c543734709d21042d..001230837638c4ee638259b81f238a37095de756 100644 --- a/checkpoint-20/trainer_state.json +++ b/checkpoint-20/trainer_state.json @@ -1,5 +1,5 @@ { - "best_metric": 1.5489343404769897, + "best_metric": 1.5362553596496582, "best_model_checkpoint": "/kaggle/working/checkpoint-20", "epoch": 2.2222222222222223, "eval_steps": 10, @@ -10,39 +10,39 @@ "log_history": [ { "epoch": 1.1111111111111112, - "grad_norm": 0.02217627689242363, - "learning_rate": 0.00017777777777777779, - "loss": 2.0442, + "grad_norm": 0.022457197308540344, + "learning_rate": 0.0001925925925925926, + "loss": 2.0406, "step": 10 }, { "epoch": 1.1111111111111112, - "eval_loss": 1.737181544303894, - "eval_runtime": 35.1318, - "eval_samples_per_second": 1.025, - "eval_steps_per_second": 0.142, + "eval_loss": 1.729261875152588, + "eval_runtime": 34.8953, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 10 }, { "epoch": 2.2222222222222223, - "grad_norm": 0.0346713550388813, - "learning_rate": 0.00015555555555555556, - "loss": 1.6131, + "grad_norm": 0.018787898123264313, + "learning_rate": 0.0001851851851851852, + "loss": 1.6016, "step": 20 }, { "epoch": 2.2222222222222223, - "eval_loss": 1.5489343404769897, - "eval_runtime": 34.8402, - "eval_samples_per_second": 1.033, - "eval_steps_per_second": 0.144, + "eval_loss": 1.5362553596496582, + "eval_runtime": 34.8752, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 20 } ], "logging_steps": 10, - "max_steps": 90, + "max_steps": 270, "num_input_tokens_seen": 0, - "num_train_epochs": 10, + "num_train_epochs": 30, "save_steps": 10, "stateful_callbacks": { "EarlyStoppingCallback": { diff --git a/checkpoint-20/training_args.bin b/checkpoint-20/training_args.bin index 992e364d4b54f32a399ec3cd5f5f54c212ea0588..89187d2cbb60b941ac70a59387377bf262ba730c 100644 --- a/checkpoint-20/training_args.bin +++ b/checkpoint-20/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f0053334cab1bfd2838c507b240d05093bc205b1d5fbe6dc54a022fef5dcaa7 +oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e size 5112 diff --git a/checkpoint-30/adapter_config.json b/checkpoint-30/adapter_config.json index 539f4c41b2550fc30b9c2d0726f51adfa8e4b1e5..072f1947edd0ba30c6174f22b28e07cf76e90ee3 100644 --- a/checkpoint-30/adapter_config.json +++ b/checkpoint-30/adapter_config.json @@ -20,10 +20,10 @@ "rank_pattern": {}, "revision": null, "target_modules": [ - "k_proj", "q_proj", - "o_proj", - "v_proj" + "v_proj", + "k_proj", + "o_proj" ], "task_type": "CAUSAL_LM", "use_dora": false, diff --git a/checkpoint-30/adapter_model.safetensors b/checkpoint-30/adapter_model.safetensors index c985a80922131720df8cf9d17f8535245bba0c81..366b39d84b7090fd536abae8cb03373cdeec7974 100644 --- a/checkpoint-30/adapter_model.safetensors +++ b/checkpoint-30/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8576250f42c32085cdb174e306461292b115ea33d910d0a59d062fcad935bf0 +oid sha256:e5afb85b43418de4f387e019cd5ff83db304f24c87600e2deb9b497bc225833e size 67143296 diff --git a/checkpoint-30/optimizer.pt b/checkpoint-30/optimizer.pt index cfc1988c5683d4e48dbf08fb68442468c0e29d45..9deebbe1018e0ac90a144716c45a1bef9258e324 100644 --- a/checkpoint-30/optimizer.pt +++ b/checkpoint-30/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:16613c572dcb0ccca606ca4a382a4476b3f69ed3cf64a7095e7f852e897c8426 +oid sha256:d9d88f9eafcc9de7708775af55f4954ceecfc02ad0285772a7592b7f07336a6c size 134433530 diff --git a/checkpoint-30/scheduler.pt b/checkpoint-30/scheduler.pt index b5d0f2ca2c7bb5261cdc0c82aa8b49d242267e6b..96349016d5a1ec1bba5b53b16acc868a080b540e 100644 --- a/checkpoint-30/scheduler.pt +++ b/checkpoint-30/scheduler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14d970dabadfb95eaf7812b80cb7816a58d7911bb09df450b100b1c052b74a02 +oid sha256:78882a59797d983394328068e814e7aad08e194b72ebc7003618cfb9ff129ecf size 1064 diff --git a/checkpoint-30/trainer_state.json b/checkpoint-30/trainer_state.json index 3d77f2dc0ab9fea959fe76ba758b7780fdd26130..ef7de086cdd9e61c88f0a0e840248fb2b20bd4f5 100644 --- a/checkpoint-30/trainer_state.json +++ b/checkpoint-30/trainer_state.json @@ -1,5 +1,5 @@ { - "best_metric": 1.4295110702514648, + "best_metric": 1.4144253730773926, "best_model_checkpoint": "/kaggle/working/checkpoint-30", "epoch": 3.3333333333333335, "eval_steps": 10, @@ -10,54 +10,54 @@ "log_history": [ { "epoch": 1.1111111111111112, - "grad_norm": 0.02217627689242363, - "learning_rate": 0.00017777777777777779, - "loss": 2.0442, + "grad_norm": 0.022457197308540344, + "learning_rate": 0.0001925925925925926, + "loss": 2.0406, "step": 10 }, { "epoch": 1.1111111111111112, - "eval_loss": 1.737181544303894, - "eval_runtime": 35.1318, - "eval_samples_per_second": 1.025, - "eval_steps_per_second": 0.142, + "eval_loss": 1.729261875152588, + "eval_runtime": 34.8953, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 10 }, { "epoch": 2.2222222222222223, - "grad_norm": 0.0346713550388813, - "learning_rate": 0.00015555555555555556, - "loss": 1.6131, + "grad_norm": 0.018787898123264313, + "learning_rate": 0.0001851851851851852, + "loss": 1.6016, "step": 20 }, { "epoch": 2.2222222222222223, - "eval_loss": 1.5489343404769897, - "eval_runtime": 34.8402, - "eval_samples_per_second": 1.033, - "eval_steps_per_second": 0.144, + "eval_loss": 1.5362553596496582, + "eval_runtime": 34.8752, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 20 }, { "epoch": 3.3333333333333335, - "grad_norm": 0.02501535415649414, - "learning_rate": 0.00013333333333333334, - "loss": 1.4152, + "grad_norm": 0.021070128306746483, + "learning_rate": 0.00017777777777777779, + "loss": 1.3937, "step": 30 }, { "epoch": 3.3333333333333335, - "eval_loss": 1.4295110702514648, - "eval_runtime": 34.8537, - "eval_samples_per_second": 1.033, + "eval_loss": 1.4144253730773926, + "eval_runtime": 34.9429, + "eval_samples_per_second": 1.03, "eval_steps_per_second": 0.143, "step": 30 } ], "logging_steps": 10, - "max_steps": 90, + "max_steps": 270, "num_input_tokens_seen": 0, - "num_train_epochs": 10, + "num_train_epochs": 30, "save_steps": 10, "stateful_callbacks": { "EarlyStoppingCallback": { diff --git a/checkpoint-30/training_args.bin b/checkpoint-30/training_args.bin index 992e364d4b54f32a399ec3cd5f5f54c212ea0588..89187d2cbb60b941ac70a59387377bf262ba730c 100644 --- a/checkpoint-30/training_args.bin +++ b/checkpoint-30/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f0053334cab1bfd2838c507b240d05093bc205b1d5fbe6dc54a022fef5dcaa7 +oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e size 5112 diff --git a/checkpoint-40/adapter_config.json b/checkpoint-40/adapter_config.json index 539f4c41b2550fc30b9c2d0726f51adfa8e4b1e5..072f1947edd0ba30c6174f22b28e07cf76e90ee3 100644 --- a/checkpoint-40/adapter_config.json +++ b/checkpoint-40/adapter_config.json @@ -20,10 +20,10 @@ "rank_pattern": {}, "revision": null, "target_modules": [ - "k_proj", "q_proj", - "o_proj", - "v_proj" + "v_proj", + "k_proj", + "o_proj" ], "task_type": "CAUSAL_LM", "use_dora": false, diff --git a/checkpoint-40/adapter_model.safetensors b/checkpoint-40/adapter_model.safetensors index 0e9c44306157ba76d1b0525594c780969a0a3f98..33c4b6ca5dc2bcc844033c0a5525ca7676f39f45 100644 --- a/checkpoint-40/adapter_model.safetensors +++ b/checkpoint-40/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab38361a67b61947cafd5230ca79626082a1d26b72f5440faf199b3216bc6704 +oid sha256:7c9e0b673563a407b5292005048d7f9e55e28f761356c6b7d865a6f14dbd4d1f size 67143296 diff --git a/checkpoint-40/optimizer.pt b/checkpoint-40/optimizer.pt index a7c2c9e810a2f0a8093fc2aa51e73b47a0a9abcd..ba00192605278692ee89995d9f2f4a2c729e45c3 100644 --- a/checkpoint-40/optimizer.pt +++ b/checkpoint-40/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a38dd3cb56490e5a9b4d6a05ea97f3a761cd71841c3d9f7f129c1e4c0b4730f +oid sha256:7af07c81b6d2394145239563c1fcabf8a96f4c073bcbe06adbe0e38de3e745d4 size 134433530 diff --git a/checkpoint-40/scheduler.pt b/checkpoint-40/scheduler.pt index 43ed4ecc37d6e4e0738d211af3327e4424a756fc..12d828f33e6644a4712f57e43bb019f8f3cd67a0 100644 --- a/checkpoint-40/scheduler.pt +++ b/checkpoint-40/scheduler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc5423f1af1182c2163f569e8f44b9ee18e1849c11acaaa76a185745ad274c02 +oid sha256:441aaa824ef89ae7e5156933dc6dbe413f7c295a974e1ca6e7641ce94bd233fa size 1064 diff --git a/checkpoint-40/trainer_state.json b/checkpoint-40/trainer_state.json index c9bd3601e7274d6268976282b0de89dda2b0b807..ce3dac106926a6abc0978a769605721b01f3385a 100644 --- a/checkpoint-40/trainer_state.json +++ b/checkpoint-40/trainer_state.json @@ -1,5 +1,5 @@ { - "best_metric": 1.3598744869232178, + "best_metric": 1.3360365629196167, "best_model_checkpoint": "/kaggle/working/checkpoint-40", "epoch": 4.444444444444445, "eval_steps": 10, @@ -10,69 +10,69 @@ "log_history": [ { "epoch": 1.1111111111111112, - "grad_norm": 0.02217627689242363, - "learning_rate": 0.00017777777777777779, - "loss": 2.0442, + "grad_norm": 0.022457197308540344, + "learning_rate": 0.0001925925925925926, + "loss": 2.0406, "step": 10 }, { "epoch": 1.1111111111111112, - "eval_loss": 1.737181544303894, - "eval_runtime": 35.1318, - "eval_samples_per_second": 1.025, - "eval_steps_per_second": 0.142, + "eval_loss": 1.729261875152588, + "eval_runtime": 34.8953, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 10 }, { "epoch": 2.2222222222222223, - "grad_norm": 0.0346713550388813, - "learning_rate": 0.00015555555555555556, - "loss": 1.6131, + "grad_norm": 0.018787898123264313, + "learning_rate": 0.0001851851851851852, + "loss": 1.6016, "step": 20 }, { "epoch": 2.2222222222222223, - "eval_loss": 1.5489343404769897, - "eval_runtime": 34.8402, - "eval_samples_per_second": 1.033, - "eval_steps_per_second": 0.144, + "eval_loss": 1.5362553596496582, + "eval_runtime": 34.8752, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 20 }, { "epoch": 3.3333333333333335, - "grad_norm": 0.02501535415649414, - "learning_rate": 0.00013333333333333334, - "loss": 1.4152, + "grad_norm": 0.021070128306746483, + "learning_rate": 0.00017777777777777779, + "loss": 1.3937, "step": 30 }, { "epoch": 3.3333333333333335, - "eval_loss": 1.4295110702514648, - "eval_runtime": 34.8537, - "eval_samples_per_second": 1.033, + "eval_loss": 1.4144253730773926, + "eval_runtime": 34.9429, + "eval_samples_per_second": 1.03, "eval_steps_per_second": 0.143, "step": 30 }, { "epoch": 4.444444444444445, - "grad_norm": 0.02104916237294674, - "learning_rate": 0.00011111111111111112, - "loss": 1.3068, + "grad_norm": 0.037991978228092194, + "learning_rate": 0.00017037037037037037, + "loss": 1.2721, "step": 40 }, { "epoch": 4.444444444444445, - "eval_loss": 1.3598744869232178, - "eval_runtime": 35.0281, - "eval_samples_per_second": 1.028, + "eval_loss": 1.3360365629196167, + "eval_runtime": 34.8947, + "eval_samples_per_second": 1.032, "eval_steps_per_second": 0.143, "step": 40 } ], "logging_steps": 10, - "max_steps": 90, + "max_steps": 270, "num_input_tokens_seen": 0, - "num_train_epochs": 10, + "num_train_epochs": 30, "save_steps": 10, "stateful_callbacks": { "EarlyStoppingCallback": { diff --git a/checkpoint-40/training_args.bin b/checkpoint-40/training_args.bin index 992e364d4b54f32a399ec3cd5f5f54c212ea0588..89187d2cbb60b941ac70a59387377bf262ba730c 100644 --- a/checkpoint-40/training_args.bin +++ b/checkpoint-40/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f0053334cab1bfd2838c507b240d05093bc205b1d5fbe6dc54a022fef5dcaa7 +oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e size 5112 diff --git a/checkpoint-50/adapter_config.json b/checkpoint-50/adapter_config.json index 539f4c41b2550fc30b9c2d0726f51adfa8e4b1e5..072f1947edd0ba30c6174f22b28e07cf76e90ee3 100644 --- a/checkpoint-50/adapter_config.json +++ b/checkpoint-50/adapter_config.json @@ -20,10 +20,10 @@ "rank_pattern": {}, "revision": null, "target_modules": [ - "k_proj", "q_proj", - "o_proj", - "v_proj" + "v_proj", + "k_proj", + "o_proj" ], "task_type": "CAUSAL_LM", "use_dora": false, diff --git a/checkpoint-50/adapter_model.safetensors b/checkpoint-50/adapter_model.safetensors index c090dac91ca150e0ea0a5153fcd6d57765075303..448b6afba60ca21d47616f70707def8ebc8691aa 100644 --- a/checkpoint-50/adapter_model.safetensors +++ b/checkpoint-50/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b692c5f45a06d84947aef0a222d424aecd480e40aabcd9ca87aa5d3007aa46e8 +oid sha256:ff24760e35eba65f38905cbc2d2b23ce73feb2da6a3bb98fa083cbd3cc564571 size 67143296 diff --git a/checkpoint-50/optimizer.pt b/checkpoint-50/optimizer.pt index a30c9c35dca58e15eaf0c8940bf1926a9cac9d4d..3b7e3026e942280d369b71c2147c37f4f8fa7095 100644 --- a/checkpoint-50/optimizer.pt +++ b/checkpoint-50/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:44ab022dad22b0f149a3b1fb04e9cd79842aad48780ac055c542631a6fc57822 +oid sha256:98ab5b6b7c2997d594b9c7e48d2cd958c58dcdf4eaf60e9a5fb6497764869314 size 134433530 diff --git a/checkpoint-50/scheduler.pt b/checkpoint-50/scheduler.pt index 5183cac3f3e1237799f57507b58ef408fc83ade4..50f4a86a43241a0d9a9878a24cb2310e92f09b2e 100644 --- a/checkpoint-50/scheduler.pt +++ b/checkpoint-50/scheduler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a9425a09cb4fd41e0b7c88529bcda485c5bb777b677ec7982ea20ad9edbd69fc +oid sha256:e372f1180c8c47cb336f1f5f03bef4a1419df4668abc5f3dccb4cafe80635a5a size 1064 diff --git a/checkpoint-50/trainer_state.json b/checkpoint-50/trainer_state.json index 9e26cad60437a7a2d14835768354e565267f6744..e73eff9076191a26d04f015d9fab34ed8e93ae16 100644 --- a/checkpoint-50/trainer_state.json +++ b/checkpoint-50/trainer_state.json @@ -1,5 +1,5 @@ { - "best_metric": 1.3168741464614868, + "best_metric": 1.2785382270812988, "best_model_checkpoint": "/kaggle/working/checkpoint-50", "epoch": 5.555555555555555, "eval_steps": 10, @@ -10,84 +10,84 @@ "log_history": [ { "epoch": 1.1111111111111112, - "grad_norm": 0.02217627689242363, - "learning_rate": 0.00017777777777777779, - "loss": 2.0442, + "grad_norm": 0.022457197308540344, + "learning_rate": 0.0001925925925925926, + "loss": 2.0406, "step": 10 }, { "epoch": 1.1111111111111112, - "eval_loss": 1.737181544303894, - "eval_runtime": 35.1318, - "eval_samples_per_second": 1.025, - "eval_steps_per_second": 0.142, + "eval_loss": 1.729261875152588, + "eval_runtime": 34.8953, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 10 }, { "epoch": 2.2222222222222223, - "grad_norm": 0.0346713550388813, - "learning_rate": 0.00015555555555555556, - "loss": 1.6131, + "grad_norm": 0.018787898123264313, + "learning_rate": 0.0001851851851851852, + "loss": 1.6016, "step": 20 }, { "epoch": 2.2222222222222223, - "eval_loss": 1.5489343404769897, - "eval_runtime": 34.8402, - "eval_samples_per_second": 1.033, - "eval_steps_per_second": 0.144, + "eval_loss": 1.5362553596496582, + "eval_runtime": 34.8752, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 20 }, { "epoch": 3.3333333333333335, - "grad_norm": 0.02501535415649414, - "learning_rate": 0.00013333333333333334, - "loss": 1.4152, + "grad_norm": 0.021070128306746483, + "learning_rate": 0.00017777777777777779, + "loss": 1.3937, "step": 30 }, { "epoch": 3.3333333333333335, - "eval_loss": 1.4295110702514648, - "eval_runtime": 34.8537, - "eval_samples_per_second": 1.033, + "eval_loss": 1.4144253730773926, + "eval_runtime": 34.9429, + "eval_samples_per_second": 1.03, "eval_steps_per_second": 0.143, "step": 30 }, { "epoch": 4.444444444444445, - "grad_norm": 0.02104916237294674, - "learning_rate": 0.00011111111111111112, - "loss": 1.3068, + "grad_norm": 0.037991978228092194, + "learning_rate": 0.00017037037037037037, + "loss": 1.2721, "step": 40 }, { "epoch": 4.444444444444445, - "eval_loss": 1.3598744869232178, - "eval_runtime": 35.0281, - "eval_samples_per_second": 1.028, + "eval_loss": 1.3360365629196167, + "eval_runtime": 34.8947, + "eval_samples_per_second": 1.032, "eval_steps_per_second": 0.143, "step": 40 }, { "epoch": 5.555555555555555, - "grad_norm": 0.022395364940166473, - "learning_rate": 8.888888888888889e-05, - "loss": 1.2049, + "grad_norm": 0.029117526486516, + "learning_rate": 0.00016296296296296295, + "loss": 1.1384, "step": 50 }, { "epoch": 5.555555555555555, - "eval_loss": 1.3168741464614868, - "eval_runtime": 34.692, - "eval_samples_per_second": 1.038, - "eval_steps_per_second": 0.144, + "eval_loss": 1.2785382270812988, + "eval_runtime": 34.8447, + "eval_samples_per_second": 1.033, + "eval_steps_per_second": 0.143, "step": 50 } ], "logging_steps": 10, - "max_steps": 90, + "max_steps": 270, "num_input_tokens_seen": 0, - "num_train_epochs": 10, + "num_train_epochs": 30, "save_steps": 10, "stateful_callbacks": { "EarlyStoppingCallback": { diff --git a/checkpoint-50/training_args.bin b/checkpoint-50/training_args.bin index 992e364d4b54f32a399ec3cd5f5f54c212ea0588..89187d2cbb60b941ac70a59387377bf262ba730c 100644 --- a/checkpoint-50/training_args.bin +++ b/checkpoint-50/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f0053334cab1bfd2838c507b240d05093bc205b1d5fbe6dc54a022fef5dcaa7 +oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e size 5112 diff --git a/checkpoint-60/adapter_config.json b/checkpoint-60/adapter_config.json index 539f4c41b2550fc30b9c2d0726f51adfa8e4b1e5..072f1947edd0ba30c6174f22b28e07cf76e90ee3 100644 --- a/checkpoint-60/adapter_config.json +++ b/checkpoint-60/adapter_config.json @@ -20,10 +20,10 @@ "rank_pattern": {}, "revision": null, "target_modules": [ - "k_proj", "q_proj", - "o_proj", - "v_proj" + "v_proj", + "k_proj", + "o_proj" ], "task_type": "CAUSAL_LM", "use_dora": false, diff --git a/checkpoint-60/adapter_model.safetensors b/checkpoint-60/adapter_model.safetensors index 1f8e1524c1f516003349c944aa0c31a98ac40f6a..50fb22f91ff4ed487f532fd0b1430a288cd49bce 100644 --- a/checkpoint-60/adapter_model.safetensors +++ b/checkpoint-60/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14ebbefdf71daa025996a412ce6c4f2fd2d5bbf084a4ee0f0ca1dc123cbb85e5 +oid sha256:93a1c19af57eef492b34267eb8c33f124ea9c2a26163633173b860712d691138 size 67143296 diff --git a/checkpoint-60/optimizer.pt b/checkpoint-60/optimizer.pt index d69bad85fc71c2ca5cb24f868de1721993458dde..416e78c1077de234030692cd2f40bab3b1f1a286 100644 --- a/checkpoint-60/optimizer.pt +++ b/checkpoint-60/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5863ac3d6f865bddc72753e9a6db83e90985a3348345c91097785f539b2d743e +oid sha256:595a1cb0a5382a3ac0eca7069089b278627d114cd482c7713528e5381bfd2c78 size 134433530 diff --git a/checkpoint-60/scheduler.pt b/checkpoint-60/scheduler.pt index cc6a804c402fc13ca8d00ce3c4186576d888c22b..c7ed223bce70dd59991bff0a12dcf65538eebbe6 100644 --- a/checkpoint-60/scheduler.pt +++ b/checkpoint-60/scheduler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6dfdd6ca5178c66b713159a2bfe5731fea568ef91adf9d3f8039a74c6ff0f6b +oid sha256:efb834ab29ac07d8830bc37519a269af8dbf2a694f279b059b857c6b254f0797 size 1064 diff --git a/checkpoint-60/trainer_state.json b/checkpoint-60/trainer_state.json index d67f979705af37a8eb8025b225e52fc1fa9c702d..74e2150ab896ffd89f4b51bb1b09a3e13861745a 100644 --- a/checkpoint-60/trainer_state.json +++ b/checkpoint-60/trainer_state.json @@ -1,5 +1,5 @@ { - "best_metric": 1.2939578294754028, + "best_metric": 1.2417998313903809, "best_model_checkpoint": "/kaggle/working/checkpoint-60", "epoch": 6.666666666666667, "eval_steps": 10, @@ -10,99 +10,99 @@ "log_history": [ { "epoch": 1.1111111111111112, - "grad_norm": 0.02217627689242363, - "learning_rate": 0.00017777777777777779, - "loss": 2.0442, + "grad_norm": 0.022457197308540344, + "learning_rate": 0.0001925925925925926, + "loss": 2.0406, "step": 10 }, { "epoch": 1.1111111111111112, - "eval_loss": 1.737181544303894, - "eval_runtime": 35.1318, - "eval_samples_per_second": 1.025, - "eval_steps_per_second": 0.142, + "eval_loss": 1.729261875152588, + "eval_runtime": 34.8953, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 10 }, { "epoch": 2.2222222222222223, - "grad_norm": 0.0346713550388813, - "learning_rate": 0.00015555555555555556, - "loss": 1.6131, + "grad_norm": 0.018787898123264313, + "learning_rate": 0.0001851851851851852, + "loss": 1.6016, "step": 20 }, { "epoch": 2.2222222222222223, - "eval_loss": 1.5489343404769897, - "eval_runtime": 34.8402, - "eval_samples_per_second": 1.033, - "eval_steps_per_second": 0.144, + "eval_loss": 1.5362553596496582, + "eval_runtime": 34.8752, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 20 }, { "epoch": 3.3333333333333335, - "grad_norm": 0.02501535415649414, - "learning_rate": 0.00013333333333333334, - "loss": 1.4152, + "grad_norm": 0.021070128306746483, + "learning_rate": 0.00017777777777777779, + "loss": 1.3937, "step": 30 }, { "epoch": 3.3333333333333335, - "eval_loss": 1.4295110702514648, - "eval_runtime": 34.8537, - "eval_samples_per_second": 1.033, + "eval_loss": 1.4144253730773926, + "eval_runtime": 34.9429, + "eval_samples_per_second": 1.03, "eval_steps_per_second": 0.143, "step": 30 }, { "epoch": 4.444444444444445, - "grad_norm": 0.02104916237294674, - "learning_rate": 0.00011111111111111112, - "loss": 1.3068, + "grad_norm": 0.037991978228092194, + "learning_rate": 0.00017037037037037037, + "loss": 1.2721, "step": 40 }, { "epoch": 4.444444444444445, - "eval_loss": 1.3598744869232178, - "eval_runtime": 35.0281, - "eval_samples_per_second": 1.028, + "eval_loss": 1.3360365629196167, + "eval_runtime": 34.8947, + "eval_samples_per_second": 1.032, "eval_steps_per_second": 0.143, "step": 40 }, { "epoch": 5.555555555555555, - "grad_norm": 0.022395364940166473, - "learning_rate": 8.888888888888889e-05, - "loss": 1.2049, + "grad_norm": 0.029117526486516, + "learning_rate": 0.00016296296296296295, + "loss": 1.1384, "step": 50 }, { "epoch": 5.555555555555555, - "eval_loss": 1.3168741464614868, - "eval_runtime": 34.692, - "eval_samples_per_second": 1.038, - "eval_steps_per_second": 0.144, + "eval_loss": 1.2785382270812988, + "eval_runtime": 34.8447, + "eval_samples_per_second": 1.033, + "eval_steps_per_second": 0.143, "step": 50 }, { "epoch": 6.666666666666667, - "grad_norm": 0.02603345364332199, - "learning_rate": 6.666666666666667e-05, - "loss": 1.1086, + "grad_norm": 0.0317281112074852, + "learning_rate": 0.00015555555555555556, + "loss": 1.0023, "step": 60 }, { "epoch": 6.666666666666667, - "eval_loss": 1.2939578294754028, - "eval_runtime": 34.6444, - "eval_samples_per_second": 1.039, + "eval_loss": 1.2417998313903809, + "eval_runtime": 34.8141, + "eval_samples_per_second": 1.034, "eval_steps_per_second": 0.144, "step": 60 } ], "logging_steps": 10, - "max_steps": 90, + "max_steps": 270, "num_input_tokens_seen": 0, - "num_train_epochs": 10, + "num_train_epochs": 30, "save_steps": 10, "stateful_callbacks": { "EarlyStoppingCallback": { diff --git a/checkpoint-60/training_args.bin b/checkpoint-60/training_args.bin index 992e364d4b54f32a399ec3cd5f5f54c212ea0588..89187d2cbb60b941ac70a59387377bf262ba730c 100644 --- a/checkpoint-60/training_args.bin +++ b/checkpoint-60/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f0053334cab1bfd2838c507b240d05093bc205b1d5fbe6dc54a022fef5dcaa7 +oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e size 5112 diff --git a/checkpoint-70/adapter_config.json b/checkpoint-70/adapter_config.json index 539f4c41b2550fc30b9c2d0726f51adfa8e4b1e5..072f1947edd0ba30c6174f22b28e07cf76e90ee3 100644 --- a/checkpoint-70/adapter_config.json +++ b/checkpoint-70/adapter_config.json @@ -20,10 +20,10 @@ "rank_pattern": {}, "revision": null, "target_modules": [ - "k_proj", "q_proj", - "o_proj", - "v_proj" + "v_proj", + "k_proj", + "o_proj" ], "task_type": "CAUSAL_LM", "use_dora": false, diff --git a/checkpoint-70/adapter_model.safetensors b/checkpoint-70/adapter_model.safetensors index d37ae25ce097d1ffcfe43a98645a9931dd503a17..1061c3c0988731331882f3cd80be06b128cf74e4 100644 --- a/checkpoint-70/adapter_model.safetensors +++ b/checkpoint-70/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0c44abb9173442b0bc413ba469da95e7be794812df0c2a2c16e54fc301511e3e +oid sha256:0f814556ebc75301eaddd2092d35871f744bed3ab761712f4883f0792eee5935 size 67143296 diff --git a/checkpoint-70/optimizer.pt b/checkpoint-70/optimizer.pt index bffa2b54a8c022331899d9d0ede4fa6b5c991b85..0868732120175e0ad5fa1b6e65cc36505abacaa1 100644 --- a/checkpoint-70/optimizer.pt +++ b/checkpoint-70/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2df92dd99064eb6aa6542b203055cf8cc892cff2f30a210807b7667c96cedc3a +oid sha256:87d49ff4992af82866af04db243b9d82b337a60bba420ea4add5785c6f38378f size 134433530 diff --git a/checkpoint-70/scheduler.pt b/checkpoint-70/scheduler.pt index c0754c9b1731ec8dc0ccf94773cb0b29b5676b39..9954f23372e6aab570d96e7f4ce50ffad7133ba1 100644 --- a/checkpoint-70/scheduler.pt +++ b/checkpoint-70/scheduler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b55d3cbe104822729f4f45e02a6c248fb8a4cb356c229f5c93e65066ff6a397 +oid sha256:5642c1fb4ba74d791ff94b2001be8bc432c3364fecb8b6c72f5aaf5b968db280 size 1064 diff --git a/checkpoint-70/trainer_state.json b/checkpoint-70/trainer_state.json index f237dd39f79c989b86e5ff97969592a63cb20742..f8e532d600d90ae657b40e0fbc11abb30b195345 100644 --- a/checkpoint-70/trainer_state.json +++ b/checkpoint-70/trainer_state.json @@ -1,5 +1,5 @@ { - "best_metric": 1.2787070274353027, + "best_metric": 1.2166908979415894, "best_model_checkpoint": "/kaggle/working/checkpoint-70", "epoch": 7.777777777777778, "eval_steps": 10, @@ -10,114 +10,114 @@ "log_history": [ { "epoch": 1.1111111111111112, - "grad_norm": 0.02217627689242363, - "learning_rate": 0.00017777777777777779, - "loss": 2.0442, + "grad_norm": 0.022457197308540344, + "learning_rate": 0.0001925925925925926, + "loss": 2.0406, "step": 10 }, { "epoch": 1.1111111111111112, - "eval_loss": 1.737181544303894, - "eval_runtime": 35.1318, - "eval_samples_per_second": 1.025, - "eval_steps_per_second": 0.142, + "eval_loss": 1.729261875152588, + "eval_runtime": 34.8953, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 10 }, { "epoch": 2.2222222222222223, - "grad_norm": 0.0346713550388813, - "learning_rate": 0.00015555555555555556, - "loss": 1.6131, + "grad_norm": 0.018787898123264313, + "learning_rate": 0.0001851851851851852, + "loss": 1.6016, "step": 20 }, { "epoch": 2.2222222222222223, - "eval_loss": 1.5489343404769897, - "eval_runtime": 34.8402, - "eval_samples_per_second": 1.033, - "eval_steps_per_second": 0.144, + "eval_loss": 1.5362553596496582, + "eval_runtime": 34.8752, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 20 }, { "epoch": 3.3333333333333335, - "grad_norm": 0.02501535415649414, - "learning_rate": 0.00013333333333333334, - "loss": 1.4152, + "grad_norm": 0.021070128306746483, + "learning_rate": 0.00017777777777777779, + "loss": 1.3937, "step": 30 }, { "epoch": 3.3333333333333335, - "eval_loss": 1.4295110702514648, - "eval_runtime": 34.8537, - "eval_samples_per_second": 1.033, + "eval_loss": 1.4144253730773926, + "eval_runtime": 34.9429, + "eval_samples_per_second": 1.03, "eval_steps_per_second": 0.143, "step": 30 }, { "epoch": 4.444444444444445, - "grad_norm": 0.02104916237294674, - "learning_rate": 0.00011111111111111112, - "loss": 1.3068, + "grad_norm": 0.037991978228092194, + "learning_rate": 0.00017037037037037037, + "loss": 1.2721, "step": 40 }, { "epoch": 4.444444444444445, - "eval_loss": 1.3598744869232178, - "eval_runtime": 35.0281, - "eval_samples_per_second": 1.028, + "eval_loss": 1.3360365629196167, + "eval_runtime": 34.8947, + "eval_samples_per_second": 1.032, "eval_steps_per_second": 0.143, "step": 40 }, { "epoch": 5.555555555555555, - "grad_norm": 0.022395364940166473, - "learning_rate": 8.888888888888889e-05, - "loss": 1.2049, + "grad_norm": 0.029117526486516, + "learning_rate": 0.00016296296296296295, + "loss": 1.1384, "step": 50 }, { "epoch": 5.555555555555555, - "eval_loss": 1.3168741464614868, - "eval_runtime": 34.692, - "eval_samples_per_second": 1.038, - "eval_steps_per_second": 0.144, + "eval_loss": 1.2785382270812988, + "eval_runtime": 34.8447, + "eval_samples_per_second": 1.033, + "eval_steps_per_second": 0.143, "step": 50 }, { "epoch": 6.666666666666667, - "grad_norm": 0.02603345364332199, - "learning_rate": 6.666666666666667e-05, - "loss": 1.1086, + "grad_norm": 0.0317281112074852, + "learning_rate": 0.00015555555555555556, + "loss": 1.0023, "step": 60 }, { "epoch": 6.666666666666667, - "eval_loss": 1.2939578294754028, - "eval_runtime": 34.6444, - "eval_samples_per_second": 1.039, + "eval_loss": 1.2417998313903809, + "eval_runtime": 34.8141, + "eval_samples_per_second": 1.034, "eval_steps_per_second": 0.144, "step": 60 }, { "epoch": 7.777777777777778, - "grad_norm": 0.02798735536634922, - "learning_rate": 4.4444444444444447e-05, - "loss": 1.0716, + "grad_norm": 0.034914035350084305, + "learning_rate": 0.00014814814814814815, + "loss": 0.9166, "step": 70 }, { "epoch": 7.777777777777778, - "eval_loss": 1.2787070274353027, - "eval_runtime": 34.7046, - "eval_samples_per_second": 1.037, - "eval_steps_per_second": 0.144, + "eval_loss": 1.2166908979415894, + "eval_runtime": 34.8956, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 70 } ], "logging_steps": 10, - "max_steps": 90, + "max_steps": 270, "num_input_tokens_seen": 0, - "num_train_epochs": 10, + "num_train_epochs": 30, "save_steps": 10, "stateful_callbacks": { "EarlyStoppingCallback": { diff --git a/checkpoint-70/training_args.bin b/checkpoint-70/training_args.bin index 992e364d4b54f32a399ec3cd5f5f54c212ea0588..89187d2cbb60b941ac70a59387377bf262ba730c 100644 --- a/checkpoint-70/training_args.bin +++ b/checkpoint-70/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f0053334cab1bfd2838c507b240d05093bc205b1d5fbe6dc54a022fef5dcaa7 +oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e size 5112 diff --git a/checkpoint-80/adapter_config.json b/checkpoint-80/adapter_config.json index 539f4c41b2550fc30b9c2d0726f51adfa8e4b1e5..072f1947edd0ba30c6174f22b28e07cf76e90ee3 100644 --- a/checkpoint-80/adapter_config.json +++ b/checkpoint-80/adapter_config.json @@ -20,10 +20,10 @@ "rank_pattern": {}, "revision": null, "target_modules": [ - "k_proj", "q_proj", - "o_proj", - "v_proj" + "v_proj", + "k_proj", + "o_proj" ], "task_type": "CAUSAL_LM", "use_dora": false, diff --git a/checkpoint-80/adapter_model.safetensors b/checkpoint-80/adapter_model.safetensors index da6da76e3af83770baf51b8d9c83a5a2b88ccd17..7ff1fc7bb22e1b7f9dd7397d0137d12f68bc0375 100644 --- a/checkpoint-80/adapter_model.safetensors +++ b/checkpoint-80/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee04952462bb07ccc849271f7f78d75c5f8afc6c7ff3a1fa361acabf44fc0ed6 +oid sha256:261564681985d0071d030855db7eab7adfe92461f0c742b5d7539df99d32c2c9 size 67143296 diff --git a/checkpoint-80/optimizer.pt b/checkpoint-80/optimizer.pt index 51b090c22dc8b998c0fedccdeba50c58dbd0edb2..6934e15a29eb0213cf7a8a938346f2640e9c55eb 100644 --- a/checkpoint-80/optimizer.pt +++ b/checkpoint-80/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:45c3bb44731e3776f6a34962f3d7fff840e05a3c8f5d4ae210cd851b70ac0818 +oid sha256:a77aab3f84590bcd97f7068ba4f39435ade66ee723022f8470007e88d9615282 size 134433530 diff --git a/checkpoint-80/scheduler.pt b/checkpoint-80/scheduler.pt index c1333b2d0a9597978117f1289678ae5312068439..056109a1eae85d1c37c892c90da0c6a3b33d0ef4 100644 --- a/checkpoint-80/scheduler.pt +++ b/checkpoint-80/scheduler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41273c1b8d35bfdc89d8195f07d765f03030886c79bd8a46673f085df81965d2 +oid sha256:be0711c3ad6f43ce666c00ca716d0f2b5c8f199ef8a053f1cd2f8aeb8290a62d size 1064 diff --git a/checkpoint-80/trainer_state.json b/checkpoint-80/trainer_state.json index 1e7673eb76c2be46da58c796dc4c4042fc22719c..04d0865c55cbe806c9198d26fc06a7bba062d57c 100644 --- a/checkpoint-80/trainer_state.json +++ b/checkpoint-80/trainer_state.json @@ -1,5 +1,5 @@ { - "best_metric": 1.2677136659622192, + "best_metric": 1.19890296459198, "best_model_checkpoint": "/kaggle/working/checkpoint-80", "epoch": 8.88888888888889, "eval_steps": 10, @@ -10,129 +10,129 @@ "log_history": [ { "epoch": 1.1111111111111112, - "grad_norm": 0.02217627689242363, - "learning_rate": 0.00017777777777777779, - "loss": 2.0442, + "grad_norm": 0.022457197308540344, + "learning_rate": 0.0001925925925925926, + "loss": 2.0406, "step": 10 }, { "epoch": 1.1111111111111112, - "eval_loss": 1.737181544303894, - "eval_runtime": 35.1318, - "eval_samples_per_second": 1.025, - "eval_steps_per_second": 0.142, + "eval_loss": 1.729261875152588, + "eval_runtime": 34.8953, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 10 }, { "epoch": 2.2222222222222223, - "grad_norm": 0.0346713550388813, - "learning_rate": 0.00015555555555555556, - "loss": 1.6131, + "grad_norm": 0.018787898123264313, + "learning_rate": 0.0001851851851851852, + "loss": 1.6016, "step": 20 }, { "epoch": 2.2222222222222223, - "eval_loss": 1.5489343404769897, - "eval_runtime": 34.8402, - "eval_samples_per_second": 1.033, - "eval_steps_per_second": 0.144, + "eval_loss": 1.5362553596496582, + "eval_runtime": 34.8752, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 20 }, { "epoch": 3.3333333333333335, - "grad_norm": 0.02501535415649414, - "learning_rate": 0.00013333333333333334, - "loss": 1.4152, + "grad_norm": 0.021070128306746483, + "learning_rate": 0.00017777777777777779, + "loss": 1.3937, "step": 30 }, { "epoch": 3.3333333333333335, - "eval_loss": 1.4295110702514648, - "eval_runtime": 34.8537, - "eval_samples_per_second": 1.033, + "eval_loss": 1.4144253730773926, + "eval_runtime": 34.9429, + "eval_samples_per_second": 1.03, "eval_steps_per_second": 0.143, "step": 30 }, { "epoch": 4.444444444444445, - "grad_norm": 0.02104916237294674, - "learning_rate": 0.00011111111111111112, - "loss": 1.3068, + "grad_norm": 0.037991978228092194, + "learning_rate": 0.00017037037037037037, + "loss": 1.2721, "step": 40 }, { "epoch": 4.444444444444445, - "eval_loss": 1.3598744869232178, - "eval_runtime": 35.0281, - "eval_samples_per_second": 1.028, + "eval_loss": 1.3360365629196167, + "eval_runtime": 34.8947, + "eval_samples_per_second": 1.032, "eval_steps_per_second": 0.143, "step": 40 }, { "epoch": 5.555555555555555, - "grad_norm": 0.022395364940166473, - "learning_rate": 8.888888888888889e-05, - "loss": 1.2049, + "grad_norm": 0.029117526486516, + "learning_rate": 0.00016296296296296295, + "loss": 1.1384, "step": 50 }, { "epoch": 5.555555555555555, - "eval_loss": 1.3168741464614868, - "eval_runtime": 34.692, - "eval_samples_per_second": 1.038, - "eval_steps_per_second": 0.144, + "eval_loss": 1.2785382270812988, + "eval_runtime": 34.8447, + "eval_samples_per_second": 1.033, + "eval_steps_per_second": 0.143, "step": 50 }, { "epoch": 6.666666666666667, - "grad_norm": 0.02603345364332199, - "learning_rate": 6.666666666666667e-05, - "loss": 1.1086, + "grad_norm": 0.0317281112074852, + "learning_rate": 0.00015555555555555556, + "loss": 1.0023, "step": 60 }, { "epoch": 6.666666666666667, - "eval_loss": 1.2939578294754028, - "eval_runtime": 34.6444, - "eval_samples_per_second": 1.039, + "eval_loss": 1.2417998313903809, + "eval_runtime": 34.8141, + "eval_samples_per_second": 1.034, "eval_steps_per_second": 0.144, "step": 60 }, { "epoch": 7.777777777777778, - "grad_norm": 0.02798735536634922, - "learning_rate": 4.4444444444444447e-05, - "loss": 1.0716, + "grad_norm": 0.034914035350084305, + "learning_rate": 0.00014814814814814815, + "loss": 0.9166, "step": 70 }, { "epoch": 7.777777777777778, - "eval_loss": 1.2787070274353027, - "eval_runtime": 34.7046, - "eval_samples_per_second": 1.037, - "eval_steps_per_second": 0.144, + "eval_loss": 1.2166908979415894, + "eval_runtime": 34.8956, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 70 }, { "epoch": 8.88888888888889, - "grad_norm": 0.028509726747870445, - "learning_rate": 2.2222222222222223e-05, - "loss": 1.0051, + "grad_norm": 0.04872061312198639, + "learning_rate": 0.00014074074074074076, + "loss": 0.7726, "step": 80 }, { "epoch": 8.88888888888889, - "eval_loss": 1.2677136659622192, - "eval_runtime": 34.6744, - "eval_samples_per_second": 1.038, - "eval_steps_per_second": 0.144, + "eval_loss": 1.19890296459198, + "eval_runtime": 34.8433, + "eval_samples_per_second": 1.033, + "eval_steps_per_second": 0.143, "step": 80 } ], "logging_steps": 10, - "max_steps": 90, + "max_steps": 270, "num_input_tokens_seen": 0, - "num_train_epochs": 10, + "num_train_epochs": 30, "save_steps": 10, "stateful_callbacks": { "EarlyStoppingCallback": { diff --git a/checkpoint-80/training_args.bin b/checkpoint-80/training_args.bin index 992e364d4b54f32a399ec3cd5f5f54c212ea0588..89187d2cbb60b941ac70a59387377bf262ba730c 100644 --- a/checkpoint-80/training_args.bin +++ b/checkpoint-80/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f0053334cab1bfd2838c507b240d05093bc205b1d5fbe6dc54a022fef5dcaa7 +oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e size 5112 diff --git a/checkpoint-90/adapter_config.json b/checkpoint-90/adapter_config.json index 539f4c41b2550fc30b9c2d0726f51adfa8e4b1e5..072f1947edd0ba30c6174f22b28e07cf76e90ee3 100644 --- a/checkpoint-90/adapter_config.json +++ b/checkpoint-90/adapter_config.json @@ -20,10 +20,10 @@ "rank_pattern": {}, "revision": null, "target_modules": [ - "k_proj", "q_proj", - "o_proj", - "v_proj" + "v_proj", + "k_proj", + "o_proj" ], "task_type": "CAUSAL_LM", "use_dora": false, diff --git a/checkpoint-90/adapter_model.safetensors b/checkpoint-90/adapter_model.safetensors index aad60b0963bb0817ddff00ad978492cc2241a8a2..0f832e284c815c65e6a6d1276b6ea980e4569404 100644 --- a/checkpoint-90/adapter_model.safetensors +++ b/checkpoint-90/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6bdfd134cf3b5e167c3aa127bf57024a3e8ff71b6b0ea16d5493a51a01d7e317 +oid sha256:f90c1d3ed853f5e7e29d9c0d39bdab0cc26bd4d4ea5fbb602291f4c783b23d04 size 67143296 diff --git a/checkpoint-90/optimizer.pt b/checkpoint-90/optimizer.pt index 4df135bf4da4f6ccc5d4d149b95f13a983128c6c..49f68cb3e33508f0a994b245932120822e57fdfc 100644 --- a/checkpoint-90/optimizer.pt +++ b/checkpoint-90/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0b8a18cc875405991f47cfb39631df339f1779ad23225cf25c95a999ce4fce8 +oid sha256:8255df8e2451dcd95a530557ee89bbd452143665ba3fb88035ddc3a9541b9404 size 134433530 diff --git a/checkpoint-90/scheduler.pt b/checkpoint-90/scheduler.pt index d64adf52b600194652799bb966d26d4c3c3f82a0..9dd29662c639a1b35eb2d4e47613293d7c6ac58c 100644 --- a/checkpoint-90/scheduler.pt +++ b/checkpoint-90/scheduler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf6a9a1e39c0655c6941309388d2a028b5b8dbbca031ca2500bdfcdc9f2c01aa +oid sha256:e465517a46a82c4867e6ac365b742df1c7ee4f7bc85ffb822c463fed30cd77ad size 1064 diff --git a/checkpoint-90/trainer_state.json b/checkpoint-90/trainer_state.json index 73dbd767f4448771ba9b48d6966e9f9712de9b40..90862dccc44ec1174f77c99ec0135ff023a0d3e8 100644 --- a/checkpoint-90/trainer_state.json +++ b/checkpoint-90/trainer_state.json @@ -1,5 +1,5 @@ { - "best_metric": 1.2664015293121338, + "best_metric": 1.173593521118164, "best_model_checkpoint": "/kaggle/working/checkpoint-90", "epoch": 10.0, "eval_steps": 10, @@ -10,144 +10,144 @@ "log_history": [ { "epoch": 1.1111111111111112, - "grad_norm": 0.02217627689242363, - "learning_rate": 0.00017777777777777779, - "loss": 2.0442, + "grad_norm": 0.022457197308540344, + "learning_rate": 0.0001925925925925926, + "loss": 2.0406, "step": 10 }, { "epoch": 1.1111111111111112, - "eval_loss": 1.737181544303894, - "eval_runtime": 35.1318, - "eval_samples_per_second": 1.025, - "eval_steps_per_second": 0.142, + "eval_loss": 1.729261875152588, + "eval_runtime": 34.8953, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 10 }, { "epoch": 2.2222222222222223, - "grad_norm": 0.0346713550388813, - "learning_rate": 0.00015555555555555556, - "loss": 1.6131, + "grad_norm": 0.018787898123264313, + "learning_rate": 0.0001851851851851852, + "loss": 1.6016, "step": 20 }, { "epoch": 2.2222222222222223, - "eval_loss": 1.5489343404769897, - "eval_runtime": 34.8402, - "eval_samples_per_second": 1.033, - "eval_steps_per_second": 0.144, + "eval_loss": 1.5362553596496582, + "eval_runtime": 34.8752, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 20 }, { "epoch": 3.3333333333333335, - "grad_norm": 0.02501535415649414, - "learning_rate": 0.00013333333333333334, - "loss": 1.4152, + "grad_norm": 0.021070128306746483, + "learning_rate": 0.00017777777777777779, + "loss": 1.3937, "step": 30 }, { "epoch": 3.3333333333333335, - "eval_loss": 1.4295110702514648, - "eval_runtime": 34.8537, - "eval_samples_per_second": 1.033, + "eval_loss": 1.4144253730773926, + "eval_runtime": 34.9429, + "eval_samples_per_second": 1.03, "eval_steps_per_second": 0.143, "step": 30 }, { "epoch": 4.444444444444445, - "grad_norm": 0.02104916237294674, - "learning_rate": 0.00011111111111111112, - "loss": 1.3068, + "grad_norm": 0.037991978228092194, + "learning_rate": 0.00017037037037037037, + "loss": 1.2721, "step": 40 }, { "epoch": 4.444444444444445, - "eval_loss": 1.3598744869232178, - "eval_runtime": 35.0281, - "eval_samples_per_second": 1.028, + "eval_loss": 1.3360365629196167, + "eval_runtime": 34.8947, + "eval_samples_per_second": 1.032, "eval_steps_per_second": 0.143, "step": 40 }, { "epoch": 5.555555555555555, - "grad_norm": 0.022395364940166473, - "learning_rate": 8.888888888888889e-05, - "loss": 1.2049, + "grad_norm": 0.029117526486516, + "learning_rate": 0.00016296296296296295, + "loss": 1.1384, "step": 50 }, { "epoch": 5.555555555555555, - "eval_loss": 1.3168741464614868, - "eval_runtime": 34.692, - "eval_samples_per_second": 1.038, - "eval_steps_per_second": 0.144, + "eval_loss": 1.2785382270812988, + "eval_runtime": 34.8447, + "eval_samples_per_second": 1.033, + "eval_steps_per_second": 0.143, "step": 50 }, { "epoch": 6.666666666666667, - "grad_norm": 0.02603345364332199, - "learning_rate": 6.666666666666667e-05, - "loss": 1.1086, + "grad_norm": 0.0317281112074852, + "learning_rate": 0.00015555555555555556, + "loss": 1.0023, "step": 60 }, { "epoch": 6.666666666666667, - "eval_loss": 1.2939578294754028, - "eval_runtime": 34.6444, - "eval_samples_per_second": 1.039, + "eval_loss": 1.2417998313903809, + "eval_runtime": 34.8141, + "eval_samples_per_second": 1.034, "eval_steps_per_second": 0.144, "step": 60 }, { "epoch": 7.777777777777778, - "grad_norm": 0.02798735536634922, - "learning_rate": 4.4444444444444447e-05, - "loss": 1.0716, + "grad_norm": 0.034914035350084305, + "learning_rate": 0.00014814814814814815, + "loss": 0.9166, "step": 70 }, { "epoch": 7.777777777777778, - "eval_loss": 1.2787070274353027, - "eval_runtime": 34.7046, - "eval_samples_per_second": 1.037, - "eval_steps_per_second": 0.144, + "eval_loss": 1.2166908979415894, + "eval_runtime": 34.8956, + "eval_samples_per_second": 1.032, + "eval_steps_per_second": 0.143, "step": 70 }, { "epoch": 8.88888888888889, - "grad_norm": 0.028509726747870445, - "learning_rate": 2.2222222222222223e-05, - "loss": 1.0051, + "grad_norm": 0.04872061312198639, + "learning_rate": 0.00014074074074074076, + "loss": 0.7726, "step": 80 }, { "epoch": 8.88888888888889, - "eval_loss": 1.2677136659622192, - "eval_runtime": 34.6744, - "eval_samples_per_second": 1.038, - "eval_steps_per_second": 0.144, + "eval_loss": 1.19890296459198, + "eval_runtime": 34.8433, + "eval_samples_per_second": 1.033, + "eval_steps_per_second": 0.143, "step": 80 }, { "epoch": 10.0, - "grad_norm": 0.03036084771156311, - "learning_rate": 0.0, - "loss": 0.9939, + "grad_norm": 0.04901803284883499, + "learning_rate": 0.00013333333333333334, + "loss": 0.676, "step": 90 }, { "epoch": 10.0, - "eval_loss": 1.2664015293121338, - "eval_runtime": 34.6396, - "eval_samples_per_second": 1.039, + "eval_loss": 1.173593521118164, + "eval_runtime": 34.7999, + "eval_samples_per_second": 1.034, "eval_steps_per_second": 0.144, "step": 90 } ], "logging_steps": 10, - "max_steps": 90, + "max_steps": 270, "num_input_tokens_seen": 0, - "num_train_epochs": 10, + "num_train_epochs": 30, "save_steps": 10, "stateful_callbacks": { "EarlyStoppingCallback": { @@ -165,7 +165,7 @@ "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": true + "should_training_stop": false }, "attributes": {} } diff --git a/checkpoint-90/training_args.bin b/checkpoint-90/training_args.bin index 992e364d4b54f32a399ec3cd5f5f54c212ea0588..89187d2cbb60b941ac70a59387377bf262ba730c 100644 --- a/checkpoint-90/training_args.bin +++ b/checkpoint-90/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f0053334cab1bfd2838c507b240d05093bc205b1d5fbe6dc54a022fef5dcaa7 +oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e size 5112 diff --git a/runs/Aug05_00-02-09_9d6a1d4f6b01/events.out.tfevents.1722816131.9d6a1d4f6b01.34.0 b/runs/Aug05_00-02-09_9d6a1d4f6b01/events.out.tfevents.1722816131.9d6a1d4f6b01.34.0 new file mode 100644 index 0000000000000000000000000000000000000000..cb2348065e0eb9cda6d5bdf1ee33402333d10a0d --- /dev/null +++ b/runs/Aug05_00-02-09_9d6a1d4f6b01/events.out.tfevents.1722816131.9d6a1d4f6b01.34.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8626904e2d7fcb67e33206bbb8b6879b21da3780147ef1bb24a6ae197c72a215 +size 11415