Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

config.json +46 -0
model.safetensors +3 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
trainer_state.json +1365 -0
training_args.bin +3 -0

config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "_name_or_path": "answerdotai/ModernBERT-base",
+  "architectures": [
+    "ModernBertForMaskedLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "reference_compile": true,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0",
+  "vocab_size": 50368
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c9b3d304bf2c533b36cdde8c15d832c24dfeeddf7ed2ef833bc27d9c6dcb2e14
+size 598635032

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b8d3357a9c520106e98f0827f5164e8281faa21197307d982fd1e29fa5b6defb
+size 1197357242

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8b45c0592bc4cca85b65697c5fc3df6315654d61aa7cac3bf6db35c10f3d202c
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91d1eb85152dcf55ea6434f75a55f2dd53ed4d028afbac199cfec81681054f6f
+size 1064

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1365 @@

+{
+  "best_metric": 1.2660380601882935,
+  "best_model_checkpoint": "./bert_mlm_finetuned/checkpoint-900",
+  "epoch": 0.1152,
+  "eval_steps": 100,
+  "global_step": 900,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00064,
+      "grad_norm": 7.650606632232666,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": 6.29,
+      "step": 5
+    },
+    {
+      "epoch": 0.00128,
+      "grad_norm": 4.541823387145996,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 6.3815,
+      "step": 10
+    },
+    {
+      "epoch": 0.00192,
+      "grad_norm": 4.245054721832275,
+      "learning_rate": 3e-06,
+      "loss": 6.2854,
+      "step": 15
+    },
+    {
+      "epoch": 0.00256,
+      "grad_norm": 4.5587897300720215,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 6.0674,
+      "step": 20
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 3.7703804969787598,
+      "learning_rate": 4.800000000000001e-06,
+      "loss": 6.2961,
+      "step": 25
+    },
+    {
+      "epoch": 0.00384,
+      "grad_norm": 3.8425862789154053,
+      "learning_rate": 5.8e-06,
+      "loss": 6.3326,
+      "step": 30
+    },
+    {
+      "epoch": 0.00448,
+      "grad_norm": 4.413463115692139,
+      "learning_rate": 6.800000000000001e-06,
+      "loss": 6.183,
+      "step": 35
+    },
+    {
+      "epoch": 0.00512,
+      "grad_norm": 4.1980509757995605,
+      "learning_rate": 7.800000000000002e-06,
+      "loss": 6.2654,
+      "step": 40
+    },
+    {
+      "epoch": 0.00576,
+      "grad_norm": 3.9166719913482666,
+      "learning_rate": 8.8e-06,
+      "loss": 6.0916,
+      "step": 45
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 3.4706904888153076,
+      "learning_rate": 9.800000000000001e-06,
+      "loss": 6.103,
+      "step": 50
+    },
+    {
+      "epoch": 0.00704,
+      "grad_norm": 5.138203144073486,
+      "learning_rate": 9.999998372356185e-06,
+      "loss": 6.2379,
+      "step": 55
+    },
+    {
+      "epoch": 0.00768,
+      "grad_norm": 3.7806520462036133,
+      "learning_rate": 9.999991760055e-06,
+      "loss": 6.1776,
+      "step": 60
+    },
+    {
+      "epoch": 0.00832,
+      "grad_norm": 3.5731871128082275,
+      "learning_rate": 9.999980061375427e-06,
+      "loss": 6.2082,
+      "step": 65
+    },
+    {
+      "epoch": 0.00896,
+      "grad_norm": 3.661797285079956,
+      "learning_rate": 9.999963276329369e-06,
+      "loss": 6.0704,
+      "step": 70
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 3.6181113719940186,
+      "learning_rate": 9.999941404933902e-06,
+      "loss": 6.2081,
+      "step": 75
+    },
+    {
+      "epoch": 0.01024,
+      "grad_norm": 3.3162803649902344,
+      "learning_rate": 9.99991444721127e-06,
+      "loss": 5.8807,
+      "step": 80
+    },
+    {
+      "epoch": 0.01088,
+      "grad_norm": 3.6022472381591797,
+      "learning_rate": 9.999882403188902e-06,
+      "loss": 6.1092,
+      "step": 85
+    },
+    {
+      "epoch": 0.01152,
+      "grad_norm": 7.291418552398682,
+      "learning_rate": 9.999845272899393e-06,
+      "loss": 5.7668,
+      "step": 90
+    },
+    {
+      "epoch": 0.01216,
+      "grad_norm": 3.522437810897827,
+      "learning_rate": 9.999803056380517e-06,
+      "loss": 6.1621,
+      "step": 95
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 3.9014439582824707,
+      "learning_rate": 9.999755753675216e-06,
+      "loss": 6.0573,
+      "step": 100
+    },
+    {
+      "epoch": 0.0128,
+      "eval_loss": 1.5072969198226929,
+      "eval_runtime": 11.1161,
+      "eval_samples_per_second": 89.96,
+      "eval_steps_per_second": 11.245,
+      "step": 100
+    },
+    {
+      "epoch": 0.01344,
+      "grad_norm": 3.7579081058502197,
+      "learning_rate": 9.999703364831614e-06,
+      "loss": 6.1671,
+      "step": 105
+    },
+    {
+      "epoch": 0.01408,
+      "grad_norm": 3.7058262825012207,
+      "learning_rate": 9.999645889903002e-06,
+      "loss": 6.1348,
+      "step": 110
+    },
+    {
+      "epoch": 0.01472,
+      "grad_norm": 5.018667697906494,
+      "learning_rate": 9.99958332894785e-06,
+      "loss": 5.9376,
+      "step": 115
+    },
+    {
+      "epoch": 0.01536,
+      "grad_norm": 3.5420188903808594,
+      "learning_rate": 9.999515682029798e-06,
+      "loss": 5.9961,
+      "step": 120
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 3.5725393295288086,
+      "learning_rate": 9.999442949217663e-06,
+      "loss": 5.8439,
+      "step": 125
+    },
+    {
+      "epoch": 0.01664,
+      "grad_norm": 3.8440959453582764,
+      "learning_rate": 9.999365130585435e-06,
+      "loss": 5.7857,
+      "step": 130
+    },
+    {
+      "epoch": 0.01728,
+      "grad_norm": 3.4371285438537598,
+      "learning_rate": 9.999282226212276e-06,
+      "loss": 5.799,
+      "step": 135
+    },
+    {
+      "epoch": 0.01792,
+      "grad_norm": 3.996847152709961,
+      "learning_rate": 9.999194236182523e-06,
+      "loss": 6.0022,
+      "step": 140
+    },
+    {
+      "epoch": 0.01856,
+      "grad_norm": 3.720330238342285,
+      "learning_rate": 9.999101160585687e-06,
+      "loss": 5.925,
+      "step": 145
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 3.8822953701019287,
+      "learning_rate": 9.99900299951645e-06,
+      "loss": 5.8085,
+      "step": 150
+    },
+    {
+      "epoch": 0.01984,
+      "grad_norm": 3.599283456802368,
+      "learning_rate": 9.99889975307467e-06,
+      "loss": 5.6533,
+      "step": 155
+    },
+    {
+      "epoch": 0.02048,
+      "grad_norm": 3.4847381114959717,
+      "learning_rate": 9.998791421365376e-06,
+      "loss": 5.9021,
+      "step": 160
+    },
+    {
+      "epoch": 0.02112,
+      "grad_norm": 3.4302055835723877,
+      "learning_rate": 9.998678004498774e-06,
+      "loss": 5.962,
+      "step": 165
+    },
+    {
+      "epoch": 0.02176,
+      "grad_norm": 4.561929702758789,
+      "learning_rate": 9.99855950259024e-06,
+      "loss": 5.9011,
+      "step": 170
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 4.069271087646484,
+      "learning_rate": 9.998435915760323e-06,
+      "loss": 5.6782,
+      "step": 175
+    },
+    {
+      "epoch": 0.02304,
+      "grad_norm": 3.5959055423736572,
+      "learning_rate": 9.998307244134741e-06,
+      "loss": 5.8107,
+      "step": 180
+    },
+    {
+      "epoch": 0.02368,
+      "grad_norm": 3.5477242469787598,
+      "learning_rate": 9.998173487844396e-06,
+      "loss": 5.8335,
+      "step": 185
+    },
+    {
+      "epoch": 0.02432,
+      "grad_norm": 4.488218307495117,
+      "learning_rate": 9.998034647025349e-06,
+      "loss": 5.8285,
+      "step": 190
+    },
+    {
+      "epoch": 0.02496,
+      "grad_norm": 3.555074691772461,
+      "learning_rate": 9.997890721818844e-06,
+      "loss": 5.817,
+      "step": 195
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 3.6248419284820557,
+      "learning_rate": 9.99774171237129e-06,
+      "loss": 5.8368,
+      "step": 200
+    },
+    {
+      "epoch": 0.0256,
+      "eval_loss": 1.440572738647461,
+      "eval_runtime": 6.6468,
+      "eval_samples_per_second": 150.448,
+      "eval_steps_per_second": 18.806,
+      "step": 200
+    },
+    {
+      "epoch": 0.02624,
+      "grad_norm": 3.432421922683716,
+      "learning_rate": 9.997587618834272e-06,
+      "loss": 5.7842,
+      "step": 205
+    },
+    {
+      "epoch": 0.02688,
+      "grad_norm": 3.333038806915283,
+      "learning_rate": 9.997428441364546e-06,
+      "loss": 5.7173,
+      "step": 210
+    },
+    {
+      "epoch": 0.02752,
+      "grad_norm": 3.7716541290283203,
+      "learning_rate": 9.997264180124038e-06,
+      "loss": 5.719,
+      "step": 215
+    },
+    {
+      "epoch": 0.02816,
+      "grad_norm": 3.345600128173828,
+      "learning_rate": 9.99709483527985e-06,
+      "loss": 5.8428,
+      "step": 220
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 3.7677502632141113,
+      "learning_rate": 9.99692040700425e-06,
+      "loss": 5.7393,
+      "step": 225
+    },
+    {
+      "epoch": 0.02944,
+      "grad_norm": 11.996383666992188,
+      "learning_rate": 9.996740895474682e-06,
+      "loss": 5.5566,
+      "step": 230
+    },
+    {
+      "epoch": 0.03008,
+      "grad_norm": 3.6089084148406982,
+      "learning_rate": 9.996556300873758e-06,
+      "loss": 5.6939,
+      "step": 235
+    },
+    {
+      "epoch": 0.03072,
+      "grad_norm": 3.834825038909912,
+      "learning_rate": 9.996366623389263e-06,
+      "loss": 5.8123,
+      "step": 240
+    },
+    {
+      "epoch": 0.03136,
+      "grad_norm": 3.570263147354126,
+      "learning_rate": 9.99617186321415e-06,
+      "loss": 5.6839,
+      "step": 245
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 3.5728812217712402,
+      "learning_rate": 9.995972020546545e-06,
+      "loss": 5.7764,
+      "step": 250
+    },
+    {
+      "epoch": 0.03264,
+      "grad_norm": 3.4725637435913086,
+      "learning_rate": 9.995767095589743e-06,
+      "loss": 5.6879,
+      "step": 255
+    },
+    {
+      "epoch": 0.03328,
+      "grad_norm": 3.811537742614746,
+      "learning_rate": 9.99555708855221e-06,
+      "loss": 5.6418,
+      "step": 260
+    },
+    {
+      "epoch": 0.03392,
+      "grad_norm": 3.494992971420288,
+      "learning_rate": 9.99534199964758e-06,
+      "loss": 5.6927,
+      "step": 265
+    },
+    {
+      "epoch": 0.03456,
+      "grad_norm": 3.8107383251190186,
+      "learning_rate": 9.995121829094662e-06,
+      "loss": 5.5658,
+      "step": 270
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 3.570551633834839,
+      "learning_rate": 9.994896577117425e-06,
+      "loss": 5.8131,
+      "step": 275
+    },
+    {
+      "epoch": 0.03584,
+      "grad_norm": 3.540811538696289,
+      "learning_rate": 9.994666243945018e-06,
+      "loss": 5.6009,
+      "step": 280
+    },
+    {
+      "epoch": 0.03648,
+      "grad_norm": 3.7275819778442383,
+      "learning_rate": 9.99443082981175e-06,
+      "loss": 5.6407,
+      "step": 285
+    },
+    {
+      "epoch": 0.03712,
+      "grad_norm": 4.194495677947998,
+      "learning_rate": 9.994190334957103e-06,
+      "loss": 5.8319,
+      "step": 290
+    },
+    {
+      "epoch": 0.03776,
+      "grad_norm": 3.5107626914978027,
+      "learning_rate": 9.993944759625728e-06,
+      "loss": 5.5765,
+      "step": 295
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 3.4100208282470703,
+      "learning_rate": 9.993694104067444e-06,
+      "loss": 5.7473,
+      "step": 300
+    },
+    {
+      "epoch": 0.0384,
+      "eval_loss": 1.407908320426941,
+      "eval_runtime": 6.6542,
+      "eval_samples_per_second": 150.281,
+      "eval_steps_per_second": 18.785,
+      "step": 300
+    },
+    {
+      "epoch": 0.03904,
+      "grad_norm": 3.7727818489074707,
+      "learning_rate": 9.993438368537236e-06,
+      "loss": 5.6802,
+      "step": 305
+    },
+    {
+      "epoch": 0.03968,
+      "grad_norm": 3.445909023284912,
+      "learning_rate": 9.993177553295258e-06,
+      "loss": 5.7484,
+      "step": 310
+    },
+    {
+      "epoch": 0.04032,
+      "grad_norm": 3.4199888706207275,
+      "learning_rate": 9.992911658606832e-06,
+      "loss": 5.7648,
+      "step": 315
+    },
+    {
+      "epoch": 0.04096,
+      "grad_norm": 4.9640655517578125,
+      "learning_rate": 9.992640684742445e-06,
+      "loss": 5.7922,
+      "step": 320
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 3.3730976581573486,
+      "learning_rate": 9.992364631977754e-06,
+      "loss": 5.677,
+      "step": 325
+    },
+    {
+      "epoch": 0.04224,
+      "grad_norm": 3.540597915649414,
+      "learning_rate": 9.99208350059358e-06,
+      "loss": 5.5495,
+      "step": 330
+    },
+    {
+      "epoch": 0.04288,
+      "grad_norm": 3.6853768825531006,
+      "learning_rate": 9.991797290875915e-06,
+      "loss": 5.4089,
+      "step": 335
+    },
+    {
+      "epoch": 0.04352,
+      "grad_norm": 3.6380045413970947,
+      "learning_rate": 9.991506003115911e-06,
+      "loss": 5.4849,
+      "step": 340
+    },
+    {
+      "epoch": 0.04416,
+      "grad_norm": 3.265488862991333,
+      "learning_rate": 9.991209637609887e-06,
+      "loss": 5.523,
+      "step": 345
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 3.2634189128875732,
+      "learning_rate": 9.990908194659332e-06,
+      "loss": 5.5664,
+      "step": 350
+    },
+    {
+      "epoch": 0.04544,
+      "grad_norm": 3.569810152053833,
+      "learning_rate": 9.990601674570895e-06,
+      "loss": 5.5059,
+      "step": 355
+    },
+    {
+      "epoch": 0.04608,
+      "grad_norm": 3.580211877822876,
+      "learning_rate": 9.990290077656393e-06,
+      "loss": 5.4079,
+      "step": 360
+    },
+    {
+      "epoch": 0.04672,
+      "grad_norm": 3.4860317707061768,
+      "learning_rate": 9.989973404232805e-06,
+      "loss": 5.6858,
+      "step": 365
+    },
+    {
+      "epoch": 0.04736,
+      "grad_norm": 4.026730060577393,
+      "learning_rate": 9.989651654622277e-06,
+      "loss": 5.5662,
+      "step": 370
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 3.364692449569702,
+      "learning_rate": 9.989324829152119e-06,
+      "loss": 5.5304,
+      "step": 375
+    },
+    {
+      "epoch": 0.04864,
+      "grad_norm": 3.611964464187622,
+      "learning_rate": 9.9889929281548e-06,
+      "loss": 5.3911,
+      "step": 380
+    },
+    {
+      "epoch": 0.04928,
+      "grad_norm": 3.2946035861968994,
+      "learning_rate": 9.988655951967958e-06,
+      "loss": 5.4102,
+      "step": 385
+    },
+    {
+      "epoch": 0.04992,
+      "grad_norm": 3.963909864425659,
+      "learning_rate": 9.98831390093439e-06,
+      "loss": 5.549,
+      "step": 390
+    },
+    {
+      "epoch": 0.05056,
+      "grad_norm": 3.2876341342926025,
+      "learning_rate": 9.987966775402056e-06,
+      "loss": 5.5388,
+      "step": 395
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 3.8467471599578857,
+      "learning_rate": 9.98761457572408e-06,
+      "loss": 5.454,
+      "step": 400
+    },
+    {
+      "epoch": 0.0512,
+      "eval_loss": 1.3826359510421753,
+      "eval_runtime": 7.0199,
+      "eval_samples_per_second": 142.452,
+      "eval_steps_per_second": 17.807,
+      "step": 400
+    },
+    {
+      "epoch": 0.05184,
+      "grad_norm": 3.675231695175171,
+      "learning_rate": 9.987257302258748e-06,
+      "loss": 5.674,
+      "step": 405
+    },
+    {
+      "epoch": 0.05248,
+      "grad_norm": 3.787940263748169,
+      "learning_rate": 9.986894955369504e-06,
+      "loss": 5.5466,
+      "step": 410
+    },
+    {
+      "epoch": 0.05312,
+      "grad_norm": 3.677966833114624,
+      "learning_rate": 9.986527535424956e-06,
+      "loss": 5.4762,
+      "step": 415
+    },
+    {
+      "epoch": 0.05376,
+      "grad_norm": 3.5083606243133545,
+      "learning_rate": 9.986155042798874e-06,
+      "loss": 5.3145,
+      "step": 420
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 3.536379098892212,
+      "learning_rate": 9.98577747787018e-06,
+      "loss": 5.3769,
+      "step": 425
+    },
+    {
+      "epoch": 0.05504,
+      "grad_norm": 3.5448412895202637,
+      "learning_rate": 9.98539484102297e-06,
+      "loss": 5.3996,
+      "step": 430
+    },
+    {
+      "epoch": 0.05568,
+      "grad_norm": 3.359647274017334,
+      "learning_rate": 9.985007132646489e-06,
+      "loss": 5.3114,
+      "step": 435
+    },
+    {
+      "epoch": 0.05632,
+      "grad_norm": 3.3419110774993896,
+      "learning_rate": 9.984614353135143e-06,
+      "loss": 5.4383,
+      "step": 440
+    },
+    {
+      "epoch": 0.05696,
+      "grad_norm": 3.558025360107422,
+      "learning_rate": 9.984216502888496e-06,
+      "loss": 5.5239,
+      "step": 445
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 3.6349422931671143,
+      "learning_rate": 9.983813582311277e-06,
+      "loss": 5.5639,
+      "step": 450
+    },
+    {
+      "epoch": 0.05824,
+      "grad_norm": 3.2916922569274902,
+      "learning_rate": 9.983405591813362e-06,
+      "loss": 5.3886,
+      "step": 455
+    },
+    {
+      "epoch": 0.05888,
+      "grad_norm": 3.32891845703125,
+      "learning_rate": 9.982992531809796e-06,
+      "loss": 5.526,
+      "step": 460
+    },
+    {
+      "epoch": 0.05952,
+      "grad_norm": 3.8752880096435547,
+      "learning_rate": 9.982574402720773e-06,
+      "loss": 5.6599,
+      "step": 465
+    },
+    {
+      "epoch": 0.06016,
+      "grad_norm": 3.604433536529541,
+      "learning_rate": 9.982151204971646e-06,
+      "loss": 5.4567,
+      "step": 470
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 3.3058159351348877,
+      "learning_rate": 9.981722938992926e-06,
+      "loss": 5.4981,
+      "step": 475
+    },
+    {
+      "epoch": 0.06144,
+      "grad_norm": 3.7341926097869873,
+      "learning_rate": 9.981289605220276e-06,
+      "loss": 5.3278,
+      "step": 480
+    },
+    {
+      "epoch": 0.06208,
+      "grad_norm": 3.51798415184021,
+      "learning_rate": 9.980851204094519e-06,
+      "loss": 5.5029,
+      "step": 485
+    },
+    {
+      "epoch": 0.06272,
+      "grad_norm": 3.6541428565979004,
+      "learning_rate": 9.980407736061629e-06,
+      "loss": 5.3987,
+      "step": 490
+    },
+    {
+      "epoch": 0.06336,
+      "grad_norm": 3.420767307281494,
+      "learning_rate": 9.979959201572736e-06,
+      "loss": 5.405,
+      "step": 495
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 3.7169559001922607,
+      "learning_rate": 9.979505601084124e-06,
+      "loss": 5.498,
+      "step": 500
+    },
+    {
+      "epoch": 0.064,
+      "eval_loss": 1.3493109941482544,
+      "eval_runtime": 7.1309,
+      "eval_samples_per_second": 140.234,
+      "eval_steps_per_second": 17.529,
+      "step": 500
+    },
+    {
+      "epoch": 0.06464,
+      "grad_norm": 4.536627769470215,
+      "learning_rate": 9.97904693505723e-06,
+      "loss": 5.5237,
+      "step": 505
+    },
+    {
+      "epoch": 0.06528,
+      "grad_norm": 3.204948902130127,
+      "learning_rate": 9.978583203958649e-06,
+      "loss": 5.3746,
+      "step": 510
+    },
+    {
+      "epoch": 0.06592,
+      "grad_norm": 3.4658005237579346,
+      "learning_rate": 9.978114408260118e-06,
+      "loss": 5.4567,
+      "step": 515
+    },
+    {
+      "epoch": 0.06656,
+      "grad_norm": 4.932333469390869,
+      "learning_rate": 9.977640548438534e-06,
+      "loss": 5.1959,
+      "step": 520
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 3.4697563648223877,
+      "learning_rate": 9.977161624975948e-06,
+      "loss": 5.4013,
+      "step": 525
+    },
+    {
+      "epoch": 0.06784,
+      "grad_norm": 3.441819667816162,
+      "learning_rate": 9.976677638359553e-06,
+      "loss": 5.4899,
+      "step": 530
+    },
+    {
+      "epoch": 0.06848,
+      "grad_norm": 3.4293930530548096,
+      "learning_rate": 9.9761885890817e-06,
+      "loss": 5.3569,
+      "step": 535
+    },
+    {
+      "epoch": 0.06912,
+      "grad_norm": 3.5388574600219727,
+      "learning_rate": 9.975694477639885e-06,
+      "loss": 5.2739,
+      "step": 540
+    },
+    {
+      "epoch": 0.06976,
+      "grad_norm": 3.735548973083496,
+      "learning_rate": 9.97519530453676e-06,
+      "loss": 5.4253,
+      "step": 545
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 3.33503794670105,
+      "learning_rate": 9.974691070280121e-06,
+      "loss": 5.1569,
+      "step": 550
+    },
+    {
+      "epoch": 0.07104,
+      "grad_norm": 3.5171401500701904,
+      "learning_rate": 9.974181775382915e-06,
+      "loss": 5.3242,
+      "step": 555
+    },
+    {
+      "epoch": 0.07168,
+      "grad_norm": 3.565356969833374,
+      "learning_rate": 9.973667420363233e-06,
+      "loss": 5.3893,
+      "step": 560
+    },
+    {
+      "epoch": 0.07232,
+      "grad_norm": 3.172163248062134,
+      "learning_rate": 9.973148005744319e-06,
+      "loss": 5.3824,
+      "step": 565
+    },
+    {
+      "epoch": 0.07296,
+      "grad_norm": 3.517838716506958,
+      "learning_rate": 9.972623532054564e-06,
+      "loss": 5.2673,
+      "step": 570
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 3.328416585922241,
+      "learning_rate": 9.9720939998275e-06,
+      "loss": 5.2649,
+      "step": 575
+    },
+    {
+      "epoch": 0.07424,
+      "grad_norm": 3.475539445877075,
+      "learning_rate": 9.971559409601807e-06,
+      "loss": 5.3318,
+      "step": 580
+    },
+    {
+      "epoch": 0.07488,
+      "grad_norm": 3.492013692855835,
+      "learning_rate": 9.971019761921317e-06,
+      "loss": 5.2735,
+      "step": 585
+    },
+    {
+      "epoch": 0.07552,
+      "grad_norm": 3.474803924560547,
+      "learning_rate": 9.970475057334997e-06,
+      "loss": 5.3722,
+      "step": 590
+    },
+    {
+      "epoch": 0.07616,
+      "grad_norm": 3.4162726402282715,
+      "learning_rate": 9.96992529639696e-06,
+      "loss": 5.3901,
+      "step": 595
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 3.3643155097961426,
+      "learning_rate": 9.969370479666473e-06,
+      "loss": 5.2384,
+      "step": 600
+    },
+    {
+      "epoch": 0.0768,
+      "eval_loss": 1.3373793363571167,
+      "eval_runtime": 6.5847,
+      "eval_samples_per_second": 151.867,
+      "eval_steps_per_second": 18.983,
+      "step": 600
+    },
+    {
+      "epoch": 0.07744,
+      "grad_norm": 3.44301176071167,
+      "learning_rate": 9.968810607707933e-06,
+      "loss": 5.2322,
+      "step": 605
+    },
+    {
+      "epoch": 0.07808,
+      "grad_norm": 3.422262668609619,
+      "learning_rate": 9.968245681090887e-06,
+      "loss": 5.1708,
+      "step": 610
+    },
+    {
+      "epoch": 0.07872,
+      "grad_norm": 3.2879252433776855,
+      "learning_rate": 9.96767570039002e-06,
+      "loss": 5.2291,
+      "step": 615
+    },
+    {
+      "epoch": 0.07936,
+      "grad_norm": 3.6026480197906494,
+      "learning_rate": 9.967100666185163e-06,
+      "loss": 5.4241,
+      "step": 620
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 3.3642101287841797,
+      "learning_rate": 9.966520579061286e-06,
+      "loss": 5.4473,
+      "step": 625
+    },
+    {
+      "epoch": 0.08064,
+      "grad_norm": 3.5968470573425293,
+      "learning_rate": 9.965935439608493e-06,
+      "loss": 5.3982,
+      "step": 630
+    },
+    {
+      "epoch": 0.08128,
+      "grad_norm": 3.352083206176758,
+      "learning_rate": 9.96534524842204e-06,
+      "loss": 5.3953,
+      "step": 635
+    },
+    {
+      "epoch": 0.08192,
+      "grad_norm": 3.3571720123291016,
+      "learning_rate": 9.964750006102311e-06,
+      "loss": 5.3159,
+      "step": 640
+    },
+    {
+      "epoch": 0.08256,
+      "grad_norm": 3.486246109008789,
+      "learning_rate": 9.964149713254833e-06,
+      "loss": 5.211,
+      "step": 645
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 3.674906015396118,
+      "learning_rate": 9.96354437049027e-06,
+      "loss": 5.3374,
+      "step": 650
+    },
+    {
+      "epoch": 0.08384,
+      "grad_norm": 3.590810537338257,
+      "learning_rate": 9.962933978424426e-06,
+      "loss": 5.2194,
+      "step": 655
+    },
+    {
+      "epoch": 0.08448,
+      "grad_norm": 3.551786184310913,
+      "learning_rate": 9.962318537678238e-06,
+      "loss": 5.1187,
+      "step": 660
+    },
+    {
+      "epoch": 0.08512,
+      "grad_norm": 3.5391581058502197,
+      "learning_rate": 9.961698048877776e-06,
+      "loss": 5.2001,
+      "step": 665
+    },
+    {
+      "epoch": 0.08576,
+      "grad_norm": 3.6105592250823975,
+      "learning_rate": 9.961072512654255e-06,
+      "loss": 5.2758,
+      "step": 670
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 3.7463858127593994,
+      "learning_rate": 9.960441929644017e-06,
+      "loss": 5.2137,
+      "step": 675
+    },
+    {
+      "epoch": 0.08704,
+      "grad_norm": 3.9237470626831055,
+      "learning_rate": 9.959806300488538e-06,
+      "loss": 5.2047,
+      "step": 680
+    },
+    {
+      "epoch": 0.08768,
+      "grad_norm": 3.392827272415161,
+      "learning_rate": 9.95916562583443e-06,
+      "loss": 5.3071,
+      "step": 685
+    },
+    {
+      "epoch": 0.08832,
+      "grad_norm": 3.221484661102295,
+      "learning_rate": 9.958519906333438e-06,
+      "loss": 5.183,
+      "step": 690
+    },
+    {
+      "epoch": 0.08896,
+      "grad_norm": 3.5143983364105225,
+      "learning_rate": 9.957869142642437e-06,
+      "loss": 5.3171,
+      "step": 695
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 3.497072696685791,
+      "learning_rate": 9.957213335423433e-06,
+      "loss": 5.1784,
+      "step": 700
+    },
+    {
+      "epoch": 0.0896,
+      "eval_loss": 1.2988511323928833,
+      "eval_runtime": 6.9763,
+      "eval_samples_per_second": 143.342,
+      "eval_steps_per_second": 17.918,
+      "step": 700
+    },
+    {
+      "epoch": 0.09024,
+      "grad_norm": 3.3822438716888428,
+      "learning_rate": 9.956552485343566e-06,
+      "loss": 5.1732,
+      "step": 705
+    },
+    {
+      "epoch": 0.09088,
+      "grad_norm": 3.3949694633483887,
+      "learning_rate": 9.955886593075101e-06,
+      "loss": 5.2725,
+      "step": 710
+    },
+    {
+      "epoch": 0.09152,
+      "grad_norm": 3.2577288150787354,
+      "learning_rate": 9.955215659295438e-06,
+      "loss": 5.2207,
+      "step": 715
+    },
+    {
+      "epoch": 0.09216,
+      "grad_norm": 3.769519567489624,
+      "learning_rate": 9.954539684687103e-06,
+      "loss": 5.2152,
+      "step": 720
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 3.3824892044067383,
+      "learning_rate": 9.953858669937746e-06,
+      "loss": 5.2085,
+      "step": 725
+    },
+    {
+      "epoch": 0.09344,
+      "grad_norm": 3.771742105484009,
+      "learning_rate": 9.953172615740152e-06,
+      "loss": 5.1575,
+      "step": 730
+    },
+    {
+      "epoch": 0.09408,
+      "grad_norm": 3.7706689834594727,
+      "learning_rate": 9.952481522792226e-06,
+      "loss": 4.9608,
+      "step": 735
+    },
+    {
+      "epoch": 0.09472,
+      "grad_norm": 3.8110334873199463,
+      "learning_rate": 9.951785391797001e-06,
+      "loss": 5.21,
+      "step": 740
+    },
+    {
+      "epoch": 0.09536,
+      "grad_norm": 3.3012993335723877,
+      "learning_rate": 9.951084223462636e-06,
+      "loss": 5.2475,
+      "step": 745
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 3.6353518962860107,
+      "learning_rate": 9.950378018502415e-06,
+      "loss": 5.0985,
+      "step": 750
+    },
+    {
+      "epoch": 0.09664,
+      "grad_norm": 3.369378089904785,
+      "learning_rate": 9.949666777634743e-06,
+      "loss": 5.1986,
+      "step": 755
+    },
+    {
+      "epoch": 0.09728,
+      "grad_norm": 3.2247676849365234,
+      "learning_rate": 9.948950501583147e-06,
+      "loss": 5.3192,
+      "step": 760
+    },
+    {
+      "epoch": 0.09792,
+      "grad_norm": 3.6966888904571533,
+      "learning_rate": 9.948229191076284e-06,
+      "loss": 5.1654,
+      "step": 765
+    },
+    {
+      "epoch": 0.09856,
+      "grad_norm": 3.5823962688446045,
+      "learning_rate": 9.947502846847921e-06,
+      "loss": 5.1351,
+      "step": 770
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 3.5258729457855225,
+      "learning_rate": 9.946771469636955e-06,
+      "loss": 5.1745,
+      "step": 775
+    },
+    {
+      "epoch": 0.09984,
+      "grad_norm": 3.42067813873291,
+      "learning_rate": 9.946035060187398e-06,
+      "loss": 5.1569,
+      "step": 780
+    },
+    {
+      "epoch": 0.10048,
+      "grad_norm": 3.9832825660705566,
+      "learning_rate": 9.945293619248383e-06,
+      "loss": 4.9796,
+      "step": 785
+    },
+    {
+      "epoch": 0.10112,
+      "grad_norm": 3.742013692855835,
+      "learning_rate": 9.944547147574162e-06,
+      "loss": 5.1625,
+      "step": 790
+    },
+    {
+      "epoch": 0.10176,
+      "grad_norm": 3.3150367736816406,
+      "learning_rate": 9.943795645924104e-06,
+      "loss": 5.099,
+      "step": 795
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 3.359069585800171,
+      "learning_rate": 9.943039115062691e-06,
+      "loss": 5.1877,
+      "step": 800
+    },
+    {
+      "epoch": 0.1024,
+      "eval_loss": 1.2946017980575562,
+      "eval_runtime": 7.4306,
+      "eval_samples_per_second": 134.579,
+      "eval_steps_per_second": 16.822,
+      "step": 800
+    },
+    {
+      "epoch": 0.10304,
+      "grad_norm": 3.703000545501709,
+      "learning_rate": 9.94227755575953e-06,
+      "loss": 5.1581,
+      "step": 805
+    },
+    {
+      "epoch": 0.10368,
+      "grad_norm": 3.5370070934295654,
+      "learning_rate": 9.941510968789334e-06,
+      "loss": 5.2402,
+      "step": 810
+    },
+    {
+      "epoch": 0.10432,
+      "grad_norm": 3.5010828971862793,
+      "learning_rate": 9.940739354931936e-06,
+      "loss": 5.1828,
+      "step": 815
+    },
+    {
+      "epoch": 0.10496,
+      "grad_norm": 3.4637820720672607,
+      "learning_rate": 9.93996271497228e-06,
+      "loss": 5.1792,
+      "step": 820
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 3.409712076187134,
+      "learning_rate": 9.939181049700427e-06,
+      "loss": 5.0721,
+      "step": 825
+    },
+    {
+      "epoch": 0.10624,
+      "grad_norm": 3.589414596557617,
+      "learning_rate": 9.938394359911545e-06,
+      "loss": 5.234,
+      "step": 830
+    },
+    {
+      "epoch": 0.10688,
+      "grad_norm": 3.444977045059204,
+      "learning_rate": 9.937602646405918e-06,
+      "loss": 4.9763,
+      "step": 835
+    },
+    {
+      "epoch": 0.10752,
+      "grad_norm": 3.3560900688171387,
+      "learning_rate": 9.936805909988935e-06,
+      "loss": 5.2006,
+      "step": 840
+    },
+    {
+      "epoch": 0.10816,
+      "grad_norm": 3.345703601837158,
+      "learning_rate": 9.9360041514711e-06,
+      "loss": 5.0287,
+      "step": 845
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 3.492363691329956,
+      "learning_rate": 9.935197371668024e-06,
+      "loss": 5.0908,
+      "step": 850
+    },
+    {
+      "epoch": 0.10944,
+      "grad_norm": 7.459951400756836,
+      "learning_rate": 9.934385571400425e-06,
+      "loss": 5.1735,
+      "step": 855
+    },
+    {
+      "epoch": 0.11008,
+      "grad_norm": 3.5033841133117676,
+      "learning_rate": 9.933568751494131e-06,
+      "loss": 5.053,
+      "step": 860
+    },
+    {
+      "epoch": 0.11072,
+      "grad_norm": 3.5542259216308594,
+      "learning_rate": 9.93274691278007e-06,
+      "loss": 5.1463,
+      "step": 865
+    },
+    {
+      "epoch": 0.11136,
+      "grad_norm": 3.3819243907928467,
+      "learning_rate": 9.931920056094285e-06,
+      "loss": 5.0397,
+      "step": 870
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 3.406768798828125,
+      "learning_rate": 9.931088182277915e-06,
+      "loss": 5.179,
+      "step": 875
+    },
+    {
+      "epoch": 0.11264,
+      "grad_norm": 5.960773944854736,
+      "learning_rate": 9.930251292177206e-06,
+      "loss": 5.217,
+      "step": 880
+    },
+    {
+      "epoch": 0.11328,
+      "grad_norm": 3.5821049213409424,
+      "learning_rate": 9.929409386643511e-06,
+      "loss": 5.0374,
+      "step": 885
+    },
+    {
+      "epoch": 0.11392,
+      "grad_norm": 3.3204903602600098,
+      "learning_rate": 9.928562466533279e-06,
+      "loss": 5.1856,
+      "step": 890
+    },
+    {
+      "epoch": 0.11456,
+      "grad_norm": 4.022350788116455,
+      "learning_rate": 9.927710532708064e-06,
+      "loss": 5.1051,
+      "step": 895
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 3.3810718059539795,
+      "learning_rate": 9.926853586034515e-06,
+      "loss": 5.1691,
+      "step": 900
+    },
+    {
+      "epoch": 0.1152,
+      "eval_loss": 1.2660380601882935,
+      "eval_runtime": 6.8853,
+      "eval_samples_per_second": 145.238,
+      "eval_steps_per_second": 18.155,
+      "step": 900
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 15624,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.141806761967616e+17,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8f6b2a903b8a705cf6470fa099fd4e529608302cc9f11777eca1699f316b366f
+size 5240