Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

added_tokens.json +5 -0
gliner_config.json +144 -0
pytorch_model.bin +3 -0
special_tokens_map.json +15 -0
spm.model +3 -0
tokenizer.json +0 -0
tokenizer_config.json +75 -0
trainer_state.json +1465 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "<<ENT>>": 128001,
+  "<<SEP>>": 128002,
+  "[MASK]": 128000
+}

gliner_config.json ADDED Viewed

	@@ -0,0 +1,144 @@

+{
+  "class_token_index": 128001,
+  "dropout": 0.4,
+  "embed_ent_token": true,
+  "encoder_config": {
+    "_attn_implementation_autoset": false,
+    "_name_or_path": "microsoft/deberta-v3-large",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_probs_dropout_prob": 0.1,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.1,
+    "hidden_size": 1024,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-07,
+    "legacy": true,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 512,
+    "max_relative_positions": -1,
+    "min_length": 0,
+    "model_type": "deberta-v2",
+    "no_repeat_ngram_size": 0,
+    "norm_rel_ebd": "layer_norm",
+    "num_attention_heads": 16,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 24,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 0,
+    "pooler_dropout": 0,
+    "pooler_hidden_act": "gelu",
+    "pooler_hidden_size": 1024,
+    "pos_att_type": [
+      "p2c",
+      "c2p"
+    ],
+    "position_biased_input": false,
+    "position_buckets": 256,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "relative_attention": true,
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "share_att_key": true,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "type_vocab_size": 0,
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "vocab_size": 128003
+  },
+  "ent_token": "<<ENT>>",
+  "eval_every": 10000,
+  "fine_tune": true,
+  "freeze_token_rep": false,
+  "fuse_layers": false,
+  "has_rnn": true,
+  "hidden_size": 768,
+  "label_smoothing": 0.0,
+  "labels_encoder": null,
+  "labels_encoder_config": null,
+  "labels_fusion_schema": "",
+  "log_dir": "models/",
+  "loss_alpha": 0.75,
+  "loss_gamma": -1,
+  "loss_reduction": "sum",
+  "lr_encoder": "1e-5",
+  "lr_others": "5e-5",
+  "max_grad_norm": 1.0,
+  "max_len": 1024,
+  "max_neg_type_ratio": 1,
+  "max_types": 30,
+  "max_width": 12,
+  "model_name": "microsoft/deberta-v3-large",
+  "model_type": "gliner",
+  "name": "span level gliner",
+  "num_post_fusion_layers": 1,
+  "num_rnns": 1,
+  "num_steps": 20000,
+  "post_fusion_schema": "",
+  "pre_fusion": true,
+  "prev_path": null,
+  "random_drop": true,
+  "root_dir": "gliner_logs",
+  "save_total_limit": 5,
+  "scheduler_type": "linear",
+  "sep_token": "<<SEP>>",
+  "shuffle_types": true,
+  "size_sup": -1,
+  "span_mode": "markerV0",
+  "subtoken_pooling": "first",
+  "train_batch_size": 4,
+  "train_data": "../data/post_ner.json",
+  "transformers_version": "4.49.0",
+  "val_data_dir": "none",
+  "vocab_size": 128003,
+  "warmup_ratio": 0.1,
+  "weight_decay_encoder": 0.01,
+  "weight_decay_other": 0.01,
+  "words_splitter_type": "whitespace"
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a2bcd21b4a1f5c6c52e6482bb12b380d8e2e8c15dc3747ad1064343b4e77ac23
+size 1838129390

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "bos_token": "[CLS]",
+  "cls_token": "[CLS]",
+  "eos_token": "[SEP]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

spm.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
+size 2464616

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,75 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128000": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128001": {
+      "content": "<<ENT>>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128002": {
+      "content": "<<SEP>>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "[CLS]",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_lower_case": false,
+  "eos_token": "[SEP]",
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "sp_model_kwargs": {},
+  "split_by_punct": false,
+  "tokenizer_class": "DebertaV2Tokenizer",
+  "unk_token": "[UNK]",
+  "vocab_type": "spm"
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1465 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 4.423800044238001,
+  "eval_steps": 500,
+  "global_step": 20000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.022119000221190004,
+      "grad_norm": 329782.40625,
+      "learning_rate": 2.5e-06,
+      "loss": 11183.8312,
+      "step": 100
+    },
+    {
+      "epoch": 0.04423800044238001,
+      "grad_norm": 578.6302490234375,
+      "learning_rate": 5e-06,
+      "loss": 501.9779,
+      "step": 200
+    },
+    {
+      "epoch": 0.06635700066357,
+      "grad_norm": 3675.665283203125,
+      "learning_rate": 7.5e-06,
+      "loss": 299.6076,
+      "step": 300
+    },
+    {
+      "epoch": 0.08847600088476001,
+      "grad_norm": 2041.4144287109375,
+      "learning_rate": 1e-05,
+      "loss": 327.6098,
+      "step": 400
+    },
+    {
+      "epoch": 0.11059500110595001,
+      "grad_norm": 229.52532958984375,
+      "learning_rate": 1.25e-05,
+      "loss": 383.8903,
+      "step": 500
+    },
+    {
+      "epoch": 0.13271400132714,
+      "grad_norm": 839.3260498046875,
+      "learning_rate": 1.5e-05,
+      "loss": 229.51,
+      "step": 600
+    },
+    {
+      "epoch": 0.15483300154833002,
+      "grad_norm": 2449.822021484375,
+      "learning_rate": 1.75e-05,
+      "loss": 393.3861,
+      "step": 700
+    },
+    {
+      "epoch": 0.17695200176952003,
+      "grad_norm": 596.9613037109375,
+      "learning_rate": 2e-05,
+      "loss": 170.8953,
+      "step": 800
+    },
+    {
+      "epoch": 0.19907100199071,
+      "grad_norm": 718.4219970703125,
+      "learning_rate": 2.25e-05,
+      "loss": 270.5029,
+      "step": 900
+    },
+    {
+      "epoch": 0.22119000221190002,
+      "grad_norm": 765.644775390625,
+      "learning_rate": 2.5e-05,
+      "loss": 240.2428,
+      "step": 1000
+    },
+    {
+      "epoch": 0.24330900243309003,
+      "grad_norm": 757.395751953125,
+      "learning_rate": 2.7500000000000004e-05,
+      "loss": 202.8483,
+      "step": 1100
+    },
+    {
+      "epoch": 0.26542800265428,
+      "grad_norm": 2641.68408203125,
+      "learning_rate": 3e-05,
+      "loss": 197.4493,
+      "step": 1200
+    },
+    {
+      "epoch": 0.28754700287547,
+      "grad_norm": 295.59307861328125,
+      "learning_rate": 3.2500000000000004e-05,
+      "loss": 166.7725,
+      "step": 1300
+    },
+    {
+      "epoch": 0.30966600309666004,
+      "grad_norm": 735.7752075195312,
+      "learning_rate": 3.5e-05,
+      "loss": 157.2355,
+      "step": 1400
+    },
+    {
+      "epoch": 0.33178500331785005,
+      "grad_norm": 596.8727416992188,
+      "learning_rate": 3.7500000000000003e-05,
+      "loss": 206.7806,
+      "step": 1500
+    },
+    {
+      "epoch": 0.35390400353904006,
+      "grad_norm": 1486.5274658203125,
+      "learning_rate": 4e-05,
+      "loss": 187.7721,
+      "step": 1600
+    },
+    {
+      "epoch": 0.37602300376023,
+      "grad_norm": 1032.82763671875,
+      "learning_rate": 4.25e-05,
+      "loss": 186.7065,
+      "step": 1700
+    },
+    {
+      "epoch": 0.39814200398142,
+      "grad_norm": 563.8107299804688,
+      "learning_rate": 4.5e-05,
+      "loss": 182.7577,
+      "step": 1800
+    },
+    {
+      "epoch": 0.42026100420261003,
+      "grad_norm": 287.0451354980469,
+      "learning_rate": 4.75e-05,
+      "loss": 279.2049,
+      "step": 1900
+    },
+    {
+      "epoch": 0.44238000442380004,
+      "grad_norm": 984.99853515625,
+      "learning_rate": 5e-05,
+      "loss": 126.9518,
+      "step": 2000
+    },
+    {
+      "epoch": 0.46449900464499005,
+      "grad_norm": 3091.0703125,
+      "learning_rate": 4.972222222222223e-05,
+      "loss": 172.3027,
+      "step": 2100
+    },
+    {
+      "epoch": 0.48661800486618007,
+      "grad_norm": 3833.43798828125,
+      "learning_rate": 4.9444444444444446e-05,
+      "loss": 252.3367,
+      "step": 2200
+    },
+    {
+      "epoch": 0.5087370050873701,
+      "grad_norm": 765.114013671875,
+      "learning_rate": 4.9166666666666665e-05,
+      "loss": 226.8686,
+      "step": 2300
+    },
+    {
+      "epoch": 0.53085600530856,
+      "grad_norm": 452.7674255371094,
+      "learning_rate": 4.888888888888889e-05,
+      "loss": 134.841,
+      "step": 2400
+    },
+    {
+      "epoch": 0.5529750055297501,
+      "grad_norm": 836.827880859375,
+      "learning_rate": 4.8611111111111115e-05,
+      "loss": 225.3301,
+      "step": 2500
+    },
+    {
+      "epoch": 0.57509400575094,
+      "grad_norm": 363.256591796875,
+      "learning_rate": 4.8333333333333334e-05,
+      "loss": 148.2828,
+      "step": 2600
+    },
+    {
+      "epoch": 0.59721300597213,
+      "grad_norm": 501.0595397949219,
+      "learning_rate": 4.805555555555556e-05,
+      "loss": 93.7487,
+      "step": 2700
+    },
+    {
+      "epoch": 0.6193320061933201,
+      "grad_norm": 334.7666931152344,
+      "learning_rate": 4.7777777777777784e-05,
+      "loss": 88.7822,
+      "step": 2800
+    },
+    {
+      "epoch": 0.64145100641451,
+      "grad_norm": 503.07720947265625,
+      "learning_rate": 4.75e-05,
+      "loss": 253.4651,
+      "step": 2900
+    },
+    {
+      "epoch": 0.6635700066357001,
+      "grad_norm": 53895.6015625,
+      "learning_rate": 4.722222222222222e-05,
+      "loss": 147.7988,
+      "step": 3000
+    },
+    {
+      "epoch": 0.68568900685689,
+      "grad_norm": 828.54296875,
+      "learning_rate": 4.6944444444444446e-05,
+      "loss": 138.5155,
+      "step": 3100
+    },
+    {
+      "epoch": 0.7078080070780801,
+      "grad_norm": 278.7898864746094,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 132.9807,
+      "step": 3200
+    },
+    {
+      "epoch": 0.7299270072992701,
+      "grad_norm": 330.3460693359375,
+      "learning_rate": 4.638888888888889e-05,
+      "loss": 118.2184,
+      "step": 3300
+    },
+    {
+      "epoch": 0.75204600752046,
+      "grad_norm": 1073.325439453125,
+      "learning_rate": 4.6111111111111115e-05,
+      "loss": 131.6948,
+      "step": 3400
+    },
+    {
+      "epoch": 0.7741650077416501,
+      "grad_norm": 679.9262084960938,
+      "learning_rate": 4.5833333333333334e-05,
+      "loss": 116.4453,
+      "step": 3500
+    },
+    {
+      "epoch": 0.79628400796284,
+      "grad_norm": 828.5316162109375,
+      "learning_rate": 4.555555555555556e-05,
+      "loss": 101.1161,
+      "step": 3600
+    },
+    {
+      "epoch": 0.8184030081840301,
+      "grad_norm": 26458.435546875,
+      "learning_rate": 4.527777777777778e-05,
+      "loss": 210.2304,
+      "step": 3700
+    },
+    {
+      "epoch": 0.8405220084052201,
+      "grad_norm": 485.78900146484375,
+      "learning_rate": 4.5e-05,
+      "loss": 138.0961,
+      "step": 3800
+    },
+    {
+      "epoch": 0.8626410086264101,
+      "grad_norm": 2006.3243408203125,
+      "learning_rate": 4.472222222222223e-05,
+      "loss": 88.3029,
+      "step": 3900
+    },
+    {
+      "epoch": 0.8847600088476001,
+      "grad_norm": 7547.01123046875,
+      "learning_rate": 4.4444444444444447e-05,
+      "loss": 134.9624,
+      "step": 4000
+    },
+    {
+      "epoch": 0.90687900906879,
+      "grad_norm": 778.521728515625,
+      "learning_rate": 4.4166666666666665e-05,
+      "loss": 115.0598,
+      "step": 4100
+    },
+    {
+      "epoch": 0.9289980092899801,
+      "grad_norm": 593.7921752929688,
+      "learning_rate": 4.388888888888889e-05,
+      "loss": 161.5183,
+      "step": 4200
+    },
+    {
+      "epoch": 0.9511170095111701,
+      "grad_norm": 2227.79248046875,
+      "learning_rate": 4.3611111111111116e-05,
+      "loss": 111.9892,
+      "step": 4300
+    },
+    {
+      "epoch": 0.9732360097323601,
+      "grad_norm": 422.0806579589844,
+      "learning_rate": 4.3333333333333334e-05,
+      "loss": 76.96,
+      "step": 4400
+    },
+    {
+      "epoch": 0.9953550099535501,
+      "grad_norm": 922.49169921875,
+      "learning_rate": 4.305555555555556e-05,
+      "loss": 64.7625,
+      "step": 4500
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 218.93350219726562,
+      "eval_runtime": 62.6127,
+      "eval_samples_per_second": 32.102,
+      "eval_steps_per_second": 8.034,
+      "step": 4521
+    },
+    {
+      "epoch": 1.0174740101747402,
+      "grad_norm": 1710.8536376953125,
+      "learning_rate": 4.277777777777778e-05,
+      "loss": 88.4653,
+      "step": 4600
+    },
+    {
+      "epoch": 1.03959301039593,
+      "grad_norm": 2913.376708984375,
+      "learning_rate": 4.25e-05,
+      "loss": 71.536,
+      "step": 4700
+    },
+    {
+      "epoch": 1.06171201061712,
+      "grad_norm": 170.909912109375,
+      "learning_rate": 4.222222222222222e-05,
+      "loss": 98.4133,
+      "step": 4800
+    },
+    {
+      "epoch": 1.08383101083831,
+      "grad_norm": 1084.526611328125,
+      "learning_rate": 4.194444444444445e-05,
+      "loss": 136.51,
+      "step": 4900
+    },
+    {
+      "epoch": 1.1059500110595002,
+      "grad_norm": 1848.7337646484375,
+      "learning_rate": 4.166666666666667e-05,
+      "loss": 92.1182,
+      "step": 5000
+    },
+    {
+      "epoch": 1.1280690112806901,
+      "grad_norm": 446.60064697265625,
+      "learning_rate": 4.138888888888889e-05,
+      "loss": 90.7799,
+      "step": 5100
+    },
+    {
+      "epoch": 1.15018801150188,
+      "grad_norm": 1472.4832763671875,
+      "learning_rate": 4.111111111111111e-05,
+      "loss": 91.3228,
+      "step": 5200
+    },
+    {
+      "epoch": 1.17230701172307,
+      "grad_norm": 2101.265625,
+      "learning_rate": 4.0833333333333334e-05,
+      "loss": 95.7063,
+      "step": 5300
+    },
+    {
+      "epoch": 1.1944260119442602,
+      "grad_norm": 99.53836059570312,
+      "learning_rate": 4.055555555555556e-05,
+      "loss": 74.0728,
+      "step": 5400
+    },
+    {
+      "epoch": 1.2165450121654502,
+      "grad_norm": 4299.07958984375,
+      "learning_rate": 4.027777777777778e-05,
+      "loss": 98.044,
+      "step": 5500
+    },
+    {
+      "epoch": 1.2386640123866401,
+      "grad_norm": 574.811279296875,
+      "learning_rate": 4e-05,
+      "loss": 83.4273,
+      "step": 5600
+    },
+    {
+      "epoch": 1.26078301260783,
+      "grad_norm": 792.8590087890625,
+      "learning_rate": 3.972222222222222e-05,
+      "loss": 138.9129,
+      "step": 5700
+    },
+    {
+      "epoch": 1.28290201282902,
+      "grad_norm": 1043.4178466796875,
+      "learning_rate": 3.944444444444445e-05,
+      "loss": 82.8142,
+      "step": 5800
+    },
+    {
+      "epoch": 1.3050210130502102,
+      "grad_norm": 793.00830078125,
+      "learning_rate": 3.9166666666666665e-05,
+      "loss": 80.1177,
+      "step": 5900
+    },
+    {
+      "epoch": 1.3271400132714002,
+      "grad_norm": 645.7152709960938,
+      "learning_rate": 3.888888888888889e-05,
+      "loss": 67.7325,
+      "step": 6000
+    },
+    {
+      "epoch": 1.3492590134925901,
+      "grad_norm": 284.2798767089844,
+      "learning_rate": 3.8611111111111116e-05,
+      "loss": 115.4644,
+      "step": 6100
+    },
+    {
+      "epoch": 1.37137801371378,
+      "grad_norm": 1235.421142578125,
+      "learning_rate": 3.8333333333333334e-05,
+      "loss": 85.7536,
+      "step": 6200
+    },
+    {
+      "epoch": 1.39349701393497,
+      "grad_norm": 861.7305908203125,
+      "learning_rate": 3.805555555555555e-05,
+      "loss": 72.2837,
+      "step": 6300
+    },
+    {
+      "epoch": 1.4156160141561602,
+      "grad_norm": 2955.749755859375,
+      "learning_rate": 3.777777777777778e-05,
+      "loss": 88.7108,
+      "step": 6400
+    },
+    {
+      "epoch": 1.4377350143773502,
+      "grad_norm": 131.04417419433594,
+      "learning_rate": 3.7500000000000003e-05,
+      "loss": 55.0814,
+      "step": 6500
+    },
+    {
+      "epoch": 1.4598540145985401,
+      "grad_norm": 1373.429931640625,
+      "learning_rate": 3.722222222222222e-05,
+      "loss": 75.0703,
+      "step": 6600
+    },
+    {
+      "epoch": 1.48197301481973,
+      "grad_norm": 515.3665771484375,
+      "learning_rate": 3.694444444444445e-05,
+      "loss": 91.6801,
+      "step": 6700
+    },
+    {
+      "epoch": 1.50409201504092,
+      "grad_norm": 3621.946044921875,
+      "learning_rate": 3.6666666666666666e-05,
+      "loss": 120.4573,
+      "step": 6800
+    },
+    {
+      "epoch": 1.5262110152621102,
+      "grad_norm": 139.5118408203125,
+      "learning_rate": 3.638888888888889e-05,
+      "loss": 59.9174,
+      "step": 6900
+    },
+    {
+      "epoch": 1.5483300154833002,
+      "grad_norm": 6172.529296875,
+      "learning_rate": 3.611111111111111e-05,
+      "loss": 74.676,
+      "step": 7000
+    },
+    {
+      "epoch": 1.5704490157044901,
+      "grad_norm": 1593.488037109375,
+      "learning_rate": 3.5833333333333335e-05,
+      "loss": 72.2317,
+      "step": 7100
+    },
+    {
+      "epoch": 1.5925680159256803,
+      "grad_norm": 1400.1859130859375,
+      "learning_rate": 3.555555555555556e-05,
+      "loss": 93.2606,
+      "step": 7200
+    },
+    {
+      "epoch": 1.61468701614687,
+      "grad_norm": 687.2742309570312,
+      "learning_rate": 3.527777777777778e-05,
+      "loss": 83.7336,
+      "step": 7300
+    },
+    {
+      "epoch": 1.6368060163680602,
+      "grad_norm": 237.36953735351562,
+      "learning_rate": 3.5e-05,
+      "loss": 68.9703,
+      "step": 7400
+    },
+    {
+      "epoch": 1.6589250165892502,
+      "grad_norm": 2617.44482421875,
+      "learning_rate": 3.472222222222222e-05,
+      "loss": 55.2706,
+      "step": 7500
+    },
+    {
+      "epoch": 1.6810440168104401,
+      "grad_norm": 164.34164428710938,
+      "learning_rate": 3.444444444444445e-05,
+      "loss": 64.0422,
+      "step": 7600
+    },
+    {
+      "epoch": 1.7031630170316303,
+      "grad_norm": 550.0332641601562,
+      "learning_rate": 3.4166666666666666e-05,
+      "loss": 93.087,
+      "step": 7700
+    },
+    {
+      "epoch": 1.72528201725282,
+      "grad_norm": 411.0719909667969,
+      "learning_rate": 3.388888888888889e-05,
+      "loss": 82.8394,
+      "step": 7800
+    },
+    {
+      "epoch": 1.7474010174740102,
+      "grad_norm": 383.0977783203125,
+      "learning_rate": 3.3611111111111116e-05,
+      "loss": 95.9364,
+      "step": 7900
+    },
+    {
+      "epoch": 1.7695200176952002,
+      "grad_norm": 100.02352905273438,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 74.7701,
+      "step": 8000
+    },
+    {
+      "epoch": 1.7916390179163901,
+      "grad_norm": 930.63818359375,
+      "learning_rate": 3.3055555555555553e-05,
+      "loss": 97.197,
+      "step": 8100
+    },
+    {
+      "epoch": 1.8137580181375803,
+      "grad_norm": 201.84938049316406,
+      "learning_rate": 3.277777777777778e-05,
+      "loss": 78.3827,
+      "step": 8200
+    },
+    {
+      "epoch": 1.83587701835877,
+      "grad_norm": 429.5751953125,
+      "learning_rate": 3.2500000000000004e-05,
+      "loss": 87.7946,
+      "step": 8300
+    },
+    {
+      "epoch": 1.8579960185799602,
+      "grad_norm": 839.8375854492188,
+      "learning_rate": 3.222222222222223e-05,
+      "loss": 101.715,
+      "step": 8400
+    },
+    {
+      "epoch": 1.8801150188011502,
+      "grad_norm": 723.384765625,
+      "learning_rate": 3.194444444444444e-05,
+      "loss": 87.1683,
+      "step": 8500
+    },
+    {
+      "epoch": 1.9022340190223401,
+      "grad_norm": 695.84619140625,
+      "learning_rate": 3.1666666666666666e-05,
+      "loss": 53.4962,
+      "step": 8600
+    },
+    {
+      "epoch": 1.9243530192435303,
+      "grad_norm": 447.4176330566406,
+      "learning_rate": 3.138888888888889e-05,
+      "loss": 72.4244,
+      "step": 8700
+    },
+    {
+      "epoch": 1.94647201946472,
+      "grad_norm": 418.347900390625,
+      "learning_rate": 3.111111111111111e-05,
+      "loss": 84.3226,
+      "step": 8800
+    },
+    {
+      "epoch": 1.9685910196859102,
+      "grad_norm": 616.8546752929688,
+      "learning_rate": 3.0833333333333335e-05,
+      "loss": 57.9826,
+      "step": 8900
+    },
+    {
+      "epoch": 1.9907100199071002,
+      "grad_norm": 664.2003784179688,
+      "learning_rate": 3.055555555555556e-05,
+      "loss": 51.4834,
+      "step": 9000
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 142.8988037109375,
+      "eval_runtime": 61.1029,
+      "eval_samples_per_second": 32.895,
+      "eval_steps_per_second": 8.232,
+      "step": 9042
+    },
+    {
+      "epoch": 2.01282902012829,
+      "grad_norm": 418.1151123046875,
+      "learning_rate": 3.0277777777777776e-05,
+      "loss": 73.7381,
+      "step": 9100
+    },
+    {
+      "epoch": 2.0349480203494803,
+      "grad_norm": 404.5308532714844,
+      "learning_rate": 3e-05,
+      "loss": 58.1056,
+      "step": 9200
+    },
+    {
+      "epoch": 2.05706702057067,
+      "grad_norm": 2588.05322265625,
+      "learning_rate": 2.9722222222222223e-05,
+      "loss": 64.0834,
+      "step": 9300
+    },
+    {
+      "epoch": 2.07918602079186,
+      "grad_norm": 145.60726928710938,
+      "learning_rate": 2.9444444444444448e-05,
+      "loss": 62.3793,
+      "step": 9400
+    },
+    {
+      "epoch": 2.1013050210130504,
+      "grad_norm": 950.2636108398438,
+      "learning_rate": 2.916666666666667e-05,
+      "loss": 67.93,
+      "step": 9500
+    },
+    {
+      "epoch": 2.12342402123424,
+      "grad_norm": 557.125244140625,
+      "learning_rate": 2.8888888888888888e-05,
+      "loss": 54.7434,
+      "step": 9600
+    },
+    {
+      "epoch": 2.1455430214554303,
+      "grad_norm": 295.7925109863281,
+      "learning_rate": 2.861111111111111e-05,
+      "loss": 65.4703,
+      "step": 9700
+    },
+    {
+      "epoch": 2.16766202167662,
+      "grad_norm": 193.5789031982422,
+      "learning_rate": 2.8333333333333335e-05,
+      "loss": 58.263,
+      "step": 9800
+    },
+    {
+      "epoch": 2.18978102189781,
+      "grad_norm": 512.2490844726562,
+      "learning_rate": 2.8055555555555557e-05,
+      "loss": 47.4564,
+      "step": 9900
+    },
+    {
+      "epoch": 2.2119000221190004,
+      "grad_norm": 1932.7911376953125,
+      "learning_rate": 2.777777777777778e-05,
+      "loss": 61.474,
+      "step": 10000
+    },
+    {
+      "epoch": 2.23401902234019,
+      "grad_norm": 659.2971801757812,
+      "learning_rate": 2.7500000000000004e-05,
+      "loss": 38.0625,
+      "step": 10100
+    },
+    {
+      "epoch": 2.2561380225613803,
+      "grad_norm": 107.96562194824219,
+      "learning_rate": 2.7222222222222223e-05,
+      "loss": 75.2617,
+      "step": 10200
+    },
+    {
+      "epoch": 2.2782570227825705,
+      "grad_norm": 3581.160888671875,
+      "learning_rate": 2.6944444444444445e-05,
+      "loss": 83.6121,
+      "step": 10300
+    },
+    {
+      "epoch": 2.30037602300376,
+      "grad_norm": 1640.62158203125,
+      "learning_rate": 2.6666666666666667e-05,
+      "loss": 57.5173,
+      "step": 10400
+    },
+    {
+      "epoch": 2.3224950232249504,
+      "grad_norm": 758.580078125,
+      "learning_rate": 2.6388888888888892e-05,
+      "loss": 66.4868,
+      "step": 10500
+    },
+    {
+      "epoch": 2.34461402344614,
+      "grad_norm": 842.703857421875,
+      "learning_rate": 2.6111111111111114e-05,
+      "loss": 68.3939,
+      "step": 10600
+    },
+    {
+      "epoch": 2.3667330236673303,
+      "grad_norm": 452.1793518066406,
+      "learning_rate": 2.5833333333333336e-05,
+      "loss": 57.7072,
+      "step": 10700
+    },
+    {
+      "epoch": 2.3888520238885205,
+      "grad_norm": 357.23828125,
+      "learning_rate": 2.5555555555555554e-05,
+      "loss": 54.2972,
+      "step": 10800
+    },
+    {
+      "epoch": 2.41097102410971,
+      "grad_norm": 2001.415771484375,
+      "learning_rate": 2.527777777777778e-05,
+      "loss": 73.0164,
+      "step": 10900
+    },
+    {
+      "epoch": 2.4330900243309004,
+      "grad_norm": 2387.650634765625,
+      "learning_rate": 2.5e-05,
+      "loss": 56.2423,
+      "step": 11000
+    },
+    {
+      "epoch": 2.45520902455209,
+      "grad_norm": 1005.8760375976562,
+      "learning_rate": 2.4722222222222223e-05,
+      "loss": 57.5367,
+      "step": 11100
+    },
+    {
+      "epoch": 2.4773280247732803,
+      "grad_norm": 711.6175537109375,
+      "learning_rate": 2.4444444444444445e-05,
+      "loss": 54.2811,
+      "step": 11200
+    },
+    {
+      "epoch": 2.4994470249944705,
+      "grad_norm": 384.83013916015625,
+      "learning_rate": 2.4166666666666667e-05,
+      "loss": 58.3445,
+      "step": 11300
+    },
+    {
+      "epoch": 2.52156602521566,
+      "grad_norm": 280.5820007324219,
+      "learning_rate": 2.3888888888888892e-05,
+      "loss": 52.6407,
+      "step": 11400
+    },
+    {
+      "epoch": 2.5436850254368504,
+      "grad_norm": 3489.7578125,
+      "learning_rate": 2.361111111111111e-05,
+      "loss": 77.9429,
+      "step": 11500
+    },
+    {
+      "epoch": 2.56580402565804,
+      "grad_norm": 2384.225341796875,
+      "learning_rate": 2.3333333333333336e-05,
+      "loss": 60.7436,
+      "step": 11600
+    },
+    {
+      "epoch": 2.5879230258792303,
+      "grad_norm": 287.5812072753906,
+      "learning_rate": 2.3055555555555558e-05,
+      "loss": 42.1971,
+      "step": 11700
+    },
+    {
+      "epoch": 2.6100420261004205,
+      "grad_norm": 1218.7347412109375,
+      "learning_rate": 2.277777777777778e-05,
+      "loss": 46.3975,
+      "step": 11800
+    },
+    {
+      "epoch": 2.63216102632161,
+      "grad_norm": 1416.937255859375,
+      "learning_rate": 2.25e-05,
+      "loss": 59.6613,
+      "step": 11900
+    },
+    {
+      "epoch": 2.6542800265428004,
+      "grad_norm": 1019.8450317382812,
+      "learning_rate": 2.2222222222222223e-05,
+      "loss": 38.9072,
+      "step": 12000
+    },
+    {
+      "epoch": 2.67639902676399,
+      "grad_norm": 195.94032287597656,
+      "learning_rate": 2.1944444444444445e-05,
+      "loss": 61.2397,
+      "step": 12100
+    },
+    {
+      "epoch": 2.6985180269851803,
+      "grad_norm": 352.8352355957031,
+      "learning_rate": 2.1666666666666667e-05,
+      "loss": 58.8343,
+      "step": 12200
+    },
+    {
+      "epoch": 2.7206370272063705,
+      "grad_norm": 1884.443115234375,
+      "learning_rate": 2.138888888888889e-05,
+      "loss": 63.6136,
+      "step": 12300
+    },
+    {
+      "epoch": 2.74275602742756,
+      "grad_norm": 599.9818725585938,
+      "learning_rate": 2.111111111111111e-05,
+      "loss": 39.0019,
+      "step": 12400
+    },
+    {
+      "epoch": 2.7648750276487504,
+      "grad_norm": 887.4273071289062,
+      "learning_rate": 2.0833333333333336e-05,
+      "loss": 58.0432,
+      "step": 12500
+    },
+    {
+      "epoch": 2.78699402786994,
+      "grad_norm": 363.2604675292969,
+      "learning_rate": 2.0555555555555555e-05,
+      "loss": 67.5791,
+      "step": 12600
+    },
+    {
+      "epoch": 2.8091130280911303,
+      "grad_norm": 199.94940185546875,
+      "learning_rate": 2.027777777777778e-05,
+      "loss": 56.6002,
+      "step": 12700
+    },
+    {
+      "epoch": 2.8312320283123205,
+      "grad_norm": 945.8538208007812,
+      "learning_rate": 2e-05,
+      "loss": 43.1726,
+      "step": 12800
+    },
+    {
+      "epoch": 2.85335102853351,
+      "grad_norm": 1024.704345703125,
+      "learning_rate": 1.9722222222222224e-05,
+      "loss": 38.9053,
+      "step": 12900
+    },
+    {
+      "epoch": 2.8754700287547004,
+      "grad_norm": 1009.4866943359375,
+      "learning_rate": 1.9444444444444445e-05,
+      "loss": 58.8109,
+      "step": 13000
+    },
+    {
+      "epoch": 2.89758902897589,
+      "grad_norm": 537.6126098632812,
+      "learning_rate": 1.9166666666666667e-05,
+      "loss": 73.4815,
+      "step": 13100
+    },
+    {
+      "epoch": 2.9197080291970803,
+      "grad_norm": 269.6388244628906,
+      "learning_rate": 1.888888888888889e-05,
+      "loss": 55.6755,
+      "step": 13200
+    },
+    {
+      "epoch": 2.9418270294182705,
+      "grad_norm": 1045.892578125,
+      "learning_rate": 1.861111111111111e-05,
+      "loss": 52.2818,
+      "step": 13300
+    },
+    {
+      "epoch": 2.96394602963946,
+      "grad_norm": 2102.285400390625,
+      "learning_rate": 1.8333333333333333e-05,
+      "loss": 47.7583,
+      "step": 13400
+    },
+    {
+      "epoch": 2.9860650298606504,
+      "grad_norm": 565.085693359375,
+      "learning_rate": 1.8055555555555555e-05,
+      "loss": 49.5788,
+      "step": 13500
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 122.14683532714844,
+      "eval_runtime": 61.9195,
+      "eval_samples_per_second": 32.462,
+      "eval_steps_per_second": 8.123,
+      "step": 13563
+    },
+    {
+      "epoch": 3.00818403008184,
+      "grad_norm": 1329.616455078125,
+      "learning_rate": 1.777777777777778e-05,
+      "loss": 35.1822,
+      "step": 13600
+    },
+    {
+      "epoch": 3.0303030303030303,
+      "grad_norm": 206.73377990722656,
+      "learning_rate": 1.75e-05,
+      "loss": 45.8486,
+      "step": 13700
+    },
+    {
+      "epoch": 3.0524220305242205,
+      "grad_norm": 171.621826171875,
+      "learning_rate": 1.7222222222222224e-05,
+      "loss": 51.2093,
+      "step": 13800
+    },
+    {
+      "epoch": 3.07454103074541,
+      "grad_norm": 707.8844604492188,
+      "learning_rate": 1.6944444444444446e-05,
+      "loss": 30.8107,
+      "step": 13900
+    },
+    {
+      "epoch": 3.0966600309666004,
+      "grad_norm": 454.2152099609375,
+      "learning_rate": 1.6666666666666667e-05,
+      "loss": 44.615,
+      "step": 14000
+    },
+    {
+      "epoch": 3.11877903118779,
+      "grad_norm": 527.0735473632812,
+      "learning_rate": 1.638888888888889e-05,
+      "loss": 68.0012,
+      "step": 14100
+    },
+    {
+      "epoch": 3.1408980314089803,
+      "grad_norm": 378.765869140625,
+      "learning_rate": 1.6111111111111115e-05,
+      "loss": 37.9635,
+      "step": 14200
+    },
+    {
+      "epoch": 3.1630170316301705,
+      "grad_norm": 637.0385131835938,
+      "learning_rate": 1.5833333333333333e-05,
+      "loss": 47.3296,
+      "step": 14300
+    },
+    {
+      "epoch": 3.18513603185136,
+      "grad_norm": 409.57025146484375,
+      "learning_rate": 1.5555555555555555e-05,
+      "loss": 40.9838,
+      "step": 14400
+    },
+    {
+      "epoch": 3.2072550320725504,
+      "grad_norm": 809.7942504882812,
+      "learning_rate": 1.527777777777778e-05,
+      "loss": 46.4379,
+      "step": 14500
+    },
+    {
+      "epoch": 3.22937403229374,
+      "grad_norm": 23.185964584350586,
+      "learning_rate": 1.5e-05,
+      "loss": 31.5362,
+      "step": 14600
+    },
+    {
+      "epoch": 3.2514930325149303,
+      "grad_norm": 1714.423095703125,
+      "learning_rate": 1.4722222222222224e-05,
+      "loss": 52.3923,
+      "step": 14700
+    },
+    {
+      "epoch": 3.2736120327361204,
+      "grad_norm": 395.1560363769531,
+      "learning_rate": 1.4444444444444444e-05,
+      "loss": 49.203,
+      "step": 14800
+    },
+    {
+      "epoch": 3.29573103295731,
+      "grad_norm": 376.0379638671875,
+      "learning_rate": 1.4166666666666668e-05,
+      "loss": 30.0454,
+      "step": 14900
+    },
+    {
+      "epoch": 3.3178500331785004,
+      "grad_norm": 815.3323364257812,
+      "learning_rate": 1.388888888888889e-05,
+      "loss": 43.2695,
+      "step": 15000
+    },
+    {
+      "epoch": 3.33996903339969,
+      "grad_norm": 332.2477722167969,
+      "learning_rate": 1.3611111111111111e-05,
+      "loss": 39.2527,
+      "step": 15100
+    },
+    {
+      "epoch": 3.3620880336208803,
+      "grad_norm": 983.4076538085938,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 61.8092,
+      "step": 15200
+    },
+    {
+      "epoch": 3.3842070338420704,
+      "grad_norm": 321.3147888183594,
+      "learning_rate": 1.3055555555555557e-05,
+      "loss": 57.5652,
+      "step": 15300
+    },
+    {
+      "epoch": 3.40632603406326,
+      "grad_norm": 477.24224853515625,
+      "learning_rate": 1.2777777777777777e-05,
+      "loss": 53.3966,
+      "step": 15400
+    },
+    {
+      "epoch": 3.4284450342844504,
+      "grad_norm": 1399.400146484375,
+      "learning_rate": 1.25e-05,
+      "loss": 65.4909,
+      "step": 15500
+    },
+    {
+      "epoch": 3.4505640345056405,
+      "grad_norm": 441.70849609375,
+      "learning_rate": 1.2222222222222222e-05,
+      "loss": 56.8207,
+      "step": 15600
+    },
+    {
+      "epoch": 3.4726830347268303,
+      "grad_norm": 373.0791931152344,
+      "learning_rate": 1.1944444444444446e-05,
+      "loss": 54.7861,
+      "step": 15700
+    },
+    {
+      "epoch": 3.4948020349480204,
+      "grad_norm": 855.823974609375,
+      "learning_rate": 1.1666666666666668e-05,
+      "loss": 43.0153,
+      "step": 15800
+    },
+    {
+      "epoch": 3.5169210351692106,
+      "grad_norm": 1049.0374755859375,
+      "learning_rate": 1.138888888888889e-05,
+      "loss": 55.1925,
+      "step": 15900
+    },
+    {
+      "epoch": 3.5390400353904004,
+      "grad_norm": 1020.545654296875,
+      "learning_rate": 1.1111111111111112e-05,
+      "loss": 35.9507,
+      "step": 16000
+    },
+    {
+      "epoch": 3.56115903561159,
+      "grad_norm": 1310.59033203125,
+      "learning_rate": 1.0833333333333334e-05,
+      "loss": 40.9387,
+      "step": 16100
+    },
+    {
+      "epoch": 3.5832780358327803,
+      "grad_norm": 212.0251007080078,
+      "learning_rate": 1.0555555555555555e-05,
+      "loss": 37.7763,
+      "step": 16200
+    },
+    {
+      "epoch": 3.6053970360539704,
+      "grad_norm": 360.9208984375,
+      "learning_rate": 1.0277777777777777e-05,
+      "loss": 34.6411,
+      "step": 16300
+    },
+    {
+      "epoch": 3.6275160362751606,
+      "grad_norm": 973.3055419921875,
+      "learning_rate": 1e-05,
+      "loss": 35.7516,
+      "step": 16400
+    },
+    {
+      "epoch": 3.6496350364963503,
+      "grad_norm": 2761.760498046875,
+      "learning_rate": 9.722222222222223e-06,
+      "loss": 43.8869,
+      "step": 16500
+    },
+    {
+      "epoch": 3.6717540367175405,
+      "grad_norm": 905.3215942382812,
+      "learning_rate": 9.444444444444445e-06,
+      "loss": 25.8201,
+      "step": 16600
+    },
+    {
+      "epoch": 3.6938730369387303,
+      "grad_norm": 252.27920532226562,
+      "learning_rate": 9.166666666666666e-06,
+      "loss": 60.4341,
+      "step": 16700
+    },
+    {
+      "epoch": 3.7159920371599204,
+      "grad_norm": 343.59344482421875,
+      "learning_rate": 8.88888888888889e-06,
+      "loss": 36.6514,
+      "step": 16800
+    },
+    {
+      "epoch": 3.7381110373811106,
+      "grad_norm": 922.6008911132812,
+      "learning_rate": 8.611111111111112e-06,
+      "loss": 47.1865,
+      "step": 16900
+    },
+    {
+      "epoch": 3.7602300376023003,
+      "grad_norm": 68.8092269897461,
+      "learning_rate": 8.333333333333334e-06,
+      "loss": 48.1257,
+      "step": 17000
+    },
+    {
+      "epoch": 3.7823490378234905,
+      "grad_norm": 785.6138305664062,
+      "learning_rate": 8.055555555555557e-06,
+      "loss": 56.9577,
+      "step": 17100
+    },
+    {
+      "epoch": 3.8044680380446803,
+      "grad_norm": 746.76416015625,
+      "learning_rate": 7.777777777777777e-06,
+      "loss": 39.9381,
+      "step": 17200
+    },
+    {
+      "epoch": 3.8265870382658704,
+      "grad_norm": 362.0380554199219,
+      "learning_rate": 7.5e-06,
+      "loss": 34.4047,
+      "step": 17300
+    },
+    {
+      "epoch": 3.8487060384870606,
+      "grad_norm": 180.6659698486328,
+      "learning_rate": 7.222222222222222e-06,
+      "loss": 31.7799,
+      "step": 17400
+    },
+    {
+      "epoch": 3.8708250387082503,
+      "grad_norm": 298.1046447753906,
+      "learning_rate": 6.944444444444445e-06,
+      "loss": 47.4674,
+      "step": 17500
+    },
+    {
+      "epoch": 3.8929440389294405,
+      "grad_norm": 1920.2069091796875,
+      "learning_rate": 6.666666666666667e-06,
+      "loss": 49.4887,
+      "step": 17600
+    },
+    {
+      "epoch": 3.9150630391506303,
+      "grad_norm": 2852.952392578125,
+      "learning_rate": 6.3888888888888885e-06,
+      "loss": 48.4204,
+      "step": 17700
+    },
+    {
+      "epoch": 3.9371820393718204,
+      "grad_norm": 496.2744445800781,
+      "learning_rate": 6.111111111111111e-06,
+      "loss": 37.0487,
+      "step": 17800
+    },
+    {
+      "epoch": 3.9593010395930106,
+      "grad_norm": 812.6398315429688,
+      "learning_rate": 5.833333333333334e-06,
+      "loss": 52.909,
+      "step": 17900
+    },
+    {
+      "epoch": 3.9814200398142003,
+      "grad_norm": 705.7808837890625,
+      "learning_rate": 5.555555555555556e-06,
+      "loss": 23.5713,
+      "step": 18000
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 119.30766296386719,
+      "eval_runtime": 61.5463,
+      "eval_samples_per_second": 32.658,
+      "eval_steps_per_second": 8.173,
+      "step": 18084
+    },
+    {
+      "epoch": 4.00353904003539,
+      "grad_norm": 192.50172424316406,
+      "learning_rate": 5.277777777777778e-06,
+      "loss": 43.4573,
+      "step": 18100
+    },
+    {
+      "epoch": 4.02565804025658,
+      "grad_norm": 657.6646118164062,
+      "learning_rate": 5e-06,
+      "loss": 33.3628,
+      "step": 18200
+    },
+    {
+      "epoch": 4.04777704047777,
+      "grad_norm": 1364.3040771484375,
+      "learning_rate": 4.722222222222222e-06,
+      "loss": 46.6342,
+      "step": 18300
+    },
+    {
+      "epoch": 4.069896040698961,
+      "grad_norm": 553.6497802734375,
+      "learning_rate": 4.444444444444445e-06,
+      "loss": 47.8158,
+      "step": 18400
+    },
+    {
+      "epoch": 4.092015040920151,
+      "grad_norm": 86.2293472290039,
+      "learning_rate": 4.166666666666667e-06,
+      "loss": 50.5882,
+      "step": 18500
+    },
+    {
+      "epoch": 4.11413404114134,
+      "grad_norm": 860.7024536132812,
+      "learning_rate": 3.888888888888889e-06,
+      "loss": 35.6031,
+      "step": 18600
+    },
+    {
+      "epoch": 4.13625304136253,
+      "grad_norm": 646.49267578125,
+      "learning_rate": 3.611111111111111e-06,
+      "loss": 26.6029,
+      "step": 18700
+    },
+    {
+      "epoch": 4.15837204158372,
+      "grad_norm": 586.1218872070312,
+      "learning_rate": 3.3333333333333333e-06,
+      "loss": 42.9749,
+      "step": 18800
+    },
+    {
+      "epoch": 4.180491041804911,
+      "grad_norm": 973.4674072265625,
+      "learning_rate": 3.0555555555555556e-06,
+      "loss": 37.9648,
+      "step": 18900
+    },
+    {
+      "epoch": 4.202610042026101,
+      "grad_norm": 694.5072631835938,
+      "learning_rate": 2.777777777777778e-06,
+      "loss": 40.874,
+      "step": 19000
+    },
+    {
+      "epoch": 4.22472904224729,
+      "grad_norm": 960.8961791992188,
+      "learning_rate": 2.5e-06,
+      "loss": 38.9788,
+      "step": 19100
+    },
+    {
+      "epoch": 4.24684804246848,
+      "grad_norm": 494.7688293457031,
+      "learning_rate": 2.2222222222222225e-06,
+      "loss": 41.4962,
+      "step": 19200
+    },
+    {
+      "epoch": 4.26896704268967,
+      "grad_norm": 1084.4683837890625,
+      "learning_rate": 1.9444444444444444e-06,
+      "loss": 37.17,
+      "step": 19300
+    },
+    {
+      "epoch": 4.291086042910861,
+      "grad_norm": 961.61572265625,
+      "learning_rate": 1.6666666666666667e-06,
+      "loss": 27.5056,
+      "step": 19400
+    },
+    {
+      "epoch": 4.313205043132051,
+      "grad_norm": 596.8368530273438,
+      "learning_rate": 1.388888888888889e-06,
+      "loss": 36.0704,
+      "step": 19500
+    },
+    {
+      "epoch": 4.33532404335324,
+      "grad_norm": 202.31369018554688,
+      "learning_rate": 1.1111111111111112e-06,
+      "loss": 37.608,
+      "step": 19600
+    },
+    {
+      "epoch": 4.35744304357443,
+      "grad_norm": 461.15460205078125,
+      "learning_rate": 8.333333333333333e-07,
+      "loss": 31.0886,
+      "step": 19700
+    },
+    {
+      "epoch": 4.37956204379562,
+      "grad_norm": 995.475830078125,
+      "learning_rate": 5.555555555555556e-07,
+      "loss": 31.7115,
+      "step": 19800
+    },
+    {
+      "epoch": 4.401681044016811,
+      "grad_norm": 435.1724548339844,
+      "learning_rate": 2.777777777777778e-07,
+      "loss": 39.3167,
+      "step": 19900
+    },
+    {
+      "epoch": 4.423800044238001,
+      "grad_norm": 1266.6600341796875,
+      "learning_rate": 0.0,
+      "loss": 39.4665,
+      "step": 20000
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 20000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 10000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}