Upload 12 files

Browse files

Files changed (12) hide show

README.md +54 -3
added_tokens.json +3 -0
all_results.json +9 -0
config.json +28 -0
model.safetensors +3 -0
special_tokens_map.json +15 -0
spm.model +3 -0
tokenizer.json +0 -0
tokenizer_config.json +58 -0
train_results.json +9 -0
trainer_state.json +1169 -0
training_args.bin +3 -0

README.md CHANGED Viewed

@@ -1,3 +1,54 @@
----
-license: mit
----

+---
+tags:
+- generated_from_trainer
+model-index:
+- name: DeB3RTa_3_xsmall
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# DeB3RTa_3_xsmall
+This model is a fine-tuned version of an unspecified base checkpoint (the Trainer did not record one) on an unknown dataset.
+## Model description
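+A `DebertaV2ForMaskedLM` model (`model_type: deberta-v2`) with 12 hidden layers, a hidden size of 384, 6 attention heads, and a vocabulary of 128,100 tokens, as recorded in `config.json`. More information needed.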
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
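+The uploaded training results record 2,477,405 training samples (`train_samples`); the dataset itself is not identified, and no evaluation metrics appear in the uploaded results. More information needed.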
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0001
+- train_batch_size: 192
+- eval_batch_size: 8
+- seed: 42
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 1536
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-06
+- lr_scheduler_type: linear
+- lr_scheduler_warmup_ratio: 0.01
+- num_epochs: 50.0
+- mixed_precision_training: Native AMP
+### Training results
+### Framework versions
+- Transformers 4.42.4
+- Pytorch 2.3.1+cu121
+- Datasets 2.20.0
+- Tokenizers 0.19.1

added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "[MASK]": 128000
+}

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 50.0,
+    "total_flos": 2.052104150815488e+18,
+    "train_loss": 0.04098836247254364,
+    "train_runtime": 10357.3823,
+    "train_samples": 2477405,
+    "train_samples_per_second": 11959.61,
+    "train_steps_per_second": 7.787
+}

config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "architectures": [
+    "DebertaV2ForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 384,
+  "initializer_range": 0.02,
+  "intermediate_size": 1536,
+  "layer_norm_eps": 1e-07,
+  "max_position_embeddings": 512,
+  "max_relative_positions": -1,
+  "model_type": "deberta-v2",
+  "num_attention_heads": 6,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "pooler_dropout": 0,
+  "pooler_hidden_act": "gelu",
+  "pooler_hidden_size": 384,
+  "pos_att_type": null,
+  "position_biased_input": true,
+  "relative_attention": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.42.4",
+  "type_vocab_size": 0,
+  "vocab_size": 128100
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cdea08c94f1b19046bb4f3a97813eeda9a7ca770e5c04724bf7579f59bd4dd7a
+size 283856744

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "bos_token": "[CLS]",
+  "cls_token": "[CLS]",
+  "eos_token": "[SEP]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

spm.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5598d5e96f339a8d980c15f9afd405a2e5e1be7db41de3ed13b0f03fac1e8c17
+size 2447305

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128000": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "[CLS]",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": false,
+  "eos_token": "[SEP]",
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "sp_model_kwargs": {},
+  "split_by_punct": false,
+  "tokenizer_class": "DebertaV2Tokenizer",
+  "unk_token": "[UNK]",
+  "vocab_type": "spm"
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 50.0,
+    "total_flos": 2.052104150815488e+18,
+    "train_loss": 0.04098836247254364,
+    "train_runtime": 10357.3823,
+    "train_samples": 2477405,
+    "train_samples_per_second": 11959.61,
+    "train_steps_per_second": 7.787
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1169 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 50.0,
+  "eval_steps": 500,
+  "global_step": 80650,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.30998140111593303,
+      "grad_norm": 0.3543250262737274,
+      "learning_rate": 6.195786864931847e-05,
+      "loss": 9.0345,
+      "step": 500
+    },
+    {
+      "epoch": 0.6199628022318661,
+      "grad_norm": 0.5106557607650757,
+      "learning_rate": 9.97582756158962e-05,
+      "loss": 6.2184,
+      "step": 1000
+    },
+    {
+      "epoch": 0.9299442033477991,
+      "grad_norm": 2.6617751121520996,
+      "learning_rate": 9.913204664153402e-05,
+      "loss": 5.4194,
+      "step": 1500
+    },
+    {
+      "epoch": 1.2399256044637321,
+      "grad_norm": 1.8096632957458496,
+      "learning_rate": 9.850581766717182e-05,
+      "loss": 3.915,
+      "step": 2000
+    },
+    {
+      "epoch": 1.5499070055796653,
+      "grad_norm": 1.2520173788070679,
+      "learning_rate": 9.787958869280964e-05,
+      "loss": 2.7963,
+      "step": 2500
+    },
+    {
+      "epoch": 1.8598884066955983,
+      "grad_norm": 0.8099603056907654,
+      "learning_rate": 9.725335971844745e-05,
+      "loss": 2.2568,
+      "step": 3000
+    },
+    {
+      "epoch": 2.1698698078115313,
+      "grad_norm": 0.7233591079711914,
+      "learning_rate": 9.662713074408527e-05,
+      "loss": 1.9847,
+      "step": 3500
+    },
+    {
+      "epoch": 2.4798512089274642,
+      "grad_norm": 0.6427165865898132,
+      "learning_rate": 9.600090176972308e-05,
+      "loss": 1.8216,
+      "step": 4000
+    },
+    {
+      "epoch": 2.7898326100433977,
+      "grad_norm": 0.6729193925857544,
+      "learning_rate": 9.53746727953609e-05,
+      "loss": 1.7067,
+      "step": 4500
+    },
+    {
+      "epoch": 3.0998140111593306,
+      "grad_norm": 0.6484789848327637,
+      "learning_rate": 9.47484438209987e-05,
+      "loss": 1.6187,
+      "step": 5000
+    },
+    {
+      "epoch": 3.4097954122752636,
+      "grad_norm": 0.5950448513031006,
+      "learning_rate": 9.412221484663653e-05,
+      "loss": 1.5479,
+      "step": 5500
+    },
+    {
+      "epoch": 3.7197768133911966,
+      "grad_norm": 0.6102598309516907,
+      "learning_rate": 9.349598587227433e-05,
+      "loss": 1.4879,
+      "step": 6000
+    },
+    {
+      "epoch": 4.02975821450713,
+      "grad_norm": 0.6204754710197449,
+      "learning_rate": 9.286975689791215e-05,
+      "loss": 1.4379,
+      "step": 6500
+    },
+    {
+      "epoch": 4.3397396156230625,
+      "grad_norm": 0.590217649936676,
+      "learning_rate": 9.224352792354997e-05,
+      "loss": 1.3926,
+      "step": 7000
+    },
+    {
+      "epoch": 4.6497210167389955,
+      "grad_norm": 0.6062743663787842,
+      "learning_rate": 9.161729894918779e-05,
+      "loss": 1.3553,
+      "step": 7500
+    },
+    {
+      "epoch": 4.9597024178549285,
+      "grad_norm": 0.5663708448410034,
+      "learning_rate": 9.09910699748256e-05,
+      "loss": 1.3201,
+      "step": 8000
+    },
+    {
+      "epoch": 5.2696838189708615,
+      "grad_norm": 0.5806947350502014,
+      "learning_rate": 9.036484100046342e-05,
+      "loss": 1.2904,
+      "step": 8500
+    },
+    {
+      "epoch": 5.579665220086794,
+      "grad_norm": 0.6131803393363953,
+      "learning_rate": 8.973861202610123e-05,
+      "loss": 1.2623,
+      "step": 9000
+    },
+    {
+      "epoch": 5.889646621202727,
+      "grad_norm": 0.5666236281394958,
+      "learning_rate": 8.911238305173905e-05,
+      "loss": 1.2368,
+      "step": 9500
+    },
+    {
+      "epoch": 6.199628022318661,
+      "grad_norm": 0.6078547239303589,
+      "learning_rate": 8.848615407737685e-05,
+      "loss": 1.212,
+      "step": 10000
+    },
+    {
+      "epoch": 6.509609423434594,
+      "grad_norm": 0.575513482093811,
+      "learning_rate": 8.785992510301467e-05,
+      "loss": 1.1914,
+      "step": 10500
+    },
+    {
+      "epoch": 6.819590824550527,
+      "grad_norm": 0.5826976895332336,
+      "learning_rate": 8.723369612865248e-05,
+      "loss": 1.1718,
+      "step": 11000
+    },
+    {
+      "epoch": 7.12957222566646,
+      "grad_norm": 0.544598400592804,
+      "learning_rate": 8.66074671542903e-05,
+      "loss": 1.1548,
+      "step": 11500
+    },
+    {
+      "epoch": 7.439553626782393,
+      "grad_norm": 0.5824791193008423,
+      "learning_rate": 8.598123817992811e-05,
+      "loss": 1.1363,
+      "step": 12000
+    },
+    {
+      "epoch": 7.749535027898326,
+      "grad_norm": 0.5747692584991455,
+      "learning_rate": 8.535500920556593e-05,
+      "loss": 1.1211,
+      "step": 12500
+    },
+    {
+      "epoch": 8.05951642901426,
+      "grad_norm": 0.5473280549049377,
+      "learning_rate": 8.472878023120375e-05,
+      "loss": 1.1077,
+      "step": 13000
+    },
+    {
+      "epoch": 8.369497830130193,
+      "grad_norm": 0.5574379563331604,
+      "learning_rate": 8.410255125684155e-05,
+      "loss": 1.0908,
+      "step": 13500
+    },
+    {
+      "epoch": 8.679479231246125,
+      "grad_norm": 0.5424452424049377,
+      "learning_rate": 8.347632228247937e-05,
+      "loss": 1.0785,
+      "step": 14000
+    },
+    {
+      "epoch": 8.989460632362059,
+      "grad_norm": 0.5508283376693726,
+      "learning_rate": 8.285009330811718e-05,
+      "loss": 1.0683,
+      "step": 14500
+    },
+    {
+      "epoch": 9.299442033477991,
+      "grad_norm": 0.5519115924835205,
+      "learning_rate": 8.2223864333755e-05,
+      "loss": 1.0537,
+      "step": 15000
+    },
+    {
+      "epoch": 9.609423434593925,
+      "grad_norm": 0.5510475039482117,
+      "learning_rate": 8.159763535939281e-05,
+      "loss": 1.0443,
+      "step": 15500
+    },
+    {
+      "epoch": 9.919404835709857,
+      "grad_norm": 0.5631123185157776,
+      "learning_rate": 8.097140638503063e-05,
+      "loss": 1.0339,
+      "step": 16000
+    },
+    {
+      "epoch": 10.22938623682579,
+      "grad_norm": 0.5705382823944092,
+      "learning_rate": 8.034517741066844e-05,
+      "loss": 1.0217,
+      "step": 16500
+    },
+    {
+      "epoch": 10.539367637941723,
+      "grad_norm": 0.5316577553749084,
+      "learning_rate": 7.971894843630626e-05,
+      "loss": 1.0151,
+      "step": 17000
+    },
+    {
+      "epoch": 10.849349039057657,
+      "grad_norm": 0.5557442307472229,
+      "learning_rate": 7.909271946194406e-05,
+      "loss": 1.0043,
+      "step": 17500
+    },
+    {
+      "epoch": 11.159330440173589,
+      "grad_norm": 0.5498985648155212,
+      "learning_rate": 7.846649048758188e-05,
+      "loss": 0.9951,
+      "step": 18000
+    },
+    {
+      "epoch": 11.469311841289523,
+      "grad_norm": 0.552780032157898,
+      "learning_rate": 7.784026151321969e-05,
+      "loss": 0.9855,
+      "step": 18500
+    },
+    {
+      "epoch": 11.779293242405455,
+      "grad_norm": 0.5406888127326965,
+      "learning_rate": 7.721403253885752e-05,
+      "loss": 0.9795,
+      "step": 19000
+    },
+    {
+      "epoch": 12.089274643521389,
+      "grad_norm": 0.537375271320343,
+      "learning_rate": 7.658780356449533e-05,
+      "loss": 0.971,
+      "step": 19500
+    },
+    {
+      "epoch": 12.399256044637323,
+      "grad_norm": 0.5666614174842834,
+      "learning_rate": 7.596157459013315e-05,
+      "loss": 0.9643,
+      "step": 20000
+    },
+    {
+      "epoch": 12.709237445753255,
+      "grad_norm": 0.5302731990814209,
+      "learning_rate": 7.533659807371968e-05,
+      "loss": 0.9582,
+      "step": 20500
+    },
+    {
+      "epoch": 13.019218846869189,
+      "grad_norm": 0.5608243346214294,
+      "learning_rate": 7.471036909935749e-05,
+      "loss": 0.9512,
+      "step": 21000
+    },
+    {
+      "epoch": 13.32920024798512,
+      "grad_norm": 0.5309119820594788,
+      "learning_rate": 7.408414012499531e-05,
+      "loss": 0.9424,
+      "step": 21500
+    },
+    {
+      "epoch": 13.639181649101054,
+      "grad_norm": 0.5380939245223999,
+      "learning_rate": 7.345791115063312e-05,
+      "loss": 0.9383,
+      "step": 22000
+    },
+    {
+      "epoch": 13.949163050216987,
+      "grad_norm": 0.5440984964370728,
+      "learning_rate": 7.283168217627094e-05,
+      "loss": 0.9298,
+      "step": 22500
+    },
+    {
+      "epoch": 14.25914445133292,
+      "grad_norm": 0.5377441048622131,
+      "learning_rate": 7.220545320190874e-05,
+      "loss": 0.9245,
+      "step": 23000
+    },
+    {
+      "epoch": 14.569125852448852,
+      "grad_norm": 0.5402495265007019,
+      "learning_rate": 7.157922422754656e-05,
+      "loss": 0.9196,
+      "step": 23500
+    },
+    {
+      "epoch": 14.879107253564786,
+      "grad_norm": 0.5610705018043518,
+      "learning_rate": 7.095299525318437e-05,
+      "loss": 0.9146,
+      "step": 24000
+    },
+    {
+      "epoch": 15.189088654680718,
+      "grad_norm": 0.5305636525154114,
+      "learning_rate": 7.032676627882219e-05,
+      "loss": 0.9071,
+      "step": 24500
+    },
+    {
+      "epoch": 15.499070055796652,
+      "grad_norm": 0.5398979187011719,
+      "learning_rate": 6.970053730446e-05,
+      "loss": 0.9037,
+      "step": 25000
+    },
+    {
+      "epoch": 15.809051456912584,
+      "grad_norm": 0.5490283370018005,
+      "learning_rate": 6.907556078804655e-05,
+      "loss": 0.8982,
+      "step": 25500
+    },
+    {
+      "epoch": 16.11903285802852,
+      "grad_norm": 0.5505014061927795,
+      "learning_rate": 6.844933181368435e-05,
+      "loss": 0.8933,
+      "step": 26000
+    },
+    {
+      "epoch": 16.429014259144452,
+      "grad_norm": 0.5260488390922546,
+      "learning_rate": 6.782310283932217e-05,
+      "loss": 0.8865,
+      "step": 26500
+    },
+    {
+      "epoch": 16.738995660260386,
+      "grad_norm": 0.5459970235824585,
+      "learning_rate": 6.719687386495999e-05,
+      "loss": 0.8837,
+      "step": 27000
+    },
+    {
+      "epoch": 17.048977061376316,
+      "grad_norm": 0.5260828733444214,
+      "learning_rate": 6.657189734854653e-05,
+      "loss": 0.8812,
+      "step": 27500
+    },
+    {
+      "epoch": 17.35895846249225,
+      "grad_norm": 0.531878650188446,
+      "learning_rate": 6.594566837418435e-05,
+      "loss": 0.874,
+      "step": 28000
+    },
+    {
+      "epoch": 17.668939863608184,
+      "grad_norm": 0.5373751521110535,
+      "learning_rate": 6.531943939982215e-05,
+      "loss": 0.8703,
+      "step": 28500
+    },
+    {
+      "epoch": 17.978921264724118,
+      "grad_norm": 0.5685413479804993,
+      "learning_rate": 6.469321042545997e-05,
+      "loss": 0.8674,
+      "step": 29000
+    },
+    {
+      "epoch": 18.288902665840048,
+      "grad_norm": 0.5405117273330688,
+      "learning_rate": 6.406698145109778e-05,
+      "loss": 0.8618,
+      "step": 29500
+    },
+    {
+      "epoch": 18.598884066955982,
+      "grad_norm": 0.5303318500518799,
+      "learning_rate": 6.344325739263305e-05,
+      "loss": 0.8572,
+      "step": 30000
+    },
+    {
+      "epoch": 18.908865468071916,
+      "grad_norm": 0.5173208117485046,
+      "learning_rate": 6.281702841827086e-05,
+      "loss": 0.8552,
+      "step": 30500
+    },
+    {
+      "epoch": 19.21884686918785,
+      "grad_norm": 0.5334449410438538,
+      "learning_rate": 6.219079944390868e-05,
+      "loss": 0.8494,
+      "step": 31000
+    },
+    {
+      "epoch": 19.52882827030378,
+      "grad_norm": 0.5522080659866333,
+      "learning_rate": 6.156457046954649e-05,
+      "loss": 0.8464,
+      "step": 31500
+    },
+    {
+      "epoch": 19.838809671419714,
+      "grad_norm": 0.5295758247375488,
+      "learning_rate": 6.09383414951843e-05,
+      "loss": 0.845,
+      "step": 32000
+    },
+    {
+      "epoch": 20.148791072535648,
+      "grad_norm": 0.5164583325386047,
+      "learning_rate": 6.0312112520822115e-05,
+      "loss": 0.8395,
+      "step": 32500
+    },
+    {
+      "epoch": 20.45877247365158,
+      "grad_norm": 0.5620171427726746,
+      "learning_rate": 5.968713600440865e-05,
+      "loss": 0.8354,
+      "step": 33000
+    },
+    {
+      "epoch": 20.768753874767516,
+      "grad_norm": 0.5254458785057068,
+      "learning_rate": 5.906090703004646e-05,
+      "loss": 0.8336,
+      "step": 33500
+    },
+    {
+      "epoch": 21.078735275883446,
+      "grad_norm": 0.5437597632408142,
+      "learning_rate": 5.8434678055684276e-05,
+      "loss": 0.8304,
+      "step": 34000
+    },
+    {
+      "epoch": 21.38871667699938,
+      "grad_norm": 0.5438856482505798,
+      "learning_rate": 5.78084490813221e-05,
+      "loss": 0.8263,
+      "step": 34500
+    },
+    {
+      "epoch": 21.698698078115314,
+      "grad_norm": 0.5386750102043152,
+      "learning_rate": 5.7182220106959916e-05,
+      "loss": 0.8248,
+      "step": 35000
+    },
+    {
+      "epoch": 22.008679479231247,
+      "grad_norm": 0.5307642817497253,
+      "learning_rate": 5.655724359054645e-05,
+      "loss": 0.8223,
+      "step": 35500
+    },
+    {
+      "epoch": 22.318660880347178,
+      "grad_norm": 0.5404214859008789,
+      "learning_rate": 5.5931014616184264e-05,
+      "loss": 0.8176,
+      "step": 36000
+    },
+    {
+      "epoch": 22.62864228146311,
+      "grad_norm": 0.555665910243988,
+      "learning_rate": 5.530478564182208e-05,
+      "loss": 0.8164,
+      "step": 36500
+    },
+    {
+      "epoch": 22.938623682579045,
+      "grad_norm": 0.5331476330757141,
+      "learning_rate": 5.467855666745989e-05,
+      "loss": 0.8135,
+      "step": 37000
+    },
+    {
+      "epoch": 23.24860508369498,
+      "grad_norm": 0.541491687297821,
+      "learning_rate": 5.405358015104644e-05,
+      "loss": 0.8097,
+      "step": 37500
+    },
+    {
+      "epoch": 23.55858648481091,
+      "grad_norm": 0.5554507374763489,
+      "learning_rate": 5.342735117668425e-05,
+      "loss": 0.8074,
+      "step": 38000
+    },
+    {
+      "epoch": 23.868567885926844,
+      "grad_norm": 0.5485785007476807,
+      "learning_rate": 5.2801122202322065e-05,
+      "loss": 0.8054,
+      "step": 38500
+    },
+    {
+      "epoch": 24.178549287042777,
+      "grad_norm": 0.5320767164230347,
+      "learning_rate": 5.217489322795988e-05,
+      "loss": 0.8018,
+      "step": 39000
+    },
+    {
+      "epoch": 24.48853068815871,
+      "grad_norm": 0.5248667001724243,
+      "learning_rate": 5.154866425359769e-05,
+      "loss": 0.8008,
+      "step": 39500
+    },
+    {
+      "epoch": 24.798512089274645,
+      "grad_norm": 0.5368346571922302,
+      "learning_rate": 5.0922435279235505e-05,
+      "loss": 0.7975,
+      "step": 40000
+    },
+    {
+      "epoch": 25.108493490390575,
+      "grad_norm": 0.53144371509552,
+      "learning_rate": 5.029620630487332e-05,
+      "loss": 0.7947,
+      "step": 40500
+    },
+    {
+      "epoch": 25.41847489150651,
+      "grad_norm": 0.5482547879219055,
+      "learning_rate": 4.966997733051113e-05,
+      "loss": 0.793,
+      "step": 41000
+    },
+    {
+      "epoch": 25.728456292622443,
+      "grad_norm": 0.5446964502334595,
+      "learning_rate": 4.9043748356148946e-05,
+      "loss": 0.7905,
+      "step": 41500
+    },
+    {
+      "epoch": 26.038437693738377,
+      "grad_norm": 0.5257270932197571,
+      "learning_rate": 4.841751938178676e-05,
+      "loss": 0.7892,
+      "step": 42000
+    },
+    {
+      "epoch": 26.348419094854307,
+      "grad_norm": 0.5478941202163696,
+      "learning_rate": 4.779129040742457e-05,
+      "loss": 0.7856,
+      "step": 42500
+    },
+    {
+      "epoch": 26.65840049597024,
+      "grad_norm": 0.5381990671157837,
+      "learning_rate": 4.7165061433062386e-05,
+      "loss": 0.7863,
+      "step": 43000
+    },
+    {
+      "epoch": 26.968381897086175,
+      "grad_norm": 0.546461820602417,
+      "learning_rate": 4.65388324587002e-05,
+      "loss": 0.7826,
+      "step": 43500
+    },
+    {
+      "epoch": 27.27836329820211,
+      "grad_norm": 0.543404757976532,
+      "learning_rate": 4.591260348433802e-05,
+      "loss": 0.7796,
+      "step": 44000
+    },
+    {
+      "epoch": 27.58834469931804,
+      "grad_norm": 0.5448907613754272,
+      "learning_rate": 4.528637450997583e-05,
+      "loss": 0.7796,
+      "step": 44500
+    },
+    {
+      "epoch": 27.898326100433973,
+      "grad_norm": 0.5504478216171265,
+      "learning_rate": 4.466014553561365e-05,
+      "loss": 0.7761,
+      "step": 45000
+    },
+    {
+      "epoch": 28.208307501549907,
+      "grad_norm": 0.544154703617096,
+      "learning_rate": 4.403391656125146e-05,
+      "loss": 0.7753,
+      "step": 45500
+    },
+    {
+      "epoch": 28.51828890266584,
+      "grad_norm": 0.542306125164032,
+      "learning_rate": 4.3407687586889274e-05,
+      "loss": 0.7735,
+      "step": 46000
+    },
+    {
+      "epoch": 28.828270303781775,
+      "grad_norm": 0.5549866557121277,
+      "learning_rate": 4.278145861252709e-05,
+      "loss": 0.7707,
+      "step": 46500
+    },
+    {
+      "epoch": 29.138251704897705,
+      "grad_norm": 0.538090169429779,
+      "learning_rate": 4.21552296381649e-05,
+      "loss": 0.7697,
+      "step": 47000
+    },
+    {
+      "epoch": 29.44823310601364,
+      "grad_norm": 0.5609955191612244,
+      "learning_rate": 4.1529000663802714e-05,
+      "loss": 0.7682,
+      "step": 47500
+    },
+    {
+      "epoch": 29.758214507129573,
+      "grad_norm": 0.5595529675483704,
+      "learning_rate": 4.090277168944053e-05,
+      "loss": 0.7659,
+      "step": 48000
+    },
+    {
+      "epoch": 30.068195908245507,
+      "grad_norm": 0.5461651086807251,
+      "learning_rate": 4.027654271507834e-05,
+      "loss": 0.7656,
+      "step": 48500
+    },
+    {
+      "epoch": 30.378177309361437,
+      "grad_norm": 0.5438820719718933,
+      "learning_rate": 3.9650313740716154e-05,
+      "loss": 0.7625,
+      "step": 49000
+    },
+    {
+      "epoch": 30.68815871047737,
+      "grad_norm": 0.5458811521530151,
+      "learning_rate": 3.902408476635397e-05,
+      "loss": 0.762,
+      "step": 49500
+    },
+    {
+      "epoch": 30.998140111593305,
+      "grad_norm": 0.535521388053894,
+      "learning_rate": 3.839785579199179e-05,
+      "loss": 0.7589,
+      "step": 50000
+    },
+    {
+      "epoch": 31.30812151270924,
+      "grad_norm": 0.5407618284225464,
+      "learning_rate": 3.77716268176296e-05,
+      "loss": 0.7576,
+      "step": 50500
+    },
+    {
+      "epoch": 31.61810291382517,
+      "grad_norm": 0.5259741544723511,
+      "learning_rate": 3.7145397843267415e-05,
+      "loss": 0.7571,
+      "step": 51000
+    },
+    {
+      "epoch": 31.928084314941103,
+      "grad_norm": 0.5338233709335327,
+      "learning_rate": 3.651916886890523e-05,
+      "loss": 0.7561,
+      "step": 51500
+    },
+    {
+      "epoch": 32.23806571605704,
+      "grad_norm": 0.5369750261306763,
+      "learning_rate": 3.589293989454304e-05,
+      "loss": 0.7541,
+      "step": 52000
+    },
+    {
+      "epoch": 32.54804711717297,
+      "grad_norm": 0.5418145656585693,
+      "learning_rate": 3.5266710920180856e-05,
+      "loss": 0.7521,
+      "step": 52500
+    },
+    {
+      "epoch": 32.858028518288904,
+      "grad_norm": 0.533149242401123,
+      "learning_rate": 3.464048194581867e-05,
+      "loss": 0.7519,
+      "step": 53000
+    },
+    {
+      "epoch": 33.16800991940484,
+      "grad_norm": 0.5384135246276855,
+      "learning_rate": 3.401425297145648e-05,
+      "loss": 0.7497,
+      "step": 53500
+    },
+    {
+      "epoch": 33.47799132052077,
+      "grad_norm": 0.5323925018310547,
+      "learning_rate": 3.3388023997094296e-05,
+      "loss": 0.7485,
+      "step": 54000
+    },
+    {
+      "epoch": 33.7879727216367,
+      "grad_norm": 0.535434901714325,
+      "learning_rate": 3.276179502273211e-05,
+      "loss": 0.7472,
+      "step": 54500
+    },
+    {
+      "epoch": 34.09795412275263,
+      "grad_norm": 0.5496259331703186,
+      "learning_rate": 3.213556604836992e-05,
+      "loss": 0.7454,
+      "step": 55000
+    },
+    {
+      "epoch": 34.40793552386857,
+      "grad_norm": 0.5429278016090393,
+      "learning_rate": 3.150933707400774e-05,
+      "loss": 0.7447,
+      "step": 55500
+    },
+    {
+      "epoch": 34.7179169249845,
+      "grad_norm": 0.5489596724510193,
+      "learning_rate": 3.088310809964556e-05,
+      "loss": 0.7438,
+      "step": 56000
+    },
+    {
+      "epoch": 35.027898326100434,
+      "grad_norm": 0.5510178208351135,
+      "learning_rate": 3.025687912528337e-05,
+      "loss": 0.7416,
+      "step": 56500
+    },
+    {
+      "epoch": 35.33787972721637,
+      "grad_norm": 0.5540343523025513,
+      "learning_rate": 2.9630650150921187e-05,
+      "loss": 0.7401,
+      "step": 57000
+    },
+    {
+      "epoch": 35.6478611283323,
+      "grad_norm": 0.551895260810852,
+      "learning_rate": 2.9004421176559e-05,
+      "loss": 0.7404,
+      "step": 57500
+    },
+    {
+      "epoch": 35.957842529448236,
+      "grad_norm": 0.5412101149559021,
+      "learning_rate": 2.8378192202196814e-05,
+      "loss": 0.74,
+      "step": 58000
+    },
+    {
+      "epoch": 36.26782393056417,
+      "grad_norm": 0.5450315475463867,
+      "learning_rate": 2.7751963227834627e-05,
+      "loss": 0.7386,
+      "step": 58500
+    },
+    {
+      "epoch": 36.577805331680096,
+      "grad_norm": 0.5550098419189453,
+      "learning_rate": 2.712573425347244e-05,
+      "loss": 0.7382,
+      "step": 59000
+    },
+    {
+      "epoch": 36.88778673279603,
+      "grad_norm": 0.5502198338508606,
+      "learning_rate": 2.6499505279110254e-05,
+      "loss": 0.7345,
+      "step": 59500
+    },
+    {
+      "epoch": 37.197768133911964,
+      "grad_norm": 0.5401105880737305,
+      "learning_rate": 2.587452876269679e-05,
+      "loss": 0.7355,
+      "step": 60000
+    },
+    {
+      "epoch": 37.5077495350279,
+      "grad_norm": 0.543369710445404,
+      "learning_rate": 2.5248299788334605e-05,
+      "loss": 0.7338,
+      "step": 60500
+    },
+    {
+      "epoch": 37.81773093614383,
+      "grad_norm": 0.5440373420715332,
+      "learning_rate": 2.4622070813972422e-05,
+      "loss": 0.7326,
+      "step": 61000
+    },
+    {
+      "epoch": 38.127712337259766,
+      "grad_norm": 0.5450806021690369,
+      "learning_rate": 2.3995841839610235e-05,
+      "loss": 0.7315,
+      "step": 61500
+    },
+    {
+      "epoch": 38.4376937383757,
+      "grad_norm": 0.5412734746932983,
+      "learning_rate": 2.336961286524805e-05,
+      "loss": 0.7301,
+      "step": 62000
+    },
+    {
+      "epoch": 38.74767513949163,
+      "grad_norm": 0.5553017854690552,
+      "learning_rate": 2.274463634883459e-05,
+      "loss": 0.732,
+      "step": 62500
+    },
+    {
+      "epoch": 39.05765654060756,
+      "grad_norm": 0.5467730164527893,
+      "learning_rate": 2.2118407374472403e-05,
+      "loss": 0.7289,
+      "step": 63000
+    },
+    {
+      "epoch": 39.367637941723494,
+      "grad_norm": 0.551267683506012,
+      "learning_rate": 2.1492178400110216e-05,
+      "loss": 0.728,
+      "step": 63500
+    },
+    {
+      "epoch": 39.67761934283943,
+      "grad_norm": 0.5391538739204407,
+      "learning_rate": 2.0865949425748033e-05,
+      "loss": 0.7276,
+      "step": 64000
+    },
+    {
+      "epoch": 39.98760074395536,
+      "grad_norm": 0.5523350238800049,
+      "learning_rate": 2.0239720451385847e-05,
+      "loss": 0.7272,
+      "step": 64500
+    },
+    {
+      "epoch": 40.297582145071296,
+      "grad_norm": 0.5367141366004944,
+      "learning_rate": 1.961349147702366e-05,
+      "loss": 0.726,
+      "step": 65000
+    },
+    {
+      "epoch": 40.60756354618723,
+      "grad_norm": 0.5538766980171204,
+      "learning_rate": 1.8987262502661473e-05,
+      "loss": 0.7238,
+      "step": 65500
+    },
+    {
+      "epoch": 40.91754494730316,
+      "grad_norm": 0.5274632573127747,
+      "learning_rate": 1.8361033528299287e-05,
+      "loss": 0.725,
+      "step": 66000
+    },
+    {
+      "epoch": 41.2275263484191,
+      "grad_norm": 0.521597146987915,
+      "learning_rate": 1.7736057011885827e-05,
+      "loss": 0.7233,
+      "step": 66500
+    },
+    {
+      "epoch": 41.53750774953503,
+      "grad_norm": 0.5390001535415649,
+      "learning_rate": 1.710982803752364e-05,
+      "loss": 0.7225,
+      "step": 67000
+    },
+    {
+      "epoch": 41.84748915065096,
+      "grad_norm": 0.5474331378936768,
+      "learning_rate": 1.6483599063161458e-05,
+      "loss": 0.7218,
+      "step": 67500
+    },
+    {
+      "epoch": 42.15747055176689,
+      "grad_norm": 0.5352886915206909,
+      "learning_rate": 1.5858622546747995e-05,
+      "loss": 0.7213,
+      "step": 68000
+    },
+    {
+      "epoch": 42.467451952882826,
+      "grad_norm": 0.540053129196167,
+      "learning_rate": 1.5232393572385808e-05,
+      "loss": 0.7204,
+      "step": 68500
+    },
+    {
+      "epoch": 42.77743335399876,
+      "grad_norm": 0.5470998883247375,
+      "learning_rate": 1.4606164598023622e-05,
+      "loss": 0.721,
+      "step": 69000
+    },
+    {
+      "epoch": 43.08741475511469,
+      "grad_norm": 0.5613588094711304,
+      "learning_rate": 1.3979935623661435e-05,
+      "loss": 0.7194,
+      "step": 69500
+    },
+    {
+      "epoch": 43.39739615623063,
+      "grad_norm": 0.5471562743186951,
+      "learning_rate": 1.3354959107247974e-05,
+      "loss": 0.7178,
+      "step": 70000
+    },
+    {
+      "epoch": 43.70737755734656,
+      "grad_norm": 0.5386627912521362,
+      "learning_rate": 1.2728730132885787e-05,
+      "loss": 0.7184,
+      "step": 70500
+    },
+    {
+      "epoch": 44.017358958462495,
+      "grad_norm": 0.5391978621482849,
+      "learning_rate": 1.2102501158523603e-05,
+      "loss": 0.7186,
+      "step": 71000
+    },
+    {
+      "epoch": 44.32734035957843,
+      "grad_norm": 0.5381629467010498,
+      "learning_rate": 1.1476272184161418e-05,
+      "loss": 0.7168,
+      "step": 71500
+    },
+    {
+      "epoch": 44.637321760694356,
+      "grad_norm": 0.5467249155044556,
+      "learning_rate": 1.0850043209799233e-05,
+      "loss": 0.7162,
+      "step": 72000
+    },
+    {
+      "epoch": 44.94730316181029,
+      "grad_norm": 0.5548228025436401,
+      "learning_rate": 1.0223814235437046e-05,
+      "loss": 0.7146,
+      "step": 72500
+    },
+    {
+      "epoch": 45.25728456292622,
+      "grad_norm": 0.5488151907920837,
+      "learning_rate": 9.59758526107486e-06,
+      "loss": 0.7152,
+      "step": 73000
+    },
+    {
+      "epoch": 45.56726596404216,
+      "grad_norm": 0.5473387241363525,
+      "learning_rate": 8.971356286712675e-06,
+      "loss": 0.7142,
+      "step": 73500
+    },
+    {
+      "epoch": 45.87724736515809,
+      "grad_norm": 0.5331913828849792,
+      "learning_rate": 8.345127312350489e-06,
+      "loss": 0.7155,
+      "step": 74000
+    },
+    {
+      "epoch": 46.187228766274025,
+      "grad_norm": 0.5443392395973206,
+      "learning_rate": 7.718898337988302e-06,
+      "loss": 0.7136,
+      "step": 74500
+    },
+    {
+      "epoch": 46.49721016738996,
+      "grad_norm": 0.5461409091949463,
+      "learning_rate": 7.092669363626117e-06,
+      "loss": 0.7148,
+      "step": 75000
+    },
+    {
+      "epoch": 46.80719156850589,
+      "grad_norm": 0.5504785180091858,
+      "learning_rate": 6.466440389263931e-06,
+      "loss": 0.7133,
+      "step": 75500
+    },
+    {
+      "epoch": 47.11717296962182,
+      "grad_norm": 0.5478015542030334,
+      "learning_rate": 5.840211414901745e-06,
+      "loss": 0.7125,
+      "step": 76000
+    },
+    {
+      "epoch": 47.42715437073775,
+      "grad_norm": 0.5464319586753845,
+      "learning_rate": 5.2139824405395585e-06,
+      "loss": 0.7125,
+      "step": 76500
+    },
+    {
+      "epoch": 47.73713577185369,
+      "grad_norm": 0.5370163321495056,
+      "learning_rate": 4.587753466177374e-06,
+      "loss": 0.7117,
+      "step": 77000
+    },
+    {
+      "epoch": 48.04711717296962,
+      "grad_norm": 0.5529221892356873,
+      "learning_rate": 3.961524491815188e-06,
+      "loss": 0.711,
+      "step": 77500
+    },
+    {
+      "epoch": 48.357098574085555,
+      "grad_norm": 0.549679160118103,
+      "learning_rate": 3.3352955174530015e-06,
+      "loss": 0.7112,
+      "step": 78000
+    },
+    {
+      "epoch": 48.66707997520149,
+      "grad_norm": 0.5416662096977234,
+      "learning_rate": 2.709066543090816e-06,
+      "loss": 0.7112,
+      "step": 78500
+    },
+    {
+      "epoch": 48.97706137631742,
+      "grad_norm": 0.5428098440170288,
+      "learning_rate": 2.08283756872863e-06,
+      "loss": 0.7109,
+      "step": 79000
+    },
+    {
+      "epoch": 49.287042777433356,
+      "grad_norm": 0.5247154235839844,
+      "learning_rate": 1.4566085943664442e-06,
+      "loss": 0.7106,
+      "step": 79500
+    },
+    {
+      "epoch": 49.59702417854929,
+      "grad_norm": 0.5486724376678467,
+      "learning_rate": 8.303796200042584e-07,
+      "loss": 0.7097,
+      "step": 80000
+    },
+    {
+      "epoch": 49.90700557966522,
+      "grad_norm": 0.5495786070823669,
+      "learning_rate": 2.0415064564207257e-07,
+      "loss": 0.7106,
+      "step": 80500
+    },
+    {
+      "epoch": 50.0,
+      "step": 80650,
+      "total_flos": 2.052104150815488e+18,
+      "train_loss": 0.04098836247254364,
+      "train_runtime": 10357.3823,
+      "train_samples_per_second": 11959.61,
+      "train_steps_per_second": 7.787
+    }
+  ],
+  "logging_steps": 500,
+  "max_steps": 80650,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 50,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.052104150815488e+18,
+  "train_batch_size": 192,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bd30ff3a126c35a62842c1cfc50eb0b17ba4cb1c25b1391133e9d8cdeca49418
+size 5112