Upload folder using huggingface_hub
Browse files- adapter_config.json +4 -4
 - adapter_model.safetensors +1 -1
 - checkpoint-1245/adapter_config.json +4 -4
 - checkpoint-1245/adapter_model.safetensors +1 -1
 - checkpoint-1245/optimizer.pt +1 -1
 - checkpoint-1245/rng_state.pth +1 -1
 - checkpoint-1245/scheduler.pt +1 -1
 - checkpoint-1245/trainer_state.json +263 -263
 - checkpoint-1245/training_args.bin +1 -1
 - checkpoint-1660/adapter_config.json +4 -4
 - checkpoint-1660/adapter_model.safetensors +1 -1
 - checkpoint-1660/optimizer.pt +1 -1
 - checkpoint-1660/rng_state.pth +1 -1
 - checkpoint-1660/scheduler.pt +1 -1
 - checkpoint-1660/trainer_state.json +353 -353
 - checkpoint-1660/training_args.bin +1 -1
 - checkpoint-2075/adapter_config.json +4 -4
 - checkpoint-2075/adapter_model.safetensors +1 -1
 - checkpoint-2075/optimizer.pt +1 -1
 - checkpoint-2075/rng_state.pth +1 -1
 - checkpoint-2075/scheduler.pt +1 -1
 - checkpoint-2075/trainer_state.json +443 -443
 - checkpoint-2075/training_args.bin +1 -1
 - checkpoint-415/adapter_config.json +4 -4
 - checkpoint-415/adapter_model.safetensors +1 -1
 - checkpoint-415/optimizer.pt +1 -1
 - checkpoint-415/scheduler.pt +1 -1
 - checkpoint-415/trainer_state.json +88 -88
 - checkpoint-415/training_args.bin +1 -1
 - checkpoint-830/adapter_config.json +4 -4
 - checkpoint-830/adapter_model.safetensors +1 -1
 - checkpoint-830/optimizer.pt +1 -1
 - checkpoint-830/rng_state.pth +1 -1
 - checkpoint-830/scheduler.pt +1 -1
 - checkpoint-830/trainer_state.json +178 -178
 - checkpoint-830/training_args.bin +1 -1
 - runs/Aug06_01-20-31_pan/events.out.tfevents.1754457633.pan.717279.0 +3 -0
 - runs/Aug06_01-20-31_pan/events.out.tfevents.1754464590.pan.717279.1 +3 -0
 - runs/Aug06_10-56-17_pan/events.out.tfevents.1754492237.pan.744812.0 +3 -0
 
    	
        adapter_config.json
    CHANGED
    
    | 
         @@ -25,12 +25,12 @@ 
     | 
|
| 25 | 
         
             
              "revision": null,
         
     | 
| 26 | 
         
             
              "target_modules": [
         
     | 
| 27 | 
         
             
                "gate_proj",
         
     | 
| 28 | 
         
            -
                " 
     | 
| 
         | 
|
| 29 | 
         
             
                "down_proj",
         
     | 
| 30 | 
         
             
                "o_proj",
         
     | 
| 31 | 
         
            -
                " 
     | 
| 32 | 
         
            -
                "up_proj" 
     | 
| 33 | 
         
            -
                "q_proj"
         
     | 
| 34 | 
         
             
              ],
         
     | 
| 35 | 
         
             
              "task_type": "CAUSAL_LM",
         
     | 
| 36 | 
         
             
              "trainable_token_indices": null,
         
     | 
| 
         | 
|
| 25 | 
         
             
              "revision": null,
         
     | 
| 26 | 
         
             
              "target_modules": [
         
     | 
| 27 | 
         
             
                "gate_proj",
         
     | 
| 28 | 
         
            +
                "v_proj",
         
     | 
| 29 | 
         
            +
                "q_proj",
         
     | 
| 30 | 
         
             
                "down_proj",
         
     | 
| 31 | 
         
             
                "o_proj",
         
     | 
| 32 | 
         
            +
                "k_proj",
         
     | 
| 33 | 
         
            +
                "up_proj"
         
     | 
| 
         | 
|
| 34 | 
         
             
              ],
         
     | 
| 35 | 
         
             
              "task_type": "CAUSAL_LM",
         
     | 
| 36 | 
         
             
              "trainable_token_indices": null,
         
     | 
    	
        adapter_model.safetensors
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 335604696
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:9f5defc89229b64935bf3a4cdd33bed60f970fb87012f2182df603c88c1df0f6
         
     | 
| 3 | 
         
             
            size 335604696
         
     | 
    	
        checkpoint-1245/adapter_config.json
    CHANGED
    
    | 
         @@ -25,12 +25,12 @@ 
     | 
|
| 25 | 
         
             
              "revision": null,
         
     | 
| 26 | 
         
             
              "target_modules": [
         
     | 
| 27 | 
         
             
                "gate_proj",
         
     | 
| 28 | 
         
            -
                " 
     | 
| 
         | 
|
| 29 | 
         
             
                "down_proj",
         
     | 
| 30 | 
         
             
                "o_proj",
         
     | 
| 31 | 
         
            -
                " 
     | 
| 32 | 
         
            -
                "up_proj" 
     | 
| 33 | 
         
            -
                "q_proj"
         
     | 
| 34 | 
         
             
              ],
         
     | 
| 35 | 
         
             
              "task_type": "CAUSAL_LM",
         
     | 
| 36 | 
         
             
              "trainable_token_indices": null,
         
     | 
| 
         | 
|
| 25 | 
         
             
              "revision": null,
         
     | 
| 26 | 
         
             
              "target_modules": [
         
     | 
| 27 | 
         
             
                "gate_proj",
         
     | 
| 28 | 
         
            +
                "v_proj",
         
     | 
| 29 | 
         
            +
                "q_proj",
         
     | 
| 30 | 
         
             
                "down_proj",
         
     | 
| 31 | 
         
             
                "o_proj",
         
     | 
| 32 | 
         
            +
                "k_proj",
         
     | 
| 33 | 
         
            +
                "up_proj"
         
     | 
| 
         | 
|
| 34 | 
         
             
              ],
         
     | 
| 35 | 
         
             
              "task_type": "CAUSAL_LM",
         
     | 
| 36 | 
         
             
              "trainable_token_indices": null,
         
     | 
    	
        checkpoint-1245/adapter_model.safetensors
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 335604696
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:714d28ff52502beaa7aa663f61bf90883f1d5a6d8f7c44543a8c774acad41914
         
     | 
| 3 | 
         
             
            size 335604696
         
     | 
    	
        checkpoint-1245/optimizer.pt
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 671365003
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:241dd5102c399d348ae08a4e6d1f2ae7224df4fbf30e0b7ab517db05b9bcbdc4
         
     | 
| 3 | 
         
             
            size 671365003
         
     | 
    	
        checkpoint-1245/rng_state.pth
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 14645
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:eaf298bdeef62c77ad6faee52a0cfd162ba360b2efe097f86517c2c7b20b1051
         
     | 
| 3 | 
         
             
            size 14645
         
     | 
    	
        checkpoint-1245/scheduler.pt
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 1465
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:afcf24a919c10327f09cf2cfb38a92229f10d5eecb3efe6c943205819e281b06
         
     | 
| 3 | 
         
             
            size 1465
         
     | 
    	
        checkpoint-1245/trainer_state.json
    CHANGED
    
    | 
         @@ -11,480 +11,480 @@ 
     | 
|
| 11 | 
         
             
              "log_history": [
         
     | 
| 12 | 
         
             
                {
         
     | 
| 13 | 
         
             
                  "epoch": 0.060350030175015085,
         
     | 
| 14 | 
         
            -
                  "grad_norm": 0. 
     | 
| 15 | 
         
            -
                  "learning_rate":  
     | 
| 16 | 
         
            -
                  "loss": 1. 
     | 
| 17 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 18 | 
         
            -
                  "num_tokens":  
     | 
| 19 | 
         
             
                  "step": 25
         
     | 
| 20 | 
         
             
                },
         
     | 
| 21 | 
         
             
                {
         
     | 
| 22 | 
         
             
                  "epoch": 0.12070006035003017,
         
     | 
| 23 | 
         
            -
                  "grad_norm": 0. 
     | 
| 24 | 
         
            -
                  "learning_rate": 0. 
     | 
| 25 | 
         
            -
                  "loss": 0. 
     | 
| 26 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 27 | 
         
            -
                  "num_tokens":  
     | 
| 28 | 
         
             
                  "step": 50
         
     | 
| 29 | 
         
             
                },
         
     | 
| 30 | 
         
             
                {
         
     | 
| 31 | 
         
             
                  "epoch": 0.18105009052504525,
         
     | 
| 32 | 
         
            -
                  "grad_norm": 0. 
     | 
| 33 | 
         
            -
                  "learning_rate": 0. 
     | 
| 34 | 
         
            -
                  "loss": 0. 
     | 
| 35 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 36 | 
         
            -
                  "num_tokens":  
     | 
| 37 | 
         
             
                  "step": 75
         
     | 
| 38 | 
         
             
                },
         
     | 
| 39 | 
         
             
                {
         
     | 
| 40 | 
         
             
                  "epoch": 0.24140012070006034,
         
     | 
| 41 | 
         
            -
                  "grad_norm": 0. 
     | 
| 42 | 
         
            -
                  "learning_rate": 0. 
     | 
| 43 | 
         
            -
                  "loss": 0. 
     | 
| 44 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 45 | 
         
            -
                  "num_tokens":  
     | 
| 46 | 
         
             
                  "step": 100
         
     | 
| 47 | 
         
             
                },
         
     | 
| 48 | 
         
             
                {
         
     | 
| 49 | 
         
             
                  "epoch": 0.30175015087507545,
         
     | 
| 50 | 
         
            -
                  "grad_norm": 0. 
     | 
| 51 | 
         
            -
                  "learning_rate": 0. 
     | 
| 52 | 
         
            -
                  "loss": 0. 
     | 
| 53 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 54 | 
         
            -
                  "num_tokens":  
     | 
| 55 | 
         
             
                  "step": 125
         
     | 
| 56 | 
         
             
                },
         
     | 
| 57 | 
         
             
                {
         
     | 
| 58 | 
         
             
                  "epoch": 0.3621001810500905,
         
     | 
| 59 | 
         
            -
                  "grad_norm":  
     | 
| 60 | 
         
            -
                  "learning_rate": 0. 
     | 
| 61 | 
         
            -
                  "loss": 0. 
     | 
| 62 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 63 | 
         
            -
                  "num_tokens":  
     | 
| 64 | 
         
             
                  "step": 150
         
     | 
| 65 | 
         
             
                },
         
     | 
| 66 | 
         
             
                {
         
     | 
| 67 | 
         
             
                  "epoch": 0.4224502112251056,
         
     | 
| 68 | 
         
            -
                  "grad_norm": 0. 
     | 
| 69 | 
         
            -
                  "learning_rate": 0. 
     | 
| 70 | 
         
            -
                  "loss": 0. 
     | 
| 71 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 72 | 
         
            -
                  "num_tokens":  
     | 
| 73 | 
         
             
                  "step": 175
         
     | 
| 74 | 
         
             
                },
         
     | 
| 75 | 
         
             
                {
         
     | 
| 76 | 
         
             
                  "epoch": 0.4828002414001207,
         
     | 
| 77 | 
         
            -
                  "grad_norm": 0. 
     | 
| 78 | 
         
            -
                  "learning_rate": 0. 
     | 
| 79 | 
         
            -
                  "loss": 0. 
     | 
| 80 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 81 | 
         
            -
                  "num_tokens":  
     | 
| 82 | 
         
             
                  "step": 200
         
     | 
| 83 | 
         
             
                },
         
     | 
| 84 | 
         
             
                {
         
     | 
| 85 | 
         
             
                  "epoch": 0.5431502715751357,
         
     | 
| 86 | 
         
            -
                  "grad_norm": 0. 
     | 
| 87 | 
         
            -
                  "learning_rate": 0. 
     | 
| 88 | 
         
            -
                  "loss": 0. 
     | 
| 89 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 90 | 
         
            -
                  "num_tokens":  
     | 
| 91 | 
         
             
                  "step": 225
         
     | 
| 92 | 
         
             
                },
         
     | 
| 93 | 
         
             
                {
         
     | 
| 94 | 
         
             
                  "epoch": 0.6035003017501509,
         
     | 
| 95 | 
         
            -
                  "grad_norm": 0. 
     | 
| 96 | 
         
            -
                  "learning_rate": 0. 
     | 
| 97 | 
         
            -
                  "loss": 0. 
     | 
| 98 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 99 | 
         
            -
                  "num_tokens":  
     | 
| 100 | 
         
             
                  "step": 250
         
     | 
| 101 | 
         
             
                },
         
     | 
| 102 | 
         
             
                {
         
     | 
| 103 | 
         
             
                  "epoch": 0.663850331925166,
         
     | 
| 104 | 
         
            -
                  "grad_norm": 0. 
     | 
| 105 | 
         
            -
                  "learning_rate": 0. 
     | 
| 106 | 
         
            -
                  "loss": 0. 
     | 
| 107 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 108 | 
         
            -
                  "num_tokens":  
     | 
| 109 | 
         
             
                  "step": 275
         
     | 
| 110 | 
         
             
                },
         
     | 
| 111 | 
         
             
                {
         
     | 
| 112 | 
         
             
                  "epoch": 0.724200362100181,
         
     | 
| 113 | 
         
            -
                  "grad_norm": 0. 
     | 
| 114 | 
         
            -
                  "learning_rate": 0. 
     | 
| 115 | 
         
            -
                  "loss": 0. 
     | 
| 116 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 117 | 
         
            -
                  "num_tokens":  
     | 
| 118 | 
         
             
                  "step": 300
         
     | 
| 119 | 
         
             
                },
         
     | 
| 120 | 
         
             
                {
         
     | 
| 121 | 
         
             
                  "epoch": 0.7845503922751962,
         
     | 
| 122 | 
         
            -
                  "grad_norm": 0. 
     | 
| 123 | 
         
            -
                  "learning_rate": 0. 
     | 
| 124 | 
         
            -
                  "loss": 0. 
     | 
| 125 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 126 | 
         
            -
                  "num_tokens":  
     | 
| 127 | 
         
             
                  "step": 325
         
     | 
| 128 | 
         
             
                },
         
     | 
| 129 | 
         
             
                {
         
     | 
| 130 | 
         
             
                  "epoch": 0.8449004224502112,
         
     | 
| 131 | 
         
            -
                  "grad_norm": 0. 
     | 
| 132 | 
         
            -
                  "learning_rate": 0. 
     | 
| 133 | 
         
            -
                  "loss": 0. 
     | 
| 134 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 135 | 
         
            -
                  "num_tokens":  
     | 
| 136 | 
         
             
                  "step": 350
         
     | 
| 137 | 
         
             
                },
         
     | 
| 138 | 
         
             
                {
         
     | 
| 139 | 
         
             
                  "epoch": 0.9052504526252263,
         
     | 
| 140 | 
         
            -
                  "grad_norm": 0. 
     | 
| 141 | 
         
            -
                  "learning_rate": 0. 
     | 
| 142 | 
         
            -
                  "loss": 0. 
     | 
| 143 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 144 | 
         
            -
                  "num_tokens":  
     | 
| 145 | 
         
             
                  "step": 375
         
     | 
| 146 | 
         
             
                },
         
     | 
| 147 | 
         
             
                {
         
     | 
| 148 | 
         
             
                  "epoch": 0.9656004828002414,
         
     | 
| 149 | 
         
            -
                  "grad_norm": 0. 
     | 
| 150 | 
         
            -
                  "learning_rate": 0. 
     | 
| 151 | 
         
            -
                  "loss": 0. 
     | 
| 152 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 153 | 
         
            -
                  "num_tokens":  
     | 
| 154 | 
         
             
                  "step": 400
         
     | 
| 155 | 
         
             
                },
         
     | 
| 156 | 
         
             
                {
         
     | 
| 157 | 
         
             
                  "epoch": 1.0,
         
     | 
| 158 | 
         
            -
                  "eval_loss": 0. 
     | 
| 159 | 
         
            -
                  "eval_mean_token_accuracy": 0. 
     | 
| 160 | 
         
             
                  "eval_num_tokens": 2223513.0,
         
     | 
| 161 | 
         
            -
                  "eval_runtime": 60. 
     | 
| 162 | 
         
            -
                  "eval_samples_per_second": 6. 
     | 
| 163 | 
         
            -
                  "eval_steps_per_second": 3. 
     | 
| 164 | 
         
             
                  "step": 415
         
     | 
| 165 | 
         
             
                },
         
     | 
| 166 | 
         
             
                {
         
     | 
| 167 | 
         
             
                  "epoch": 1.024140012070006,
         
     | 
| 168 | 
         
            -
                  "grad_norm": 0. 
     | 
| 169 | 
         
            -
                  "learning_rate": 0. 
     | 
| 170 | 
         
            -
                  "loss": 0. 
     | 
| 171 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 172 | 
         
            -
                  "num_tokens":  
     | 
| 173 | 
         
             
                  "step": 425
         
     | 
| 174 | 
         
             
                },
         
     | 
| 175 | 
         
             
                {
         
     | 
| 176 | 
         
             
                  "epoch": 1.0844900422450212,
         
     | 
| 177 | 
         
            -
                  "grad_norm": 0. 
     | 
| 178 | 
         
            -
                  "learning_rate": 0. 
     | 
| 179 | 
         
            -
                  "loss": 0. 
     | 
| 180 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 181 | 
         
            -
                  "num_tokens":  
     | 
| 182 | 
         
             
                  "step": 450
         
     | 
| 183 | 
         
             
                },
         
     | 
| 184 | 
         
             
                {
         
     | 
| 185 | 
         
             
                  "epoch": 1.1448400724200363,
         
     | 
| 186 | 
         
            -
                  "grad_norm": 0. 
     | 
| 187 | 
         
            -
                  "learning_rate": 0. 
     | 
| 188 | 
         
            -
                  "loss": 0. 
     | 
| 189 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 190 | 
         
            -
                  "num_tokens":  
     | 
| 191 | 
         
             
                  "step": 475
         
     | 
| 192 | 
         
             
                },
         
     | 
| 193 | 
         
             
                {
         
     | 
| 194 | 
         
             
                  "epoch": 1.2051901025950513,
         
     | 
| 195 | 
         
            -
                  "grad_norm": 0. 
     | 
| 196 | 
         
            -
                  "learning_rate": 0. 
     | 
| 197 | 
         
            -
                  "loss": 0. 
     | 
| 198 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 199 | 
         
            -
                  "num_tokens":  
     | 
| 200 | 
         
             
                  "step": 500
         
     | 
| 201 | 
         
             
                },
         
     | 
| 202 | 
         
             
                {
         
     | 
| 203 | 
         
             
                  "epoch": 1.2655401327700664,
         
     | 
| 204 | 
         
            -
                  "grad_norm": 0. 
     | 
| 205 | 
         
            -
                  "learning_rate": 0. 
     | 
| 206 | 
         
            -
                  "loss": 0. 
     | 
| 207 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 208 | 
         
            -
                  "num_tokens":  
     | 
| 209 | 
         
             
                  "step": 525
         
     | 
| 210 | 
         
             
                },
         
     | 
| 211 | 
         
             
                {
         
     | 
| 212 | 
         
             
                  "epoch": 1.3258901629450814,
         
     | 
| 213 | 
         
            -
                  "grad_norm": 0. 
     | 
| 214 | 
         
            -
                  "learning_rate": 0. 
     | 
| 215 | 
         
            -
                  "loss": 0. 
     | 
| 216 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 217 | 
         
            -
                  "num_tokens":  
     | 
| 218 | 
         
             
                  "step": 550
         
     | 
| 219 | 
         
             
                },
         
     | 
| 220 | 
         
             
                {
         
     | 
| 221 | 
         
             
                  "epoch": 1.3862401931200965,
         
     | 
| 222 | 
         
            -
                  "grad_norm": 0. 
     | 
| 223 | 
         
            -
                  "learning_rate": 0. 
     | 
| 224 | 
         
            -
                  "loss": 0. 
     | 
| 225 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 226 | 
         
            -
                  "num_tokens":  
     | 
| 227 | 
         
             
                  "step": 575
         
     | 
| 228 | 
         
             
                },
         
     | 
| 229 | 
         
             
                {
         
     | 
| 230 | 
         
             
                  "epoch": 1.4465902232951118,
         
     | 
| 231 | 
         
            -
                  "grad_norm": 0. 
     | 
| 232 | 
         
            -
                  "learning_rate": 0. 
     | 
| 233 | 
         
            -
                  "loss": 0. 
     | 
| 234 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 235 | 
         
            -
                  "num_tokens":  
     | 
| 236 | 
         
             
                  "step": 600
         
     | 
| 237 | 
         
             
                },
         
     | 
| 238 | 
         
             
                {
         
     | 
| 239 | 
         
             
                  "epoch": 1.5069402534701268,
         
     | 
| 240 | 
         
            -
                  "grad_norm": 0. 
     | 
| 241 | 
         
            -
                  "learning_rate": 0. 
     | 
| 242 | 
         
            -
                  "loss": 0. 
     | 
| 243 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 244 | 
         
            -
                  "num_tokens":  
     | 
| 245 | 
         
             
                  "step": 625
         
     | 
| 246 | 
         
             
                },
         
     | 
| 247 | 
         
             
                {
         
     | 
| 248 | 
         
             
                  "epoch": 1.567290283645142,
         
     | 
| 249 | 
         
            -
                  "grad_norm": 0. 
     | 
| 250 | 
         
            -
                  "learning_rate": 0. 
     | 
| 251 | 
         
            -
                  "loss": 0. 
     | 
| 252 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 253 | 
         
            -
                  "num_tokens":  
     | 
| 254 | 
         
             
                  "step": 650
         
     | 
| 255 | 
         
             
                },
         
     | 
| 256 | 
         
             
                {
         
     | 
| 257 | 
         
             
                  "epoch": 1.627640313820157,
         
     | 
| 258 | 
         
            -
                  "grad_norm": 0. 
     | 
| 259 | 
         
            -
                  "learning_rate": 0. 
     | 
| 260 | 
         
            -
                  "loss": 0. 
     | 
| 261 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 262 | 
         
            -
                  "num_tokens":  
     | 
| 263 | 
         
             
                  "step": 675
         
     | 
| 264 | 
         
             
                },
         
     | 
| 265 | 
         
             
                {
         
     | 
| 266 | 
         
             
                  "epoch": 1.687990343995172,
         
     | 
| 267 | 
         
            -
                  "grad_norm": 0. 
     | 
| 268 | 
         
            -
                  "learning_rate": 0. 
     | 
| 269 | 
         
            -
                  "loss": 0. 
     | 
| 270 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 271 | 
         
            -
                  "num_tokens":  
     | 
| 272 | 
         
             
                  "step": 700
         
     | 
| 273 | 
         
             
                },
         
     | 
| 274 | 
         
             
                {
         
     | 
| 275 | 
         
             
                  "epoch": 1.748340374170187,
         
     | 
| 276 | 
         
            -
                  "grad_norm": 0. 
     | 
| 277 | 
         
            -
                  "learning_rate": 0. 
     | 
| 278 | 
         
            -
                  "loss": 0. 
     | 
| 279 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 280 | 
         
            -
                  "num_tokens":  
     | 
| 281 | 
         
             
                  "step": 725
         
     | 
| 282 | 
         
             
                },
         
     | 
| 283 | 
         
             
                {
         
     | 
| 284 | 
         
             
                  "epoch": 1.8086904043452021,
         
     | 
| 285 | 
         
            -
                  "grad_norm": 0. 
     | 
| 286 | 
         
            -
                  "learning_rate": 0. 
     | 
| 287 | 
         
            -
                  "loss": 0. 
     | 
| 288 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 289 | 
         
            -
                  "num_tokens":  
     | 
| 290 | 
         
             
                  "step": 750
         
     | 
| 291 | 
         
             
                },
         
     | 
| 292 | 
         
             
                {
         
     | 
| 293 | 
         
             
                  "epoch": 1.8690404345202172,
         
     | 
| 294 | 
         
            -
                  "grad_norm": 0. 
     | 
| 295 | 
         
            -
                  "learning_rate": 0. 
     | 
| 296 | 
         
            -
                  "loss": 0. 
     | 
| 297 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 298 | 
         
            -
                  "num_tokens":  
     | 
| 299 | 
         
             
                  "step": 775
         
     | 
| 300 | 
         
             
                },
         
     | 
| 301 | 
         
             
                {
         
     | 
| 302 | 
         
             
                  "epoch": 1.9293904646952322,
         
     | 
| 303 | 
         
            -
                  "grad_norm": 0. 
     | 
| 304 | 
         
            -
                  "learning_rate": 0. 
     | 
| 305 | 
         
            -
                  "loss": 0. 
     | 
| 306 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 307 | 
         
            -
                  "num_tokens":  
     | 
| 308 | 
         
             
                  "step": 800
         
     | 
| 309 | 
         
             
                },
         
     | 
| 310 | 
         
             
                {
         
     | 
| 311 | 
         
             
                  "epoch": 1.9897404948702473,
         
     | 
| 312 | 
         
            -
                  "grad_norm": 0. 
     | 
| 313 | 
         
            -
                  "learning_rate": 0. 
     | 
| 314 | 
         
            -
                  "loss": 0. 
     | 
| 315 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 316 | 
         
            -
                  "num_tokens":  
     | 
| 317 | 
         
             
                  "step": 825
         
     | 
| 318 | 
         
             
                },
         
     | 
| 319 | 
         
             
                {
         
     | 
| 320 | 
         
             
                  "epoch": 2.0,
         
     | 
| 321 | 
         
            -
                  "eval_loss": 0. 
     | 
| 322 | 
         
            -
                  "eval_mean_token_accuracy": 0. 
     | 
| 323 | 
         
             
                  "eval_num_tokens": 4447026.0,
         
     | 
| 324 | 
         
            -
                  "eval_runtime": 60. 
     | 
| 325 | 
         
            -
                  "eval_samples_per_second": 6. 
     | 
| 326 | 
         
            -
                  "eval_steps_per_second": 3. 
     | 
| 327 | 
         
             
                  "step": 830
         
     | 
| 328 | 
         
             
                },
         
     | 
| 329 | 
         
             
                {
         
     | 
| 330 | 
         
             
                  "epoch": 2.048280024140012,
         
     | 
| 331 | 
         
            -
                  "grad_norm": 0. 
     | 
| 332 | 
         
            -
                  "learning_rate": 0. 
     | 
| 333 | 
         
            -
                  "loss": 0. 
     | 
| 334 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 335 | 
         
            -
                  "num_tokens":  
     | 
| 336 | 
         
             
                  "step": 850
         
     | 
| 337 | 
         
             
                },
         
     | 
| 338 | 
         
             
                {
         
     | 
| 339 | 
         
             
                  "epoch": 2.1086300543150274,
         
     | 
| 340 | 
         
            -
                  "grad_norm": 0. 
     | 
| 341 | 
         
            -
                  "learning_rate": 0. 
     | 
| 342 | 
         
            -
                  "loss": 0. 
     | 
| 343 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 344 | 
         
            -
                  "num_tokens":  
     | 
| 345 | 
         
             
                  "step": 875
         
     | 
| 346 | 
         
             
                },
         
     | 
| 347 | 
         
             
                {
         
     | 
| 348 | 
         
             
                  "epoch": 2.1689800844900424,
         
     | 
| 349 | 
         
            -
                  "grad_norm": 0. 
     | 
| 350 | 
         
            -
                  "learning_rate": 0. 
     | 
| 351 | 
         
            -
                  "loss": 0. 
     | 
| 352 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 353 | 
         
            -
                  "num_tokens":  
     | 
| 354 | 
         
             
                  "step": 900
         
     | 
| 355 | 
         
             
                },
         
     | 
| 356 | 
         
             
                {
         
     | 
| 357 | 
         
             
                  "epoch": 2.2293301146650575,
         
     | 
| 358 | 
         
            -
                  "grad_norm": 0. 
     | 
| 359 | 
         
            -
                  "learning_rate": 0. 
     | 
| 360 | 
         
            -
                  "loss": 0. 
     | 
| 361 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 362 | 
         
            -
                  "num_tokens":  
     | 
| 363 | 
         
             
                  "step": 925
         
     | 
| 364 | 
         
             
                },
         
     | 
| 365 | 
         
             
                {
         
     | 
| 366 | 
         
             
                  "epoch": 2.2896801448400725,
         
     | 
| 367 | 
         
            -
                  "grad_norm": 0. 
     | 
| 368 | 
         
            -
                  "learning_rate": 0. 
     | 
| 369 | 
         
            -
                  "loss": 0. 
     | 
| 370 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 371 | 
         
            -
                  "num_tokens":  
     | 
| 372 | 
         
             
                  "step": 950
         
     | 
| 373 | 
         
             
                },
         
     | 
| 374 | 
         
             
                {
         
     | 
| 375 | 
         
             
                  "epoch": 2.3500301750150876,
         
     | 
| 376 | 
         
            -
                  "grad_norm": 0. 
     | 
| 377 | 
         
            -
                  "learning_rate": 0. 
     | 
| 378 | 
         
            -
                  "loss": 0. 
     | 
| 379 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 380 | 
         
            -
                  "num_tokens":  
     | 
| 381 | 
         
             
                  "step": 975
         
     | 
| 382 | 
         
             
                },
         
     | 
| 383 | 
         
             
                {
         
     | 
| 384 | 
         
             
                  "epoch": 2.4103802051901027,
         
     | 
| 385 | 
         
            -
                  "grad_norm": 0. 
     | 
| 386 | 
         
            -
                  "learning_rate": 0. 
     | 
| 387 | 
         
            -
                  "loss": 0. 
     | 
| 388 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 389 | 
         
            -
                  "num_tokens":  
     | 
| 390 | 
         
             
                  "step": 1000
         
     | 
| 391 | 
         
             
                },
         
     | 
| 392 | 
         
             
                {
         
     | 
| 393 | 
         
             
                  "epoch": 2.4707302353651177,
         
     | 
| 394 | 
         
            -
                  "grad_norm": 0. 
     | 
| 395 | 
         
            -
                  "learning_rate": 0. 
     | 
| 396 | 
         
            -
                  "loss": 0. 
     | 
| 397 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 398 | 
         
            -
                  "num_tokens":  
     | 
| 399 | 
         
             
                  "step": 1025
         
     | 
| 400 | 
         
             
                },
         
     | 
| 401 | 
         
             
                {
         
     | 
| 402 | 
         
             
                  "epoch": 2.5310802655401328,
         
     | 
| 403 | 
         
            -
                  "grad_norm": 0. 
     | 
| 404 | 
         
            -
                  "learning_rate": 0. 
     | 
| 405 | 
         
            -
                  "loss": 0. 
     | 
| 406 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 407 | 
         
            -
                  "num_tokens":  
     | 
| 408 | 
         
             
                  "step": 1050
         
     | 
| 409 | 
         
             
                },
         
     | 
| 410 | 
         
             
                {
         
     | 
| 411 | 
         
             
                  "epoch": 2.591430295715148,
         
     | 
| 412 | 
         
            -
                  "grad_norm": 0. 
     | 
| 413 | 
         
            -
                  "learning_rate": 0. 
     | 
| 414 | 
         
            -
                  "loss": 0. 
     | 
| 415 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 416 | 
         
            -
                  "num_tokens":  
     | 
| 417 | 
         
             
                  "step": 1075
         
     | 
| 418 | 
         
             
                },
         
     | 
| 419 | 
         
             
                {
         
     | 
| 420 | 
         
             
                  "epoch": 2.651780325890163,
         
     | 
| 421 | 
         
            -
                  "grad_norm": 0. 
     | 
| 422 | 
         
            -
                  "learning_rate": 0. 
     | 
| 423 | 
         
            -
                  "loss": 0. 
     | 
| 424 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 425 | 
         
            -
                  "num_tokens":  
     | 
| 426 | 
         
             
                  "step": 1100
         
     | 
| 427 | 
         
             
                },
         
     | 
| 428 | 
         
             
                {
         
     | 
| 429 | 
         
             
                  "epoch": 2.712130356065178,
         
     | 
| 430 | 
         
            -
                  "grad_norm": 0. 
     | 
| 431 | 
         
            -
                  "learning_rate": 0. 
     | 
| 432 | 
         
            -
                  "loss": 0. 
     | 
| 433 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 434 | 
         
            -
                  "num_tokens":  
     | 
| 435 | 
         
             
                  "step": 1125
         
     | 
| 436 | 
         
             
                },
         
     | 
| 437 | 
         
             
                {
         
     | 
| 438 | 
         
             
                  "epoch": 2.772480386240193,
         
     | 
| 439 | 
         
            -
                  "grad_norm": 0. 
     | 
| 440 | 
         
            -
                  "learning_rate": 0. 
     | 
| 441 | 
         
            -
                  "loss": 0. 
     | 
| 442 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 443 | 
         
            -
                  "num_tokens":  
     | 
| 444 | 
         
             
                  "step": 1150
         
     | 
| 445 | 
         
             
                },
         
     | 
| 446 | 
         
             
                {
         
     | 
| 447 | 
         
             
                  "epoch": 2.832830416415208,
         
     | 
| 448 | 
         
            -
                  "grad_norm": 0. 
     | 
| 449 | 
         
            -
                  "learning_rate": 0. 
     | 
| 450 | 
         
            -
                  "loss": 0. 
     | 
| 451 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 452 | 
         
            -
                  "num_tokens":  
     | 
| 453 | 
         
             
                  "step": 1175
         
     | 
| 454 | 
         
             
                },
         
     | 
| 455 | 
         
             
                {
         
     | 
| 456 | 
         
             
                  "epoch": 2.8931804465902236,
         
     | 
| 457 | 
         
            -
                  "grad_norm": 0. 
     | 
| 458 | 
         
            -
                  "learning_rate": 0. 
     | 
| 459 | 
         
            -
                  "loss": 0. 
     | 
| 460 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 461 | 
         
            -
                  "num_tokens":  
     | 
| 462 | 
         
             
                  "step": 1200
         
     | 
| 463 | 
         
             
                },
         
     | 
| 464 | 
         
             
                {
         
     | 
| 465 | 
         
             
                  "epoch": 2.9535304767652386,
         
     | 
| 466 | 
         
            -
                  "grad_norm": 0. 
     | 
| 467 | 
         
            -
                  "learning_rate": 0. 
     | 
| 468 | 
         
            -
                  "loss": 0. 
     | 
| 469 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 470 | 
         
            -
                  "num_tokens":  
     | 
| 471 | 
         
             
                  "step": 1225
         
     | 
| 472 | 
         
             
                },
         
     | 
| 473 | 
         
             
                {
         
     | 
| 474 | 
         
             
                  "epoch": 3.0,
         
     | 
| 475 | 
         
            -
                  "eval_loss": 0. 
     | 
| 476 | 
         
            -
                  "eval_mean_token_accuracy": 0. 
     | 
| 477 | 
         
             
                  "eval_num_tokens": 6670539.0,
         
     | 
| 478 | 
         
            -
                  "eval_runtime": 60. 
     | 
| 479 | 
         
            -
                  "eval_samples_per_second": 6. 
     | 
| 480 | 
         
            -
                  "eval_steps_per_second": 3. 
     | 
| 481 | 
         
             
                  "step": 1245
         
     | 
| 482 | 
         
             
                }
         
     | 
| 483 | 
         
             
              ],
         
     | 
| 484 | 
         
             
              "logging_steps": 25,
         
     | 
| 485 | 
         
            -
              "max_steps":  
     | 
| 486 | 
         
             
              "num_input_tokens_seen": 0,
         
     | 
| 487 | 
         
            -
              "num_train_epochs":  
     | 
| 488 | 
         
             
              "save_steps": 500,
         
     | 
| 489 | 
         
             
              "stateful_callbacks": {
         
     | 
| 490 | 
         
             
                "TrainerControl": {
         
     | 
| 
         @@ -498,7 +498,7 @@ 
     | 
|
| 498 | 
         
             
                  "attributes": {}
         
     | 
| 499 | 
         
             
                }
         
     | 
| 500 | 
         
             
              },
         
     | 
| 501 | 
         
            -
              "total_flos": 2. 
     | 
| 502 | 
         
             
              "train_batch_size": 2,
         
     | 
| 503 | 
         
             
              "trial_name": null,
         
     | 
| 504 | 
         
             
              "trial_params": null
         
     | 
| 
         | 
|
| 11 | 
         
             
              "log_history": [
         
     | 
| 12 | 
         
             
                {
         
     | 
| 13 | 
         
             
                  "epoch": 0.060350030175015085,
         
     | 
| 14 | 
         
            +
                  "grad_norm": 0.7244853377342224,
         
     | 
| 15 | 
         
            +
                  "learning_rate": 0.00011428571428571427,
         
     | 
| 16 | 
         
            +
                  "loss": 1.5091,
         
     | 
| 17 | 
         
            +
                  "mean_token_accuracy": 0.6793113535642624,
         
     | 
| 18 | 
         
            +
                  "num_tokens": 152165.0,
         
     | 
| 19 | 
         
             
                  "step": 25
         
     | 
| 20 | 
         
             
                },
         
     | 
| 21 | 
         
             
                {
         
     | 
| 22 | 
         
             
                  "epoch": 0.12070006035003017,
         
     | 
| 23 | 
         
            +
                  "grad_norm": 0.8389242887496948,
         
     | 
| 24 | 
         
            +
                  "learning_rate": 0.0002333333333333333,
         
     | 
| 25 | 
         
            +
                  "loss": 0.8436,
         
     | 
| 26 | 
         
            +
                  "mean_token_accuracy": 0.7881802421808243,
         
     | 
| 27 | 
         
            +
                  "num_tokens": 267390.0,
         
     | 
| 28 | 
         
             
                  "step": 50
         
     | 
| 29 | 
         
             
                },
         
     | 
| 30 | 
         
             
                {
         
     | 
| 31 | 
         
             
                  "epoch": 0.18105009052504525,
         
     | 
| 32 | 
         
            +
                  "grad_norm": 0.6344988942146301,
         
     | 
| 33 | 
         
            +
                  "learning_rate": 0.00029997787517981614,
         
     | 
| 34 | 
         
            +
                  "loss": 0.5527,
         
     | 
| 35 | 
         
            +
                  "mean_token_accuracy": 0.8469069242477417,
         
     | 
| 36 | 
         
            +
                  "num_tokens": 420975.0,
         
     | 
| 37 | 
         
             
                  "step": 75
         
     | 
| 38 | 
         
             
                },
         
     | 
| 39 | 
         
             
                {
         
     | 
| 40 | 
         
             
                  "epoch": 0.24140012070006034,
         
     | 
| 41 | 
         
            +
                  "grad_norm": 0.7947192192077637,
         
     | 
| 42 | 
         
            +
                  "learning_rate": 0.0002997630832860032,
         
     | 
| 43 | 
         
            +
                  "loss": 0.4522,
         
     | 
| 44 | 
         
            +
                  "mean_token_accuracy": 0.870941441655159,
         
     | 
| 45 | 
         
            +
                  "num_tokens": 538380.0,
         
     | 
| 46 | 
         
             
                  "step": 100
         
     | 
| 47 | 
         
             
                },
         
     | 
| 48 | 
         
             
                {
         
     | 
| 49 | 
         
             
                  "epoch": 0.30175015087507545,
         
     | 
| 50 | 
         
            +
                  "grad_norm": 0.43716728687286377,
         
     | 
| 51 | 
         
            +
                  "learning_rate": 0.0002993201135681549,
         
     | 
| 52 | 
         
            +
                  "loss": 0.3049,
         
     | 
| 53 | 
         
            +
                  "mean_token_accuracy": 0.9136220461130142,
         
     | 
| 54 | 
         
            +
                  "num_tokens": 690650.0,
         
     | 
| 55 | 
         
             
                  "step": 125
         
     | 
| 56 | 
         
             
                },
         
     | 
| 57 | 
         
             
                {
         
     | 
| 58 | 
         
             
                  "epoch": 0.3621001810500905,
         
     | 
| 59 | 
         
            +
                  "grad_norm": 1.09097421169281,
         
     | 
| 60 | 
         
            +
                  "learning_rate": 0.0002986496409313553,
         
     | 
| 61 | 
         
            +
                  "loss": 0.3172,
         
     | 
| 62 | 
         
            +
                  "mean_token_accuracy": 0.91127048432827,
         
     | 
| 63 | 
         
            +
                  "num_tokens": 806066.0,
         
     | 
| 64 | 
         
             
                  "step": 150
         
     | 
| 65 | 
         
             
                },
         
     | 
| 66 | 
         
             
                {
         
     | 
| 67 | 
         
             
                  "epoch": 0.4224502112251056,
         
     | 
| 68 | 
         
            +
                  "grad_norm": 0.3773705065250397,
         
     | 
| 69 | 
         
            +
                  "learning_rate": 0.0002977526869022985,
         
     | 
| 70 | 
         
            +
                  "loss": 0.2029,
         
     | 
| 71 | 
         
            +
                  "mean_token_accuracy": 0.9433162885904313,
         
     | 
| 72 | 
         
            +
                  "num_tokens": 960853.0,
         
     | 
| 73 | 
         
             
                  "step": 175
         
     | 
| 74 | 
         
             
                },
         
     | 
| 75 | 
         
             
                {
         
     | 
| 76 | 
         
             
                  "epoch": 0.4828002414001207,
         
     | 
| 77 | 
         
            +
                  "grad_norm": 0.8292771577835083,
         
     | 
| 78 | 
         
            +
                  "learning_rate": 0.0002966306180728982,
         
     | 
| 79 | 
         
            +
                  "loss": 0.2274,
         
     | 
| 80 | 
         
            +
                  "mean_token_accuracy": 0.9385988712310791,
         
     | 
| 81 | 
         
            +
                  "num_tokens": 1077726.0,
         
     | 
| 82 | 
         
             
                  "step": 200
         
     | 
| 83 | 
         
             
                },
         
     | 
| 84 | 
         
             
                {
         
     | 
| 85 | 
         
             
                  "epoch": 0.5431502715751357,
         
     | 
| 86 | 
         
            +
                  "grad_norm": 0.4765889346599579,
         
     | 
| 87 | 
         
            +
                  "learning_rate": 0.0002952851440181598,
         
     | 
| 88 | 
         
            +
                  "loss": 0.19,
         
     | 
| 89 | 
         
            +
                  "mean_token_accuracy": 0.9479016721248626,
         
     | 
| 90 | 
         
            +
                  "num_tokens": 1232263.0,
         
     | 
| 91 | 
         
             
                  "step": 225
         
     | 
| 92 | 
         
             
                },
         
     | 
| 93 | 
         
             
                {
         
     | 
| 94 | 
         
             
                  "epoch": 0.6035003017501509,
         
     | 
| 95 | 
         
            +
                  "grad_norm": 0.9254749417304993,
         
     | 
| 96 | 
         
            +
                  "learning_rate": 0.0002937183146914856,
         
     | 
| 97 | 
         
            +
                  "loss": 0.1826,
         
     | 
| 98 | 
         
            +
                  "mean_token_accuracy": 0.9498224484920502,
         
     | 
| 99 | 
         
            +
                  "num_tokens": 1349057.0,
         
     | 
| 100 | 
         
             
                  "step": 250
         
     | 
| 101 | 
         
             
                },
         
     | 
| 102 | 
         
             
                {
         
     | 
| 103 | 
         
             
                  "epoch": 0.663850331925166,
         
     | 
| 104 | 
         
            +
                  "grad_norm": 0.4938018023967743,
         
     | 
| 105 | 
         
            +
                  "learning_rate": 0.000291932517301382,
         
     | 
| 106 | 
         
            +
                  "loss": 0.1497,
         
     | 
| 107 | 
         
            +
                  "mean_token_accuracy": 0.9588899296522141,
         
     | 
| 108 | 
         
            +
                  "num_tokens": 1496867.0,
         
     | 
| 109 | 
         
             
                  "step": 275
         
     | 
| 110 | 
         
             
                },
         
     | 
| 111 | 
         
             
                {
         
     | 
| 112 | 
         
             
                  "epoch": 0.724200362100181,
         
     | 
| 113 | 
         
            +
                  "grad_norm": 0.6995358467102051,
         
     | 
| 114 | 
         
            +
                  "learning_rate": 0.00028993047267432864,
         
     | 
| 115 | 
         
            +
                  "loss": 0.1578,
         
     | 
| 116 | 
         
            +
                  "mean_token_accuracy": 0.9568761509656906,
         
     | 
| 117 | 
         
            +
                  "num_tokens": 1610727.0,
         
     | 
| 118 | 
         
             
                  "step": 300
         
     | 
| 119 | 
         
             
                },
         
     | 
| 120 | 
         
             
                {
         
     | 
| 121 | 
         
             
                  "epoch": 0.7845503922751962,
         
     | 
| 122 | 
         
            +
                  "grad_norm": 0.46799567341804504,
         
     | 
| 123 | 
         
            +
                  "learning_rate": 0.0002877152311093483,
         
     | 
| 124 | 
         
            +
                  "loss": 0.1351,
         
     | 
| 125 | 
         
            +
                  "mean_token_accuracy": 0.9633717983961105,
         
     | 
| 126 | 
         
            +
                  "num_tokens": 1762041.0,
         
     | 
| 127 | 
         
             
                  "step": 325
         
     | 
| 128 | 
         
             
                },
         
     | 
| 129 | 
         
             
                {
         
     | 
| 130 | 
         
             
                  "epoch": 0.8449004224502112,
         
     | 
| 131 | 
         
            +
                  "grad_norm": 0.6729409098625183,
         
     | 
| 132 | 
         
            +
                  "learning_rate": 0.00028529016773059656,
         
     | 
| 133 | 
         
            +
                  "loss": 0.1206,
         
     | 
| 134 | 
         
            +
                  "mean_token_accuracy": 0.9687577307224273,
         
     | 
| 135 | 
         
            +
                  "num_tokens": 1877965.0,
         
     | 
| 136 | 
         
             
                  "step": 350
         
     | 
| 137 | 
         
             
                },
         
     | 
| 138 | 
         
             
                {
         
     | 
| 139 | 
         
             
                  "epoch": 0.9052504526252263,
         
     | 
| 140 | 
         
            +
                  "grad_norm": 0.5820412635803223,
         
     | 
| 141 | 
         
            +
                  "learning_rate": 0.00028265897734504976,
         
     | 
| 142 | 
         
            +
                  "loss": 0.1183,
         
     | 
| 143 | 
         
            +
                  "mean_token_accuracy": 0.96822787463665,
         
     | 
| 144 | 
         
            +
                  "num_tokens": 2028343.0,
         
     | 
| 145 | 
         
             
                  "step": 375
         
     | 
| 146 | 
         
             
                },
         
     | 
| 147 | 
         
             
                {
         
     | 
| 148 | 
         
             
                  "epoch": 0.9656004828002414,
         
     | 
| 149 | 
         
            +
                  "grad_norm": 0.8604497909545898,
         
     | 
| 150 | 
         
            +
                  "learning_rate": 0.0002798256688131267,
         
     | 
| 151 | 
         
            +
                  "loss": 0.1159,
         
     | 
| 152 | 
         
            +
                  "mean_token_accuracy": 0.9700725018978119,
         
     | 
| 153 | 
         
            +
                  "num_tokens": 2145044.0,
         
     | 
| 154 | 
         
             
                  "step": 400
         
     | 
| 155 | 
         
             
                },
         
     | 
| 156 | 
         
             
                {
         
     | 
| 157 | 
         
             
                  "epoch": 1.0,
         
     | 
| 158 | 
         
            +
                  "eval_loss": 0.1169130727648735,
         
     | 
| 159 | 
         
            +
                  "eval_mean_token_accuracy": 0.9691641559471955,
         
     | 
| 160 | 
         
             
                  "eval_num_tokens": 2223513.0,
         
     | 
| 161 | 
         
            +
                  "eval_runtime": 60.5832,
         
     | 
| 162 | 
         
            +
                  "eval_samples_per_second": 6.091,
         
     | 
| 163 | 
         
            +
                  "eval_steps_per_second": 3.054,
         
     | 
| 164 | 
         
             
                  "step": 415
         
     | 
| 165 | 
         
             
                },
         
     | 
| 166 | 
         
             
                {
         
     | 
| 167 | 
         
             
                  "epoch": 1.024140012070006,
         
     | 
| 168 | 
         
            +
                  "grad_norm": 0.20096616446971893,
         
     | 
| 169 | 
         
            +
                  "learning_rate": 0.0002767945589408217,
         
     | 
| 170 | 
         
            +
                  "loss": 0.122,
         
     | 
| 171 | 
         
            +
                  "mean_token_accuracy": 0.9680000224064306,
         
     | 
| 172 | 
         
            +
                  "num_tokens": 2291746.0,
         
     | 
| 173 | 
         
             
                  "step": 425
         
     | 
| 174 | 
         
             
                },
         
     | 
| 175 | 
         
             
                {
         
     | 
| 176 | 
         
             
                  "epoch": 1.0844900422450212,
         
     | 
| 177 | 
         
            +
                  "grad_norm": 0.34665247797966003,
         
     | 
| 178 | 
         
            +
                  "learning_rate": 0.0002735702659026533,
         
     | 
| 179 | 
         
            +
                  "loss": 0.0836,
         
     | 
| 180 | 
         
            +
                  "mean_token_accuracy": 0.9780776232481003,
         
     | 
| 181 | 
         
            +
                  "num_tokens": 2424528.0,
         
     | 
| 182 | 
         
             
                  "step": 450
         
     | 
| 183 | 
         
             
                },
         
     | 
| 184 | 
         
             
                {
         
     | 
| 185 | 
         
             
                  "epoch": 1.1448400724200363,
         
     | 
| 186 | 
         
            +
                  "grad_norm": 0.30349963903427124,
         
     | 
| 187 | 
         
            +
                  "learning_rate": 0.0002701577022054515,
         
     | 
| 188 | 
         
            +
                  "loss": 0.1019,
         
     | 
| 189 | 
         
            +
                  "mean_token_accuracy": 0.9732917118072509,
         
     | 
| 190 | 
         
            +
                  "num_tokens": 2557091.0,
         
     | 
| 191 | 
         
             
                  "step": 475
         
     | 
| 192 | 
         
             
                },
         
     | 
| 193 | 
         
             
                {
         
     | 
| 194 | 
         
             
                  "epoch": 1.2051901025950513,
         
     | 
| 195 | 
         
            +
                  "grad_norm": 0.3892677426338196,
         
     | 
| 196 | 
         
            +
                  "learning_rate": 0.0002665620672037014,
         
     | 
| 197 | 
         
            +
                  "loss": 0.0831,
         
     | 
| 198 | 
         
            +
                  "mean_token_accuracy": 0.9782004028558731,
         
     | 
| 199 | 
         
            +
                  "num_tokens": 2691527.0,
         
     | 
| 200 | 
         
             
                  "step": 500
         
     | 
| 201 | 
         
             
                },
         
     | 
| 202 | 
         
             
                {
         
     | 
| 203 | 
         
             
                  "epoch": 1.2655401327700664,
         
     | 
| 204 | 
         
            +
                  "grad_norm": 0.29889699816703796,
         
     | 
| 205 | 
         
            +
                  "learning_rate": 0.0002627888391778493,
         
     | 
| 206 | 
         
            +
                  "loss": 0.1023,
         
     | 
| 207 | 
         
            +
                  "mean_token_accuracy": 0.9729781967401504,
         
     | 
| 208 | 
         
            +
                  "num_tokens": 2824699.0,
         
     | 
| 209 | 
         
             
                  "step": 525
         
     | 
| 210 | 
         
             
                },
         
     | 
| 211 | 
         
             
                {
         
     | 
| 212 | 
         
             
                  "epoch": 1.3258901629450814,
         
     | 
| 213 | 
         
            +
                  "grad_norm": 0.393573522567749,
         
     | 
| 214 | 
         
            +
                  "learning_rate": 0.0002588437669876384,
         
     | 
| 215 | 
         
            +
                  "loss": 0.0779,
         
     | 
| 216 | 
         
            +
                  "mean_token_accuracy": 0.9795191860198975,
         
     | 
| 217 | 
         
            +
                  "num_tokens": 2958826.0,
         
     | 
| 218 | 
         
             
                  "step": 550
         
     | 
| 219 | 
         
             
                },
         
     | 
| 220 | 
         
             
                {
         
     | 
| 221 | 
         
             
                  "epoch": 1.3862401931200965,
         
     | 
| 222 | 
         
            +
                  "grad_norm": 0.26299118995666504,
         
     | 
| 223 | 
         
            +
                  "learning_rate": 0.00025473286131319283,
         
     | 
| 224 | 
         
            +
                  "loss": 0.0988,
         
     | 
| 225 | 
         
            +
                  "mean_token_accuracy": 0.9739746767282486,
         
     | 
| 226 | 
         
            +
                  "num_tokens": 3092320.0,
         
     | 
| 227 | 
         
             
                  "step": 575
         
     | 
| 228 | 
         
             
                },
         
     | 
| 229 | 
         
             
                {
         
     | 
| 230 | 
         
             
                  "epoch": 1.4465902232951118,
         
     | 
| 231 | 
         
            +
                  "grad_norm": 0.3649594783782959,
         
     | 
| 232 | 
         
            +
                  "learning_rate": 0.0002504623854971937,
         
     | 
| 233 | 
         
            +
                  "loss": 0.0729,
         
     | 
| 234 | 
         
            +
                  "mean_token_accuracy": 0.9814109367132187,
         
     | 
| 235 | 
         
            +
                  "num_tokens": 3227452.0,
         
     | 
| 236 | 
         
             
                  "step": 600
         
     | 
| 237 | 
         
             
                },
         
     | 
| 238 | 
         
             
                {
         
     | 
| 239 | 
         
             
                  "epoch": 1.5069402534701268,
         
     | 
| 240 | 
         
            +
                  "grad_norm": 0.28632357716560364,
         
     | 
| 241 | 
         
            +
                  "learning_rate": 0.00024603884600210097,
         
     | 
| 242 | 
         
            +
                  "loss": 0.0957,
         
     | 
| 243 | 
         
            +
                  "mean_token_accuracy": 0.9748889011144638,
         
     | 
| 244 | 
         
            +
                  "num_tokens": 3361210.0,
         
     | 
| 245 | 
         
             
                  "step": 625
         
     | 
| 246 | 
         
             
                },
         
     | 
| 247 | 
         
             
                {
         
     | 
| 248 | 
         
             
                  "epoch": 1.567290283645142,
         
     | 
| 249 | 
         
            +
                  "grad_norm": 0.25492990016937256,
         
     | 
| 250 | 
         
            +
                  "learning_rate": 0.00024146898249695974,
         
     | 
| 251 | 
         
            +
                  "loss": 0.075,
         
     | 
| 252 | 
         
            +
                  "mean_token_accuracy": 0.9806595808267593,
         
     | 
| 253 | 
         
            +
                  "num_tokens": 3497177.0,
         
     | 
| 254 | 
         
             
                  "step": 650
         
     | 
| 255 | 
         
             
                },
         
     | 
| 256 | 
         
             
                {
         
     | 
| 257 | 
         
             
                  "epoch": 1.627640313820157,
         
     | 
| 258 | 
         
            +
                  "grad_norm": 0.37043872475624084,
         
     | 
| 259 | 
         
            +
                  "learning_rate": 0.00023675975758889506,
         
     | 
| 260 | 
         
            +
                  "loss": 0.0918,
         
     | 
| 261 | 
         
            +
                  "mean_token_accuracy": 0.9762868732213974,
         
     | 
| 262 | 
         
            +
                  "num_tokens": 3630834.0,
         
     | 
| 263 | 
         
             
                  "step": 675
         
     | 
| 264 | 
         
             
                },
         
     | 
| 265 | 
         
             
                {
         
     | 
| 266 | 
         
             
                  "epoch": 1.687990343995172,
         
     | 
| 267 | 
         
            +
                  "grad_norm": 0.26372411847114563,
         
     | 
| 268 | 
         
            +
                  "learning_rate": 0.00023191834621493968,
         
     | 
| 269 | 
         
            +
                  "loss": 0.0674,
         
     | 
| 270 | 
         
            +
                  "mean_token_accuracy": 0.9826526433229447,
         
     | 
| 271 | 
         
            +
                  "num_tokens": 3766598.0,
         
     | 
| 272 | 
         
             
                  "step": 700
         
     | 
| 273 | 
         
             
                },
         
     | 
| 274 | 
         
             
                {
         
     | 
| 275 | 
         
             
                  "epoch": 1.748340374170187,
         
     | 
| 276 | 
         
            +
                  "grad_norm": 0.2400335669517517,
         
     | 
| 277 | 
         
            +
                  "learning_rate": 0.00022695212471035816,
         
     | 
| 278 | 
         
            +
                  "loss": 0.0807,
         
     | 
| 279 | 
         
            +
                  "mean_token_accuracy": 0.9793906199932099,
         
     | 
| 280 | 
         
            +
                  "num_tokens": 3899644.0,
         
     | 
| 281 | 
         
             
                  "step": 725
         
     | 
| 282 | 
         
             
                },
         
     | 
| 283 | 
         
             
                {
         
     | 
| 284 | 
         
             
                  "epoch": 1.8086904043452021,
         
     | 
| 285 | 
         
            +
                  "grad_norm": 0.19833268225193024,
         
     | 
| 286 | 
         
            +
                  "learning_rate": 0.0002218686595701219,
         
     | 
| 287 | 
         
            +
                  "loss": 0.0655,
         
     | 
| 288 | 
         
            +
                  "mean_token_accuracy": 0.9832920217514038,
         
     | 
| 289 | 
         
            +
                  "num_tokens": 4036037.0,
         
     | 
| 290 | 
         
             
                  "step": 750
         
     | 
| 291 | 
         
             
                },
         
     | 
| 292 | 
         
             
                {
         
     | 
| 293 | 
         
             
                  "epoch": 1.8690404345202172,
         
     | 
| 294 | 
         
            +
                  "grad_norm": 0.17969554662704468,
         
     | 
| 295 | 
         
            +
                  "learning_rate": 0.0002166756959206587,
         
     | 
| 296 | 
         
            +
                  "loss": 0.0831,
         
     | 
| 297 | 
         
            +
                  "mean_token_accuracy": 0.9791438663005829,
         
     | 
| 298 | 
         
            +
                  "num_tokens": 4168035.0,
         
     | 
| 299 | 
         
             
                  "step": 775
         
     | 
| 300 | 
         
             
                },
         
     | 
| 301 | 
         
             
                {
         
     | 
| 302 | 
         
             
                  "epoch": 1.9293904646952322,
         
     | 
| 303 | 
         
            +
                  "grad_norm": 0.3069966733455658,
         
     | 
| 304 | 
         
            +
                  "learning_rate": 0.00021138114571944054,
         
     | 
| 305 | 
         
            +
                  "loss": 0.0624,
         
     | 
| 306 | 
         
            +
                  "mean_token_accuracy": 0.9839604765176773,
         
     | 
| 307 | 
         
            +
                  "num_tokens": 4302324.0,
         
     | 
| 308 | 
         
             
                  "step": 800
         
     | 
| 309 | 
         
             
                },
         
     | 
| 310 | 
         
             
                {
         
     | 
| 311 | 
         
             
                  "epoch": 1.9897404948702473,
         
     | 
| 312 | 
         
            +
                  "grad_norm": 0.26080530881881714,
         
     | 
| 313 | 
         
            +
                  "learning_rate": 0.000205993075700389,
         
     | 
| 314 | 
         
            +
                  "loss": 0.0728,
         
     | 
| 315 | 
         
            +
                  "mean_token_accuracy": 0.9816776049137116,
         
     | 
| 316 | 
         
            +
                  "num_tokens": 4428521.0,
         
     | 
| 317 | 
         
             
                  "step": 825
         
     | 
| 318 | 
         
             
                },
         
     | 
| 319 | 
         
             
                {
         
     | 
| 320 | 
         
             
                  "epoch": 2.0,
         
     | 
| 321 | 
         
            +
                  "eval_loss": 0.07739538699388504,
         
     | 
| 322 | 
         
            +
                  "eval_mean_token_accuracy": 0.9806474750106399,
         
     | 
| 323 | 
         
             
                  "eval_num_tokens": 4447026.0,
         
     | 
| 324 | 
         
            +
                  "eval_runtime": 60.6735,
         
     | 
| 325 | 
         
            +
                  "eval_samples_per_second": 6.082,
         
     | 
| 326 | 
         
            +
                  "eval_steps_per_second": 3.049,
         
     | 
| 327 | 
         
             
                  "step": 830
         
     | 
| 328 | 
         
             
                },
         
     | 
| 329 | 
         
             
                {
         
     | 
| 330 | 
         
             
                  "epoch": 2.048280024140012,
         
     | 
| 331 | 
         
            +
                  "grad_norm": 0.32912909984588623,
         
     | 
| 332 | 
         
            +
                  "learning_rate": 0.00020051969508346498,
         
     | 
| 333 | 
         
            +
                  "loss": 0.0624,
         
     | 
| 334 | 
         
            +
                  "mean_token_accuracy": 0.98369190680612,
         
     | 
| 335 | 
         
            +
                  "num_tokens": 4571335.0,
         
     | 
| 336 | 
         
             
                  "step": 850
         
     | 
| 337 | 
         
             
                },
         
     | 
| 338 | 
         
             
                {
         
     | 
| 339 | 
         
             
                  "epoch": 2.1086300543150274,
         
     | 
| 340 | 
         
            +
                  "grad_norm": 0.22884123027324677,
         
     | 
| 341 | 
         
            +
                  "learning_rate": 0.00019496934306716706,
         
     | 
| 342 | 
         
            +
                  "loss": 0.0543,
         
     | 
| 343 | 
         
            +
                  "mean_token_accuracy": 0.9862597143650055,
         
     | 
| 344 | 
         
            +
                  "num_tokens": 4694373.0,
         
     | 
| 345 | 
         
             
                  "step": 875
         
     | 
| 346 | 
         
             
                },
         
     | 
| 347 | 
         
             
                {
         
     | 
| 348 | 
         
             
                  "epoch": 2.1689800844900424,
         
     | 
| 349 | 
         
            +
                  "grad_norm": 0.15646718442440033,
         
     | 
| 350 | 
         
            +
                  "learning_rate": 0.00018935047612299625,
         
     | 
| 351 | 
         
            +
                  "loss": 0.0683,
         
     | 
| 352 | 
         
            +
                  "mean_token_accuracy": 0.9817469125986099,
         
     | 
| 353 | 
         
            +
                  "num_tokens": 4840032.0,
         
     | 
| 354 | 
         
             
                  "step": 900
         
     | 
| 355 | 
         
             
                },
         
     | 
| 356 | 
         
             
                {
         
     | 
| 357 | 
         
             
                  "epoch": 2.2293301146650575,
         
     | 
| 358 | 
         
            +
                  "grad_norm": 0.32684165239334106,
         
     | 
| 359 | 
         
            +
                  "learning_rate": 0.00018367165511124414,
         
     | 
| 360 | 
         
            +
                  "loss": 0.0558,
         
     | 
| 361 | 
         
            +
                  "mean_token_accuracy": 0.9862085193395614,
         
     | 
| 362 | 
         
            +
                  "num_tokens": 4962900.0,
         
     | 
| 363 | 
         
             
                  "step": 925
         
     | 
| 364 | 
         
             
                },
         
     | 
| 365 | 
         
             
                {
         
     | 
| 366 | 
         
             
                  "epoch": 2.2896801448400725,
         
     | 
| 367 | 
         
            +
                  "grad_norm": 0.15353620052337646,
         
     | 
| 368 | 
         
            +
                  "learning_rate": 0.00017794153223773558,
         
     | 
| 369 | 
         
            +
                  "loss": 0.0649,
         
     | 
| 370 | 
         
            +
                  "mean_token_accuracy": 0.9830775827169418,
         
     | 
| 371 | 
         
            +
                  "num_tokens": 5107775.0,
         
     | 
| 372 | 
         
             
                  "step": 950
         
     | 
| 373 | 
         
             
                },
         
     | 
| 374 | 
         
             
                {
         
     | 
| 375 | 
         
             
                  "epoch": 2.3500301750150876,
         
     | 
| 376 | 
         
            +
                  "grad_norm": 0.13864906132221222,
         
     | 
| 377 | 
         
            +
                  "learning_rate": 0.00017216883787139772,
         
     | 
| 378 | 
         
            +
                  "loss": 0.0513,
         
     | 
| 379 | 
         
            +
                  "mean_token_accuracy": 0.9871918082237243,
         
     | 
| 380 | 
         
            +
                  "num_tokens": 5231159.0,
         
     | 
| 381 | 
         
             
                  "step": 975
         
     | 
| 382 | 
         
             
                },
         
     | 
| 383 | 
         
             
                {
         
     | 
| 384 | 
         
             
                  "epoch": 2.4103802051901027,
         
     | 
| 385 | 
         
            +
                  "grad_norm": 0.18856066465377808,
         
     | 
| 386 | 
         
            +
                  "learning_rate": 0.00016636236724274,
         
     | 
| 387 | 
         
            +
                  "loss": 0.0653,
         
     | 
| 388 | 
         
            +
                  "mean_token_accuracy": 0.9824860644340515,
         
     | 
| 389 | 
         
            +
                  "num_tokens": 5375658.0,
         
     | 
| 390 | 
         
             
                  "step": 1000
         
     | 
| 391 | 
         
             
                },
         
     | 
| 392 | 
         
             
                {
         
     | 
| 393 | 
         
             
                  "epoch": 2.4707302353651177,
         
     | 
| 394 | 
         
            +
                  "grad_norm": 0.1747666597366333,
         
     | 
| 395 | 
         
            +
                  "learning_rate": 0.00016053096704351255,
         
     | 
| 396 | 
         
            +
                  "loss": 0.0536,
         
     | 
| 397 | 
         
            +
                  "mean_token_accuracy": 0.9870379114151001,
         
     | 
| 398 | 
         
            +
                  "num_tokens": 5498792.0,
         
     | 
| 399 | 
         
             
                  "step": 1025
         
     | 
| 400 | 
         
             
                },
         
     | 
| 401 | 
         
             
                {
         
     | 
| 402 | 
         
             
                  "epoch": 2.5310802655401328,
         
     | 
| 403 | 
         
            +
                  "grad_norm": 0.08616527169942856,
         
     | 
| 404 | 
         
            +
                  "learning_rate": 0.00015468352194795791,
         
     | 
| 405 | 
         
            +
                  "loss": 0.0605,
         
     | 
| 406 | 
         
            +
                  "mean_token_accuracy": 0.9837486296892166,
         
     | 
| 407 | 
         
            +
                  "num_tokens": 5644155.0,
         
     | 
| 408 | 
         
             
                  "step": 1050
         
     | 
| 409 | 
         
             
                },
         
     | 
| 410 | 
         
             
                {
         
     | 
| 411 | 
         
             
                  "epoch": 2.591430295715148,
         
     | 
| 412 | 
         
            +
                  "grad_norm": 0.21047131717205048,
         
     | 
| 413 | 
         
            +
                  "learning_rate": 0.00014882894107619277,
         
     | 
| 414 | 
         
            +
                  "loss": 0.0502,
         
     | 
| 415 | 
         
            +
                  "mean_token_accuracy": 0.9874639976024627,
         
     | 
| 416 | 
         
            +
                  "num_tokens": 5768255.0,
         
     | 
| 417 | 
         
             
                  "step": 1075
         
     | 
| 418 | 
         
             
                },
         
     | 
| 419 | 
         
             
                {
         
     | 
| 420 | 
         
             
                  "epoch": 2.651780325890163,
         
     | 
| 421 | 
         
            +
                  "grad_norm": 0.09520892798900604,
         
     | 
| 422 | 
         
            +
                  "learning_rate": 0.00014297614442034518,
         
     | 
| 423 | 
         
            +
                  "loss": 0.0568,
         
     | 
| 424 | 
         
            +
                  "mean_token_accuracy": 0.9851021945476532,
         
     | 
| 425 | 
         
            +
                  "num_tokens": 5913228.0,
         
     | 
| 426 | 
         
             
                  "step": 1100
         
     | 
| 427 | 
         
             
                },
         
     | 
| 428 | 
         
             
                {
         
     | 
| 429 | 
         
             
                  "epoch": 2.712130356065178,
         
     | 
| 430 | 
         
            +
                  "grad_norm": 0.11644323915243149,
         
     | 
| 431 | 
         
            +
                  "learning_rate": 0.000137134049254126,
         
     | 
| 432 | 
         
            +
                  "loss": 0.0523,
         
     | 
| 433 | 
         
            +
                  "mean_token_accuracy": 0.9867914581298828,
         
     | 
| 434 | 
         
            +
                  "num_tokens": 6037285.0,
         
     | 
| 435 | 
         
             
                  "step": 1125
         
     | 
| 436 | 
         
             
                },
         
     | 
| 437 | 
         
             
                {
         
     | 
| 438 | 
         
             
                  "epoch": 2.772480386240193,
         
     | 
| 439 | 
         
            +
                  "grad_norm": 0.12872624397277832,
         
     | 
| 440 | 
         
            +
                  "learning_rate": 0.000131311556546543,
         
     | 
| 441 | 
         
            +
                  "loss": 0.0563,
         
     | 
| 442 | 
         
            +
                  "mean_token_accuracy": 0.9849929654598236,
         
     | 
| 443 | 
         
            +
                  "num_tokens": 6183361.0,
         
     | 
| 444 | 
         
             
                  "step": 1150
         
     | 
| 445 | 
         
             
                },
         
     | 
| 446 | 
         
             
                {
         
     | 
| 447 | 
         
             
                  "epoch": 2.832830416415208,
         
     | 
| 448 | 
         
            +
                  "grad_norm": 0.10195529460906982,
         
     | 
| 449 | 
         
            +
                  "learning_rate": 0.0001255175374004563,
         
     | 
| 450 | 
         
            +
                  "loss": 0.0501,
         
     | 
| 451 | 
         
            +
                  "mean_token_accuracy": 0.9871714848279953,
         
     | 
| 452 | 
         
            +
                  "num_tokens": 6305713.0,
         
     | 
| 453 | 
         
             
                  "step": 1175
         
     | 
| 454 | 
         
             
                },
         
     | 
| 455 | 
         
             
                {
         
     | 
| 456 | 
         
             
                  "epoch": 2.8931804465902236,
         
     | 
| 457 | 
         
            +
                  "grad_norm": 0.09452041983604431,
         
     | 
| 458 | 
         
            +
                  "learning_rate": 0.0001197608195366377,
         
     | 
| 459 | 
         
            +
                  "loss": 0.0581,
         
     | 
| 460 | 
         
            +
                  "mean_token_accuracy": 0.9840293884277344,
         
     | 
| 461 | 
         
            +
                  "num_tokens": 6451719.0,
         
     | 
| 462 | 
         
             
                  "step": 1200
         
     | 
| 463 | 
         
             
                },
         
     | 
| 464 | 
         
             
                {
         
     | 
| 465 | 
         
             
                  "epoch": 2.9535304767652386,
         
     | 
| 466 | 
         
            +
                  "grad_norm": 0.17165224254131317,
         
     | 
| 467 | 
         
            +
                  "learning_rate": 0.00011405017384392655,
         
     | 
| 468 | 
         
            +
                  "loss": 0.049,
         
     | 
| 469 | 
         
            +
                  "mean_token_accuracy": 0.9875269651412963,
         
     | 
| 470 | 
         
            +
                  "num_tokens": 6575211.0,
         
     | 
| 471 | 
         
             
                  "step": 1225
         
     | 
| 472 | 
         
             
                },
         
     | 
| 473 | 
         
             
                {
         
     | 
| 474 | 
         
             
                  "epoch": 3.0,
         
     | 
| 475 | 
         
            +
                  "eval_loss": 0.06446010619401932,
         
     | 
| 476 | 
         
            +
                  "eval_mean_token_accuracy": 0.9841987928828677,
         
     | 
| 477 | 
         
             
                  "eval_num_tokens": 6670539.0,
         
     | 
| 478 | 
         
            +
                  "eval_runtime": 60.4296,
         
     | 
| 479 | 
         
            +
                  "eval_samples_per_second": 6.106,
         
     | 
| 480 | 
         
            +
                  "eval_steps_per_second": 3.061,
         
     | 
| 481 | 
         
             
                  "step": 1245
         
     | 
| 482 | 
         
             
                }
         
     | 
| 483 | 
         
             
              ],
         
     | 
| 484 | 
         
             
              "logging_steps": 25,
         
     | 
| 485 | 
         
            +
              "max_steps": 2075,
         
     | 
| 486 | 
         
             
              "num_input_tokens_seen": 0,
         
     | 
| 487 | 
         
            +
              "num_train_epochs": 5,
         
     | 
| 488 | 
         
             
              "save_steps": 500,
         
     | 
| 489 | 
         
             
              "stateful_callbacks": {
         
     | 
| 490 | 
         
             
                "TrainerControl": {
         
     | 
| 
         | 
|
| 498 | 
         
             
                  "attributes": {}
         
     | 
| 499 | 
         
             
                }
         
     | 
| 500 | 
         
             
              },
         
     | 
| 501 | 
         
            +
              "total_flos": 2.8849238785951334e+17,
         
     | 
| 502 | 
         
             
              "train_batch_size": 2,
         
     | 
| 503 | 
         
             
              "trial_name": null,
         
     | 
| 504 | 
         
             
              "trial_params": null
         
     | 
    	
        checkpoint-1245/training_args.bin
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 6033
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:51ece4ed6b1462de05ca804e04b783f884883c31cae5c545b5f19f6192d34a62
         
     | 
| 3 | 
         
             
            size 6033
         
     | 
    	
        checkpoint-1660/adapter_config.json
    CHANGED
    
    | 
         @@ -25,12 +25,12 @@ 
     | 
|
| 25 | 
         
             
              "revision": null,
         
     | 
| 26 | 
         
             
              "target_modules": [
         
     | 
| 27 | 
         
             
                "gate_proj",
         
     | 
| 28 | 
         
            -
                " 
     | 
| 
         | 
|
| 29 | 
         
             
                "down_proj",
         
     | 
| 30 | 
         
             
                "o_proj",
         
     | 
| 31 | 
         
            -
                " 
     | 
| 32 | 
         
            -
                "up_proj" 
     | 
| 33 | 
         
            -
                "q_proj"
         
     | 
| 34 | 
         
             
              ],
         
     | 
| 35 | 
         
             
              "task_type": "CAUSAL_LM",
         
     | 
| 36 | 
         
             
              "trainable_token_indices": null,
         
     | 
| 
         | 
|
| 25 | 
         
             
              "revision": null,
         
     | 
| 26 | 
         
             
              "target_modules": [
         
     | 
| 27 | 
         
             
                "gate_proj",
         
     | 
| 28 | 
         
            +
                "v_proj",
         
     | 
| 29 | 
         
            +
                "q_proj",
         
     | 
| 30 | 
         
             
                "down_proj",
         
     | 
| 31 | 
         
             
                "o_proj",
         
     | 
| 32 | 
         
            +
                "k_proj",
         
     | 
| 33 | 
         
            +
                "up_proj"
         
     | 
| 
         | 
|
| 34 | 
         
             
              ],
         
     | 
| 35 | 
         
             
              "task_type": "CAUSAL_LM",
         
     | 
| 36 | 
         
             
              "trainable_token_indices": null,
         
     | 
    	
        checkpoint-1660/adapter_model.safetensors
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 335604696
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:6f5f062a3b1d396cab72a4843f57977bef358f35b36958af4d5522f5966474f7
         
     | 
| 3 | 
         
             
            size 335604696
         
     | 
    	
        checkpoint-1660/optimizer.pt
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 671365003
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:a9b5f237d988a6fa0489e53b086f6bce4c9b8cdc57857f9076af9bc39586d69a
         
     | 
| 3 | 
         
             
            size 671365003
         
     | 
    	
        checkpoint-1660/rng_state.pth
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 14645
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:24c257370cee13a6968997868cebfac93221271dce2af51696e836773fa63c52
         
     | 
| 3 | 
         
             
            size 14645
         
     | 
    	
        checkpoint-1660/scheduler.pt
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 1465
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:db489836d67eb6a2f9cf352ac12b33db8531786c612f502c376b4d1f7cd034a4
         
     | 
| 3 | 
         
             
            size 1465
         
     | 
    	
        checkpoint-1660/trainer_state.json
    CHANGED
    
    | 
         @@ -11,643 +11,643 @@ 
     | 
|
| 11 | 
         
             
              "log_history": [
         
     | 
| 12 | 
         
             
                {
         
     | 
| 13 | 
         
             
                  "epoch": 0.060350030175015085,
         
     | 
| 14 | 
         
            -
                  "grad_norm": 0. 
     | 
| 15 | 
         
            -
                  "learning_rate":  
     | 
| 16 | 
         
            -
                  "loss": 1. 
     | 
| 17 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 18 | 
         
            -
                  "num_tokens":  
     | 
| 19 | 
         
             
                  "step": 25
         
     | 
| 20 | 
         
             
                },
         
     | 
| 21 | 
         
             
                {
         
     | 
| 22 | 
         
             
                  "epoch": 0.12070006035003017,
         
     | 
| 23 | 
         
            -
                  "grad_norm": 0. 
     | 
| 24 | 
         
            -
                  "learning_rate": 0. 
     | 
| 25 | 
         
            -
                  "loss": 0. 
     | 
| 26 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 27 | 
         
            -
                  "num_tokens":  
     | 
| 28 | 
         
             
                  "step": 50
         
     | 
| 29 | 
         
             
                },
         
     | 
| 30 | 
         
             
                {
         
     | 
| 31 | 
         
             
                  "epoch": 0.18105009052504525,
         
     | 
| 32 | 
         
            -
                  "grad_norm": 0. 
     | 
| 33 | 
         
            -
                  "learning_rate": 0. 
     | 
| 34 | 
         
            -
                  "loss": 0. 
     | 
| 35 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 36 | 
         
            -
                  "num_tokens":  
     | 
| 37 | 
         
             
                  "step": 75
         
     | 
| 38 | 
         
             
                },
         
     | 
| 39 | 
         
             
                {
         
     | 
| 40 | 
         
             
                  "epoch": 0.24140012070006034,
         
     | 
| 41 | 
         
            -
                  "grad_norm": 0. 
     | 
| 42 | 
         
            -
                  "learning_rate": 0. 
     | 
| 43 | 
         
            -
                  "loss": 0. 
     | 
| 44 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 45 | 
         
            -
                  "num_tokens":  
     | 
| 46 | 
         
             
                  "step": 100
         
     | 
| 47 | 
         
             
                },
         
     | 
| 48 | 
         
             
                {
         
     | 
| 49 | 
         
             
                  "epoch": 0.30175015087507545,
         
     | 
| 50 | 
         
            -
                  "grad_norm": 0. 
     | 
| 51 | 
         
            -
                  "learning_rate": 0. 
     | 
| 52 | 
         
            -
                  "loss": 0. 
     | 
| 53 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 54 | 
         
            -
                  "num_tokens":  
     | 
| 55 | 
         
             
                  "step": 125
         
     | 
| 56 | 
         
             
                },
         
     | 
| 57 | 
         
             
                {
         
     | 
| 58 | 
         
             
                  "epoch": 0.3621001810500905,
         
     | 
| 59 | 
         
            -
                  "grad_norm":  
     | 
| 60 | 
         
            -
                  "learning_rate": 0. 
     | 
| 61 | 
         
            -
                  "loss": 0. 
     | 
| 62 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 63 | 
         
            -
                  "num_tokens":  
     | 
| 64 | 
         
             
                  "step": 150
         
     | 
| 65 | 
         
             
                },
         
     | 
| 66 | 
         
             
                {
         
     | 
| 67 | 
         
             
                  "epoch": 0.4224502112251056,
         
     | 
| 68 | 
         
            -
                  "grad_norm": 0. 
     | 
| 69 | 
         
            -
                  "learning_rate": 0. 
     | 
| 70 | 
         
            -
                  "loss": 0. 
     | 
| 71 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 72 | 
         
            -
                  "num_tokens":  
     | 
| 73 | 
         
             
                  "step": 175
         
     | 
| 74 | 
         
             
                },
         
     | 
| 75 | 
         
             
                {
         
     | 
| 76 | 
         
             
                  "epoch": 0.4828002414001207,
         
     | 
| 77 | 
         
            -
                  "grad_norm": 0. 
     | 
| 78 | 
         
            -
                  "learning_rate": 0. 
     | 
| 79 | 
         
            -
                  "loss": 0. 
     | 
| 80 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 81 | 
         
            -
                  "num_tokens":  
     | 
| 82 | 
         
             
                  "step": 200
         
     | 
| 83 | 
         
             
                },
         
     | 
| 84 | 
         
             
                {
         
     | 
| 85 | 
         
             
                  "epoch": 0.5431502715751357,
         
     | 
| 86 | 
         
            -
                  "grad_norm": 0. 
     | 
| 87 | 
         
            -
                  "learning_rate": 0. 
     | 
| 88 | 
         
            -
                  "loss": 0. 
     | 
| 89 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 90 | 
         
            -
                  "num_tokens":  
     | 
| 91 | 
         
             
                  "step": 225
         
     | 
| 92 | 
         
             
                },
         
     | 
| 93 | 
         
             
                {
         
     | 
| 94 | 
         
             
                  "epoch": 0.6035003017501509,
         
     | 
| 95 | 
         
            -
                  "grad_norm": 0. 
     | 
| 96 | 
         
            -
                  "learning_rate": 0. 
     | 
| 97 | 
         
            -
                  "loss": 0. 
     | 
| 98 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 99 | 
         
            -
                  "num_tokens":  
     | 
| 100 | 
         
             
                  "step": 250
         
     | 
| 101 | 
         
             
                },
         
     | 
| 102 | 
         
             
                {
         
     | 
| 103 | 
         
             
                  "epoch": 0.663850331925166,
         
     | 
| 104 | 
         
            -
                  "grad_norm": 0. 
     | 
| 105 | 
         
            -
                  "learning_rate": 0. 
     | 
| 106 | 
         
            -
                  "loss": 0. 
     | 
| 107 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 108 | 
         
            -
                  "num_tokens":  
     | 
| 109 | 
         
             
                  "step": 275
         
     | 
| 110 | 
         
             
                },
         
     | 
| 111 | 
         
             
                {
         
     | 
| 112 | 
         
             
                  "epoch": 0.724200362100181,
         
     | 
| 113 | 
         
            -
                  "grad_norm": 0. 
     | 
| 114 | 
         
            -
                  "learning_rate": 0. 
     | 
| 115 | 
         
            -
                  "loss": 0. 
     | 
| 116 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 117 | 
         
            -
                  "num_tokens":  
     | 
| 118 | 
         
             
                  "step": 300
         
     | 
| 119 | 
         
             
                },
         
     | 
| 120 | 
         
             
                {
         
     | 
| 121 | 
         
             
                  "epoch": 0.7845503922751962,
         
     | 
| 122 | 
         
            -
                  "grad_norm": 0. 
     | 
| 123 | 
         
            -
                  "learning_rate": 0. 
     | 
| 124 | 
         
            -
                  "loss": 0. 
     | 
| 125 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 126 | 
         
            -
                  "num_tokens":  
     | 
| 127 | 
         
             
                  "step": 325
         
     | 
| 128 | 
         
             
                },
         
     | 
| 129 | 
         
             
                {
         
     | 
| 130 | 
         
             
                  "epoch": 0.8449004224502112,
         
     | 
| 131 | 
         
            -
                  "grad_norm": 0. 
     | 
| 132 | 
         
            -
                  "learning_rate": 0. 
     | 
| 133 | 
         
            -
                  "loss": 0. 
     | 
| 134 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 135 | 
         
            -
                  "num_tokens":  
     | 
| 136 | 
         
             
                  "step": 350
         
     | 
| 137 | 
         
             
                },
         
     | 
| 138 | 
         
             
                {
         
     | 
| 139 | 
         
             
                  "epoch": 0.9052504526252263,
         
     | 
| 140 | 
         
            -
                  "grad_norm": 0. 
     | 
| 141 | 
         
            -
                  "learning_rate": 0. 
     | 
| 142 | 
         
            -
                  "loss": 0. 
     | 
| 143 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 144 | 
         
            -
                  "num_tokens":  
     | 
| 145 | 
         
             
                  "step": 375
         
     | 
| 146 | 
         
             
                },
         
     | 
| 147 | 
         
             
                {
         
     | 
| 148 | 
         
             
                  "epoch": 0.9656004828002414,
         
     | 
| 149 | 
         
            -
                  "grad_norm": 0. 
     | 
| 150 | 
         
            -
                  "learning_rate": 0. 
     | 
| 151 | 
         
            -
                  "loss": 0. 
     | 
| 152 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 153 | 
         
            -
                  "num_tokens":  
     | 
| 154 | 
         
             
                  "step": 400
         
     | 
| 155 | 
         
             
                },
         
     | 
| 156 | 
         
             
                {
         
     | 
| 157 | 
         
             
                  "epoch": 1.0,
         
     | 
| 158 | 
         
            -
                  "eval_loss": 0. 
     | 
| 159 | 
         
            -
                  "eval_mean_token_accuracy": 0. 
     | 
| 160 | 
         
             
                  "eval_num_tokens": 2223513.0,
         
     | 
| 161 | 
         
            -
                  "eval_runtime": 60. 
     | 
| 162 | 
         
            -
                  "eval_samples_per_second": 6. 
     | 
| 163 | 
         
            -
                  "eval_steps_per_second": 3. 
     | 
| 164 | 
         
             
                  "step": 415
         
     | 
| 165 | 
         
             
                },
         
     | 
| 166 | 
         
             
                {
         
     | 
| 167 | 
         
             
                  "epoch": 1.024140012070006,
         
     | 
| 168 | 
         
            -
                  "grad_norm": 0. 
     | 
| 169 | 
         
            -
                  "learning_rate": 0. 
     | 
| 170 | 
         
            -
                  "loss": 0. 
     | 
| 171 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 172 | 
         
            -
                  "num_tokens":  
     | 
| 173 | 
         
             
                  "step": 425
         
     | 
| 174 | 
         
             
                },
         
     | 
| 175 | 
         
             
                {
         
     | 
| 176 | 
         
             
                  "epoch": 1.0844900422450212,
         
     | 
| 177 | 
         
            -
                  "grad_norm": 0. 
     | 
| 178 | 
         
            -
                  "learning_rate": 0. 
     | 
| 179 | 
         
            -
                  "loss": 0. 
     | 
| 180 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 181 | 
         
            -
                  "num_tokens":  
     | 
| 182 | 
         
             
                  "step": 450
         
     | 
| 183 | 
         
             
                },
         
     | 
| 184 | 
         
             
                {
         
     | 
| 185 | 
         
             
                  "epoch": 1.1448400724200363,
         
     | 
| 186 | 
         
            -
                  "grad_norm": 0. 
     | 
| 187 | 
         
            -
                  "learning_rate": 0. 
     | 
| 188 | 
         
            -
                  "loss": 0. 
     | 
| 189 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 190 | 
         
            -
                  "num_tokens":  
     | 
| 191 | 
         
             
                  "step": 475
         
     | 
| 192 | 
         
             
                },
         
     | 
| 193 | 
         
             
                {
         
     | 
| 194 | 
         
             
                  "epoch": 1.2051901025950513,
         
     | 
| 195 | 
         
            -
                  "grad_norm": 0. 
     | 
| 196 | 
         
            -
                  "learning_rate": 0. 
     | 
| 197 | 
         
            -
                  "loss": 0. 
     | 
| 198 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 199 | 
         
            -
                  "num_tokens":  
     | 
| 200 | 
         
             
                  "step": 500
         
     | 
| 201 | 
         
             
                },
         
     | 
| 202 | 
         
             
                {
         
     | 
| 203 | 
         
             
                  "epoch": 1.2655401327700664,
         
     | 
| 204 | 
         
            -
                  "grad_norm": 0. 
     | 
| 205 | 
         
            -
                  "learning_rate": 0. 
     | 
| 206 | 
         
            -
                  "loss": 0. 
     | 
| 207 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 208 | 
         
            -
                  "num_tokens":  
     | 
| 209 | 
         
             
                  "step": 525
         
     | 
| 210 | 
         
             
                },
         
     | 
| 211 | 
         
             
                {
         
     | 
| 212 | 
         
             
                  "epoch": 1.3258901629450814,
         
     | 
| 213 | 
         
            -
                  "grad_norm": 0. 
     | 
| 214 | 
         
            -
                  "learning_rate": 0. 
     | 
| 215 | 
         
            -
                  "loss": 0. 
     | 
| 216 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 217 | 
         
            -
                  "num_tokens":  
     | 
| 218 | 
         
             
                  "step": 550
         
     | 
| 219 | 
         
             
                },
         
     | 
| 220 | 
         
             
                {
         
     | 
| 221 | 
         
             
                  "epoch": 1.3862401931200965,
         
     | 
| 222 | 
         
            -
                  "grad_norm": 0. 
     | 
| 223 | 
         
            -
                  "learning_rate": 0. 
     | 
| 224 | 
         
            -
                  "loss": 0. 
     | 
| 225 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 226 | 
         
            -
                  "num_tokens":  
     | 
| 227 | 
         
             
                  "step": 575
         
     | 
| 228 | 
         
             
                },
         
     | 
| 229 | 
         
             
                {
         
     | 
| 230 | 
         
             
                  "epoch": 1.4465902232951118,
         
     | 
| 231 | 
         
            -
                  "grad_norm": 0. 
     | 
| 232 | 
         
            -
                  "learning_rate": 0. 
     | 
| 233 | 
         
            -
                  "loss": 0. 
     | 
| 234 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 235 | 
         
            -
                  "num_tokens":  
     | 
| 236 | 
         
             
                  "step": 600
         
     | 
| 237 | 
         
             
                },
         
     | 
| 238 | 
         
             
                {
         
     | 
| 239 | 
         
             
                  "epoch": 1.5069402534701268,
         
     | 
| 240 | 
         
            -
                  "grad_norm": 0. 
     | 
| 241 | 
         
            -
                  "learning_rate": 0. 
     | 
| 242 | 
         
            -
                  "loss": 0. 
     | 
| 243 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 244 | 
         
            -
                  "num_tokens":  
     | 
| 245 | 
         
             
                  "step": 625
         
     | 
| 246 | 
         
             
                },
         
     | 
| 247 | 
         
             
                {
         
     | 
| 248 | 
         
             
                  "epoch": 1.567290283645142,
         
     | 
| 249 | 
         
            -
                  "grad_norm": 0. 
     | 
| 250 | 
         
            -
                  "learning_rate": 0. 
     | 
| 251 | 
         
            -
                  "loss": 0. 
     | 
| 252 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 253 | 
         
            -
                  "num_tokens":  
     | 
| 254 | 
         
             
                  "step": 650
         
     | 
| 255 | 
         
             
                },
         
     | 
| 256 | 
         
             
                {
         
     | 
| 257 | 
         
             
                  "epoch": 1.627640313820157,
         
     | 
| 258 | 
         
            -
                  "grad_norm": 0. 
     | 
| 259 | 
         
            -
                  "learning_rate": 0. 
     | 
| 260 | 
         
            -
                  "loss": 0. 
     | 
| 261 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 262 | 
         
            -
                  "num_tokens":  
     | 
| 263 | 
         
             
                  "step": 675
         
     | 
| 264 | 
         
             
                },
         
     | 
| 265 | 
         
             
                {
         
     | 
| 266 | 
         
             
                  "epoch": 1.687990343995172,
         
     | 
| 267 | 
         
            -
                  "grad_norm": 0. 
     | 
| 268 | 
         
            -
                  "learning_rate": 0. 
     | 
| 269 | 
         
            -
                  "loss": 0. 
     | 
| 270 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 271 | 
         
            -
                  "num_tokens":  
     | 
| 272 | 
         
             
                  "step": 700
         
     | 
| 273 | 
         
             
                },
         
     | 
| 274 | 
         
             
                {
         
     | 
| 275 | 
         
             
                  "epoch": 1.748340374170187,
         
     | 
| 276 | 
         
            -
                  "grad_norm": 0. 
     | 
| 277 | 
         
            -
                  "learning_rate": 0. 
     | 
| 278 | 
         
            -
                  "loss": 0. 
     | 
| 279 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 280 | 
         
            -
                  "num_tokens":  
     | 
| 281 | 
         
             
                  "step": 725
         
     | 
| 282 | 
         
             
                },
         
     | 
| 283 | 
         
             
                {
         
     | 
| 284 | 
         
             
                  "epoch": 1.8086904043452021,
         
     | 
| 285 | 
         
            -
                  "grad_norm": 0. 
     | 
| 286 | 
         
            -
                  "learning_rate": 0. 
     | 
| 287 | 
         
            -
                  "loss": 0. 
     | 
| 288 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 289 | 
         
            -
                  "num_tokens":  
     | 
| 290 | 
         
             
                  "step": 750
         
     | 
| 291 | 
         
             
                },
         
     | 
| 292 | 
         
             
                {
         
     | 
| 293 | 
         
             
                  "epoch": 1.8690404345202172,
         
     | 
| 294 | 
         
            -
                  "grad_norm": 0. 
     | 
| 295 | 
         
            -
                  "learning_rate": 0. 
     | 
| 296 | 
         
            -
                  "loss": 0. 
     | 
| 297 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 298 | 
         
            -
                  "num_tokens":  
     | 
| 299 | 
         
             
                  "step": 775
         
     | 
| 300 | 
         
             
                },
         
     | 
| 301 | 
         
             
                {
         
     | 
| 302 | 
         
             
                  "epoch": 1.9293904646952322,
         
     | 
| 303 | 
         
            -
                  "grad_norm": 0. 
     | 
| 304 | 
         
            -
                  "learning_rate": 0. 
     | 
| 305 | 
         
            -
                  "loss": 0. 
     | 
| 306 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 307 | 
         
            -
                  "num_tokens":  
     | 
| 308 | 
         
             
                  "step": 800
         
     | 
| 309 | 
         
             
                },
         
     | 
| 310 | 
         
             
                {
         
     | 
| 311 | 
         
             
                  "epoch": 1.9897404948702473,
         
     | 
| 312 | 
         
            -
                  "grad_norm": 0. 
     | 
| 313 | 
         
            -
                  "learning_rate": 0. 
     | 
| 314 | 
         
            -
                  "loss": 0. 
     | 
| 315 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 316 | 
         
            -
                  "num_tokens":  
     | 
| 317 | 
         
             
                  "step": 825
         
     | 
| 318 | 
         
             
                },
         
     | 
| 319 | 
         
             
                {
         
     | 
| 320 | 
         
             
                  "epoch": 2.0,
         
     | 
| 321 | 
         
            -
                  "eval_loss": 0. 
     | 
| 322 | 
         
            -
                  "eval_mean_token_accuracy": 0. 
     | 
| 323 | 
         
             
                  "eval_num_tokens": 4447026.0,
         
     | 
| 324 | 
         
            -
                  "eval_runtime": 60. 
     | 
| 325 | 
         
            -
                  "eval_samples_per_second": 6. 
     | 
| 326 | 
         
            -
                  "eval_steps_per_second": 3. 
     | 
| 327 | 
         
             
                  "step": 830
         
     | 
| 328 | 
         
             
                },
         
     | 
| 329 | 
         
             
                {
         
     | 
| 330 | 
         
             
                  "epoch": 2.048280024140012,
         
     | 
| 331 | 
         
            -
                  "grad_norm": 0. 
     | 
| 332 | 
         
            -
                  "learning_rate": 0. 
     | 
| 333 | 
         
            -
                  "loss": 0. 
     | 
| 334 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 335 | 
         
            -
                  "num_tokens":  
     | 
| 336 | 
         
             
                  "step": 850
         
     | 
| 337 | 
         
             
                },
         
     | 
| 338 | 
         
             
                {
         
     | 
| 339 | 
         
             
                  "epoch": 2.1086300543150274,
         
     | 
| 340 | 
         
            -
                  "grad_norm": 0. 
     | 
| 341 | 
         
            -
                  "learning_rate": 0. 
     | 
| 342 | 
         
            -
                  "loss": 0. 
     | 
| 343 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 344 | 
         
            -
                  "num_tokens":  
     | 
| 345 | 
         
             
                  "step": 875
         
     | 
| 346 | 
         
             
                },
         
     | 
| 347 | 
         
             
                {
         
     | 
| 348 | 
         
             
                  "epoch": 2.1689800844900424,
         
     | 
| 349 | 
         
            -
                  "grad_norm": 0. 
     | 
| 350 | 
         
            -
                  "learning_rate": 0. 
     | 
| 351 | 
         
            -
                  "loss": 0. 
     | 
| 352 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 353 | 
         
            -
                  "num_tokens":  
     | 
| 354 | 
         
             
                  "step": 900
         
     | 
| 355 | 
         
             
                },
         
     | 
| 356 | 
         
             
                {
         
     | 
| 357 | 
         
             
                  "epoch": 2.2293301146650575,
         
     | 
| 358 | 
         
            -
                  "grad_norm": 0. 
     | 
| 359 | 
         
            -
                  "learning_rate": 0. 
     | 
| 360 | 
         
            -
                  "loss": 0. 
     | 
| 361 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 362 | 
         
            -
                  "num_tokens":  
     | 
| 363 | 
         
             
                  "step": 925
         
     | 
| 364 | 
         
             
                },
         
     | 
| 365 | 
         
             
                {
         
     | 
| 366 | 
         
             
                  "epoch": 2.2896801448400725,
         
     | 
| 367 | 
         
            -
                  "grad_norm": 0. 
     | 
| 368 | 
         
            -
                  "learning_rate": 0. 
     | 
| 369 | 
         
            -
                  "loss": 0. 
     | 
| 370 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 371 | 
         
            -
                  "num_tokens":  
     | 
| 372 | 
         
             
                  "step": 950
         
     | 
| 373 | 
         
             
                },
         
     | 
| 374 | 
         
             
                {
         
     | 
| 375 | 
         
             
                  "epoch": 2.3500301750150876,
         
     | 
| 376 | 
         
            -
                  "grad_norm": 0. 
     | 
| 377 | 
         
            -
                  "learning_rate": 0. 
     | 
| 378 | 
         
            -
                  "loss": 0. 
     | 
| 379 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 380 | 
         
            -
                  "num_tokens":  
     | 
| 381 | 
         
             
                  "step": 975
         
     | 
| 382 | 
         
             
                },
         
     | 
| 383 | 
         
             
                {
         
     | 
| 384 | 
         
             
                  "epoch": 2.4103802051901027,
         
     | 
| 385 | 
         
            -
                  "grad_norm": 0. 
     | 
| 386 | 
         
            -
                  "learning_rate": 0. 
     | 
| 387 | 
         
            -
                  "loss": 0. 
     | 
| 388 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 389 | 
         
            -
                  "num_tokens":  
     | 
| 390 | 
         
             
                  "step": 1000
         
     | 
| 391 | 
         
             
                },
         
     | 
| 392 | 
         
             
                {
         
     | 
| 393 | 
         
             
                  "epoch": 2.4707302353651177,
         
     | 
| 394 | 
         
            -
                  "grad_norm": 0. 
     | 
| 395 | 
         
            -
                  "learning_rate": 0. 
     | 
| 396 | 
         
            -
                  "loss": 0. 
     | 
| 397 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 398 | 
         
            -
                  "num_tokens":  
     | 
| 399 | 
         
             
                  "step": 1025
         
     | 
| 400 | 
         
             
                },
         
     | 
| 401 | 
         
             
                {
         
     | 
| 402 | 
         
             
                  "epoch": 2.5310802655401328,
         
     | 
| 403 | 
         
            -
                  "grad_norm": 0. 
     | 
| 404 | 
         
            -
                  "learning_rate": 0. 
     | 
| 405 | 
         
            -
                  "loss": 0. 
     | 
| 406 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 407 | 
         
            -
                  "num_tokens":  
     | 
| 408 | 
         
             
                  "step": 1050
         
     | 
| 409 | 
         
             
                },
         
     | 
| 410 | 
         
             
                {
         
     | 
| 411 | 
         
             
                  "epoch": 2.591430295715148,
         
     | 
| 412 | 
         
            -
                  "grad_norm": 0. 
     | 
| 413 | 
         
            -
                  "learning_rate": 0. 
     | 
| 414 | 
         
            -
                  "loss": 0. 
     | 
| 415 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 416 | 
         
            -
                  "num_tokens":  
     | 
| 417 | 
         
             
                  "step": 1075
         
     | 
| 418 | 
         
             
                },
         
     | 
| 419 | 
         
             
                {
         
     | 
| 420 | 
         
             
                  "epoch": 2.651780325890163,
         
     | 
| 421 | 
         
            -
                  "grad_norm": 0. 
     | 
| 422 | 
         
            -
                  "learning_rate": 0. 
     | 
| 423 | 
         
            -
                  "loss": 0. 
     | 
| 424 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 425 | 
         
            -
                  "num_tokens":  
     | 
| 426 | 
         
             
                  "step": 1100
         
     | 
| 427 | 
         
             
                },
         
     | 
| 428 | 
         
             
                {
         
     | 
| 429 | 
         
             
                  "epoch": 2.712130356065178,
         
     | 
| 430 | 
         
            -
                  "grad_norm": 0. 
     | 
| 431 | 
         
            -
                  "learning_rate": 0. 
     | 
| 432 | 
         
            -
                  "loss": 0. 
     | 
| 433 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 434 | 
         
            -
                  "num_tokens":  
     | 
| 435 | 
         
             
                  "step": 1125
         
     | 
| 436 | 
         
             
                },
         
     | 
| 437 | 
         
             
                {
         
     | 
| 438 | 
         
             
                  "epoch": 2.772480386240193,
         
     | 
| 439 | 
         
            -
                  "grad_norm": 0. 
     | 
| 440 | 
         
            -
                  "learning_rate": 0. 
     | 
| 441 | 
         
            -
                  "loss": 0. 
     | 
| 442 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 443 | 
         
            -
                  "num_tokens":  
     | 
| 444 | 
         
             
                  "step": 1150
         
     | 
| 445 | 
         
             
                },
         
     | 
| 446 | 
         
             
                {
         
     | 
| 447 | 
         
             
                  "epoch": 2.832830416415208,
         
     | 
| 448 | 
         
            -
                  "grad_norm": 0. 
     | 
| 449 | 
         
            -
                  "learning_rate": 0. 
     | 
| 450 | 
         
            -
                  "loss": 0. 
     | 
| 451 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 452 | 
         
            -
                  "num_tokens":  
     | 
| 453 | 
         
             
                  "step": 1175
         
     | 
| 454 | 
         
             
                },
         
     | 
| 455 | 
         
             
                {
         
     | 
| 456 | 
         
             
                  "epoch": 2.8931804465902236,
         
     | 
| 457 | 
         
            -
                  "grad_norm": 0. 
     | 
| 458 | 
         
            -
                  "learning_rate": 0. 
     | 
| 459 | 
         
            -
                  "loss": 0. 
     | 
| 460 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 461 | 
         
            -
                  "num_tokens":  
     | 
| 462 | 
         
             
                  "step": 1200
         
     | 
| 463 | 
         
             
                },
         
     | 
| 464 | 
         
             
                {
         
     | 
| 465 | 
         
             
                  "epoch": 2.9535304767652386,
         
     | 
| 466 | 
         
            -
                  "grad_norm": 0. 
     | 
| 467 | 
         
            -
                  "learning_rate": 0. 
     | 
| 468 | 
         
            -
                  "loss": 0. 
     | 
| 469 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 470 | 
         
            -
                  "num_tokens":  
     | 
| 471 | 
         
             
                  "step": 1225
         
     | 
| 472 | 
         
             
                },
         
     | 
| 473 | 
         
             
                {
         
     | 
| 474 | 
         
             
                  "epoch": 3.0,
         
     | 
| 475 | 
         
            -
                  "eval_loss": 0. 
     | 
| 476 | 
         
            -
                  "eval_mean_token_accuracy": 0. 
     | 
| 477 | 
         
             
                  "eval_num_tokens": 6670539.0,
         
     | 
| 478 | 
         
            -
                  "eval_runtime": 60. 
     | 
| 479 | 
         
            -
                  "eval_samples_per_second": 6. 
     | 
| 480 | 
         
            -
                  "eval_steps_per_second": 3. 
     | 
| 481 | 
         
             
                  "step": 1245
         
     | 
| 482 | 
         
             
                },
         
     | 
| 483 | 
         
             
                {
         
     | 
| 484 | 
         
             
                  "epoch": 3.012070006035003,
         
     | 
| 485 | 
         
            -
                  "grad_norm": 0. 
     | 
| 486 | 
         
            -
                  "learning_rate": 0. 
     | 
| 487 | 
         
            -
                  "loss": 0. 
     | 
| 488 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 489 | 
         
            -
                  "num_tokens":  
     | 
| 490 | 
         
             
                  "step": 1250
         
     | 
| 491 | 
         
             
                },
         
     | 
| 492 | 
         
             
                {
         
     | 
| 493 | 
         
             
                  "epoch": 3.0724200362100182,
         
     | 
| 494 | 
         
            -
                  "grad_norm": 0. 
     | 
| 495 | 
         
            -
                  "learning_rate": 0. 
     | 
| 496 | 
         
            -
                  "loss": 0. 
     | 
| 497 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 498 | 
         
            -
                  "num_tokens":  
     | 
| 499 | 
         
             
                  "step": 1275
         
     | 
| 500 | 
         
             
                },
         
     | 
| 501 | 
         
             
                {
         
     | 
| 502 | 
         
             
                  "epoch": 3.1327700663850333,
         
     | 
| 503 | 
         
            -
                  "grad_norm": 0. 
     | 
| 504 | 
         
            -
                  "learning_rate":  
     | 
| 505 | 
         
            -
                  "loss": 0. 
     | 
| 506 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 507 | 
         
            -
                  "num_tokens":  
     | 
| 508 | 
         
             
                  "step": 1300
         
     | 
| 509 | 
         
             
                },
         
     | 
| 510 | 
         
             
                {
         
     | 
| 511 | 
         
             
                  "epoch": 3.1931200965600484,
         
     | 
| 512 | 
         
            -
                  "grad_norm": 0. 
     | 
| 513 | 
         
            -
                  "learning_rate":  
     | 
| 514 | 
         
            -
                  "loss": 0. 
     | 
| 515 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 516 | 
         
            -
                  "num_tokens":  
     | 
| 517 | 
         
             
                  "step": 1325
         
     | 
| 518 | 
         
             
                },
         
     | 
| 519 | 
         
             
                {
         
     | 
| 520 | 
         
             
                  "epoch": 3.2534701267350634,
         
     | 
| 521 | 
         
            -
                  "grad_norm": 0. 
     | 
| 522 | 
         
            -
                  "learning_rate":  
     | 
| 523 | 
         
            -
                  "loss": 0. 
     | 
| 524 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 525 | 
         
            -
                  "num_tokens":  
     | 
| 526 | 
         
             
                  "step": 1350
         
     | 
| 527 | 
         
             
                },
         
     | 
| 528 | 
         
             
                {
         
     | 
| 529 | 
         
             
                  "epoch": 3.3138201569100785,
         
     | 
| 530 | 
         
            -
                  "grad_norm": 0. 
     | 
| 531 | 
         
            -
                  "learning_rate":  
     | 
| 532 | 
         
            -
                  "loss": 0. 
     | 
| 533 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 534 | 
         
            -
                  "num_tokens":  
     | 
| 535 | 
         
             
                  "step": 1375
         
     | 
| 536 | 
         
             
                },
         
     | 
| 537 | 
         
             
                {
         
     | 
| 538 | 
         
             
                  "epoch": 3.3741701870850935,
         
     | 
| 539 | 
         
            -
                  "grad_norm": 0. 
     | 
| 540 | 
         
            -
                  "learning_rate":  
     | 
| 541 | 
         
            -
                  "loss": 0. 
     | 
| 542 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 543 | 
         
            -
                  "num_tokens":  
     | 
| 544 | 
         
             
                  "step": 1400
         
     | 
| 545 | 
         
             
                },
         
     | 
| 546 | 
         
             
                {
         
     | 
| 547 | 
         
             
                  "epoch": 3.4345202172601086,
         
     | 
| 548 | 
         
            -
                  "grad_norm": 0. 
     | 
| 549 | 
         
            -
                  "learning_rate":  
     | 
| 550 | 
         
            -
                  "loss": 0. 
     | 
| 551 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 552 | 
         
            -
                  "num_tokens":  
     | 
| 553 | 
         
             
                  "step": 1425
         
     | 
| 554 | 
         
             
                },
         
     | 
| 555 | 
         
             
                {
         
     | 
| 556 | 
         
             
                  "epoch": 3.4948702474351236,
         
     | 
| 557 | 
         
            -
                  "grad_norm": 0. 
     | 
| 558 | 
         
            -
                  "learning_rate":  
     | 
| 559 | 
         
            -
                  "loss": 0. 
     | 
| 560 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 561 | 
         
            -
                  "num_tokens":  
     | 
| 562 | 
         
             
                  "step": 1450
         
     | 
| 563 | 
         
             
                },
         
     | 
| 564 | 
         
             
                {
         
     | 
| 565 | 
         
             
                  "epoch": 3.5552202776101387,
         
     | 
| 566 | 
         
            -
                  "grad_norm": 0. 
     | 
| 567 | 
         
            -
                  "learning_rate":  
     | 
| 568 | 
         
            -
                  "loss": 0. 
     | 
| 569 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 570 | 
         
            -
                  "num_tokens":  
     | 
| 571 | 
         
             
                  "step": 1475
         
     | 
| 572 | 
         
             
                },
         
     | 
| 573 | 
         
             
                {
         
     | 
| 574 | 
         
             
                  "epoch": 3.6155703077851538,
         
     | 
| 575 | 
         
            -
                  "grad_norm": 0. 
     | 
| 576 | 
         
            -
                  "learning_rate":  
     | 
| 577 | 
         
            -
                  "loss": 0. 
     | 
| 578 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 579 | 
         
            -
                  "num_tokens":  
     | 
| 580 | 
         
             
                  "step": 1500
         
     | 
| 581 | 
         
             
                },
         
     | 
| 582 | 
         
             
                {
         
     | 
| 583 | 
         
             
                  "epoch": 3.675920337960169,
         
     | 
| 584 | 
         
            -
                  "grad_norm": 0. 
     | 
| 585 | 
         
            -
                  "learning_rate":  
     | 
| 586 | 
         
            -
                  "loss": 0. 
     | 
| 587 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 588 | 
         
            -
                  "num_tokens":  
     | 
| 589 | 
         
             
                  "step": 1525
         
     | 
| 590 | 
         
             
                },
         
     | 
| 591 | 
         
             
                {
         
     | 
| 592 | 
         
             
                  "epoch": 3.736270368135184,
         
     | 
| 593 | 
         
            -
                  "grad_norm": 0. 
     | 
| 594 | 
         
            -
                  "learning_rate":  
     | 
| 595 | 
         
            -
                  "loss": 0. 
     | 
| 596 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 597 | 
         
            -
                  "num_tokens":  
     | 
| 598 | 
         
             
                  "step": 1550
         
     | 
| 599 | 
         
             
                },
         
     | 
| 600 | 
         
             
                {
         
     | 
| 601 | 
         
             
                  "epoch": 3.796620398310199,
         
     | 
| 602 | 
         
            -
                  "grad_norm": 0. 
     | 
| 603 | 
         
            -
                  "learning_rate":  
     | 
| 604 | 
         
            -
                  "loss": 0. 
     | 
| 605 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 606 | 
         
            -
                  "num_tokens":  
     | 
| 607 | 
         
             
                  "step": 1575
         
     | 
| 608 | 
         
             
                },
         
     | 
| 609 | 
         
             
                {
         
     | 
| 610 | 
         
             
                  "epoch": 3.856970428485214,
         
     | 
| 611 | 
         
            -
                  "grad_norm": 0. 
     | 
| 612 | 
         
            -
                  "learning_rate":  
     | 
| 613 | 
         
            -
                  "loss": 0. 
     | 
| 614 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 615 | 
         
            -
                  "num_tokens":  
     | 
| 616 | 
         
             
                  "step": 1600
         
     | 
| 617 | 
         
             
                },
         
     | 
| 618 | 
         
             
                {
         
     | 
| 619 | 
         
             
                  "epoch": 3.9173204586602295,
         
     | 
| 620 | 
         
            -
                  "grad_norm": 0. 
     | 
| 621 | 
         
            -
                  "learning_rate":  
     | 
| 622 | 
         
            -
                  "loss": 0. 
     | 
| 623 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 624 | 
         
            -
                  "num_tokens":  
     | 
| 625 | 
         
             
                  "step": 1625
         
     | 
| 626 | 
         
             
                },
         
     | 
| 627 | 
         
             
                {
         
     | 
| 628 | 
         
             
                  "epoch": 3.9776704888352445,
         
     | 
| 629 | 
         
            -
                  "grad_norm": 0. 
     | 
| 630 | 
         
            -
                  "learning_rate":  
     | 
| 631 | 
         
            -
                  "loss": 0. 
     | 
| 632 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 633 | 
         
            -
                  "num_tokens":  
     | 
| 634 | 
         
             
                  "step": 1650
         
     | 
| 635 | 
         
             
                },
         
     | 
| 636 | 
         
             
                {
         
     | 
| 637 | 
         
             
                  "epoch": 4.0,
         
     | 
| 638 | 
         
            -
                  "eval_loss": 0. 
     | 
| 639 | 
         
            -
                  "eval_mean_token_accuracy": 0. 
     | 
| 640 | 
         
             
                  "eval_num_tokens": 8894052.0,
         
     | 
| 641 | 
         
            -
                  "eval_runtime": 60. 
     | 
| 642 | 
         
            -
                  "eval_samples_per_second": 6. 
     | 
| 643 | 
         
            -
                  "eval_steps_per_second": 3. 
     | 
| 644 | 
         
             
                  "step": 1660
         
     | 
| 645 | 
         
             
                }
         
     | 
| 646 | 
         
             
              ],
         
     | 
| 647 | 
         
             
              "logging_steps": 25,
         
     | 
| 648 | 
         
            -
              "max_steps":  
     | 
| 649 | 
         
             
              "num_input_tokens_seen": 0,
         
     | 
| 650 | 
         
            -
              "num_train_epochs":  
     | 
| 651 | 
         
             
              "save_steps": 500,
         
     | 
| 652 | 
         
             
              "stateful_callbacks": {
         
     | 
| 653 | 
         
             
                "TrainerControl": {
         
     | 
| 
         @@ -661,7 +661,7 @@ 
     | 
|
| 661 | 
         
             
                  "attributes": {}
         
     | 
| 662 | 
         
             
                }
         
     | 
| 663 | 
         
             
              },
         
     | 
| 664 | 
         
            -
              "total_flos": 3. 
     | 
| 665 | 
         
             
              "train_batch_size": 2,
         
     | 
| 666 | 
         
             
              "trial_name": null,
         
     | 
| 667 | 
         
             
              "trial_params": null
         
     | 
| 
         | 
|
| 11 | 
         
             
              "log_history": [
         
     | 
| 12 | 
         
             
                {
         
     | 
| 13 | 
         
             
                  "epoch": 0.060350030175015085,
         
     | 
| 14 | 
         
            +
                  "grad_norm": 0.7244853377342224,
         
     | 
| 15 | 
         
            +
                  "learning_rate": 0.00011428571428571427,
         
     | 
| 16 | 
         
            +
                  "loss": 1.5091,
         
     | 
| 17 | 
         
            +
                  "mean_token_accuracy": 0.6793113535642624,
         
     | 
| 18 | 
         
            +
                  "num_tokens": 152165.0,
         
     | 
| 19 | 
         
             
                  "step": 25
         
     | 
| 20 | 
         
             
                },
         
     | 
| 21 | 
         
             
                {
         
     | 
| 22 | 
         
             
                  "epoch": 0.12070006035003017,
         
     | 
| 23 | 
         
            +
                  "grad_norm": 0.8389242887496948,
         
     | 
| 24 | 
         
            +
                  "learning_rate": 0.0002333333333333333,
         
     | 
| 25 | 
         
            +
                  "loss": 0.8436,
         
     | 
| 26 | 
         
            +
                  "mean_token_accuracy": 0.7881802421808243,
         
     | 
| 27 | 
         
            +
                  "num_tokens": 267390.0,
         
     | 
| 28 | 
         
             
                  "step": 50
         
     | 
| 29 | 
         
             
                },
         
     | 
| 30 | 
         
             
                {
         
     | 
| 31 | 
         
             
                  "epoch": 0.18105009052504525,
         
     | 
| 32 | 
         
            +
                  "grad_norm": 0.6344988942146301,
         
     | 
| 33 | 
         
            +
                  "learning_rate": 0.00029997787517981614,
         
     | 
| 34 | 
         
            +
                  "loss": 0.5527,
         
     | 
| 35 | 
         
            +
                  "mean_token_accuracy": 0.8469069242477417,
         
     | 
| 36 | 
         
            +
                  "num_tokens": 420975.0,
         
     | 
| 37 | 
         
             
                  "step": 75
         
     | 
| 38 | 
         
             
                },
         
     | 
| 39 | 
         
             
                {
         
     | 
| 40 | 
         
             
                  "epoch": 0.24140012070006034,
         
     | 
| 41 | 
         
            +
                  "grad_norm": 0.7947192192077637,
         
     | 
| 42 | 
         
            +
                  "learning_rate": 0.0002997630832860032,
         
     | 
| 43 | 
         
            +
                  "loss": 0.4522,
         
     | 
| 44 | 
         
            +
                  "mean_token_accuracy": 0.870941441655159,
         
     | 
| 45 | 
         
            +
                  "num_tokens": 538380.0,
         
     | 
| 46 | 
         
             
                  "step": 100
         
     | 
| 47 | 
         
             
                },
         
     | 
| 48 | 
         
             
                {
         
     | 
| 49 | 
         
             
                  "epoch": 0.30175015087507545,
         
     | 
| 50 | 
         
            +
                  "grad_norm": 0.43716728687286377,
         
     | 
| 51 | 
         
            +
                  "learning_rate": 0.0002993201135681549,
         
     | 
| 52 | 
         
            +
                  "loss": 0.3049,
         
     | 
| 53 | 
         
            +
                  "mean_token_accuracy": 0.9136220461130142,
         
     | 
| 54 | 
         
            +
                  "num_tokens": 690650.0,
         
     | 
| 55 | 
         
             
                  "step": 125
         
     | 
| 56 | 
         
             
                },
         
     | 
| 57 | 
         
             
                {
         
     | 
| 58 | 
         
             
                  "epoch": 0.3621001810500905,
         
     | 
| 59 | 
         
            +
                  "grad_norm": 1.09097421169281,
         
     | 
| 60 | 
         
            +
                  "learning_rate": 0.0002986496409313553,
         
     | 
| 61 | 
         
            +
                  "loss": 0.3172,
         
     | 
| 62 | 
         
            +
                  "mean_token_accuracy": 0.91127048432827,
         
     | 
| 63 | 
         
            +
                  "num_tokens": 806066.0,
         
     | 
| 64 | 
         
             
                  "step": 150
         
     | 
| 65 | 
         
             
                },
         
     | 
| 66 | 
         
             
                {
         
     | 
| 67 | 
         
             
                  "epoch": 0.4224502112251056,
         
     | 
| 68 | 
         
            +
                  "grad_norm": 0.3773705065250397,
         
     | 
| 69 | 
         
            +
                  "learning_rate": 0.0002977526869022985,
         
     | 
| 70 | 
         
            +
                  "loss": 0.2029,
         
     | 
| 71 | 
         
            +
                  "mean_token_accuracy": 0.9433162885904313,
         
     | 
| 72 | 
         
            +
                  "num_tokens": 960853.0,
         
     | 
| 73 | 
         
             
                  "step": 175
         
     | 
| 74 | 
         
             
                },
         
     | 
| 75 | 
         
             
                {
         
     | 
| 76 | 
         
             
                  "epoch": 0.4828002414001207,
         
     | 
| 77 | 
         
            +
                  "grad_norm": 0.8292771577835083,
         
     | 
| 78 | 
         
            +
                  "learning_rate": 0.0002966306180728982,
         
     | 
| 79 | 
         
            +
                  "loss": 0.2274,
         
     | 
| 80 | 
         
            +
                  "mean_token_accuracy": 0.9385988712310791,
         
     | 
| 81 | 
         
            +
                  "num_tokens": 1077726.0,
         
     | 
| 82 | 
         
             
                  "step": 200
         
     | 
| 83 | 
         
             
                },
         
     | 
| 84 | 
         
             
                {
         
     | 
| 85 | 
         
             
                  "epoch": 0.5431502715751357,
         
     | 
| 86 | 
         
            +
                  "grad_norm": 0.4765889346599579,
         
     | 
| 87 | 
         
            +
                  "learning_rate": 0.0002952851440181598,
         
     | 
| 88 | 
         
            +
                  "loss": 0.19,
         
     | 
| 89 | 
         
            +
                  "mean_token_accuracy": 0.9479016721248626,
         
     | 
| 90 | 
         
            +
                  "num_tokens": 1232263.0,
         
     | 
| 91 | 
         
             
                  "step": 225
         
     | 
| 92 | 
         
             
                },
         
     | 
| 93 | 
         
             
                {
         
     | 
| 94 | 
         
             
                  "epoch": 0.6035003017501509,
         
     | 
| 95 | 
         
            +
                  "grad_norm": 0.9254749417304993,
         
     | 
| 96 | 
         
            +
                  "learning_rate": 0.0002937183146914856,
         
     | 
| 97 | 
         
            +
                  "loss": 0.1826,
         
     | 
| 98 | 
         
            +
                  "mean_token_accuracy": 0.9498224484920502,
         
     | 
| 99 | 
         
            +
                  "num_tokens": 1349057.0,
         
     | 
| 100 | 
         
             
                  "step": 250
         
     | 
| 101 | 
         
             
                },
         
     | 
| 102 | 
         
             
                {
         
     | 
| 103 | 
         
             
                  "epoch": 0.663850331925166,
         
     | 
| 104 | 
         
            +
                  "grad_norm": 0.4938018023967743,
         
     | 
| 105 | 
         
            +
                  "learning_rate": 0.000291932517301382,
         
     | 
| 106 | 
         
            +
                  "loss": 0.1497,
         
     | 
| 107 | 
         
            +
                  "mean_token_accuracy": 0.9588899296522141,
         
     | 
| 108 | 
         
            +
                  "num_tokens": 1496867.0,
         
     | 
| 109 | 
         
             
                  "step": 275
         
     | 
| 110 | 
         
             
                },
         
     | 
| 111 | 
         
             
                {
         
     | 
| 112 | 
         
             
                  "epoch": 0.724200362100181,
         
     | 
| 113 | 
         
            +
                  "grad_norm": 0.6995358467102051,
         
     | 
| 114 | 
         
            +
                  "learning_rate": 0.00028993047267432864,
         
     | 
| 115 | 
         
            +
                  "loss": 0.1578,
         
     | 
| 116 | 
         
            +
                  "mean_token_accuracy": 0.9568761509656906,
         
     | 
| 117 | 
         
            +
                  "num_tokens": 1610727.0,
         
     | 
| 118 | 
         
             
                  "step": 300
         
     | 
| 119 | 
         
             
                },
         
     | 
| 120 | 
         
             
                {
         
     | 
| 121 | 
         
             
                  "epoch": 0.7845503922751962,
         
     | 
| 122 | 
         
            +
                  "grad_norm": 0.46799567341804504,
         
     | 
| 123 | 
         
            +
                  "learning_rate": 0.0002877152311093483,
         
     | 
| 124 | 
         
            +
                  "loss": 0.1351,
         
     | 
| 125 | 
         
            +
                  "mean_token_accuracy": 0.9633717983961105,
         
     | 
| 126 | 
         
            +
                  "num_tokens": 1762041.0,
         
     | 
| 127 | 
         
             
                  "step": 325
         
     | 
| 128 | 
         
             
                },
         
     | 
| 129 | 
         
             
                {
         
     | 
| 130 | 
         
             
                  "epoch": 0.8449004224502112,
         
     | 
| 131 | 
         
            +
                  "grad_norm": 0.6729409098625183,
         
     | 
| 132 | 
         
            +
                  "learning_rate": 0.00028529016773059656,
         
     | 
| 133 | 
         
            +
                  "loss": 0.1206,
         
     | 
| 134 | 
         
            +
                  "mean_token_accuracy": 0.9687577307224273,
         
     | 
| 135 | 
         
            +
                  "num_tokens": 1877965.0,
         
     | 
| 136 | 
         
             
                  "step": 350
         
     | 
| 137 | 
         
             
                },
         
     | 
| 138 | 
         
             
                {
         
     | 
| 139 | 
         
             
                  "epoch": 0.9052504526252263,
         
     | 
| 140 | 
         
            +
                  "grad_norm": 0.5820412635803223,
         
     | 
| 141 | 
         
            +
                  "learning_rate": 0.00028265897734504976,
         
     | 
| 142 | 
         
            +
                  "loss": 0.1183,
         
     | 
| 143 | 
         
            +
                  "mean_token_accuracy": 0.96822787463665,
         
     | 
| 144 | 
         
            +
                  "num_tokens": 2028343.0,
         
     | 
| 145 | 
         
             
                  "step": 375
         
     | 
| 146 | 
         
             
                },
         
     | 
| 147 | 
         
             
                {
         
     | 
| 148 | 
         
             
                  "epoch": 0.9656004828002414,
         
     | 
| 149 | 
         
            +
                  "grad_norm": 0.8604497909545898,
         
     | 
| 150 | 
         
            +
                  "learning_rate": 0.0002798256688131267,
         
     | 
| 151 | 
         
            +
                  "loss": 0.1159,
         
     | 
| 152 | 
         
            +
                  "mean_token_accuracy": 0.9700725018978119,
         
     | 
| 153 | 
         
            +
                  "num_tokens": 2145044.0,
         
     | 
| 154 | 
         
             
                  "step": 400
         
     | 
| 155 | 
         
             
                },
         
     | 
| 156 | 
         
             
                {
         
     | 
| 157 | 
         
             
                  "epoch": 1.0,
         
     | 
| 158 | 
         
            +
                  "eval_loss": 0.1169130727648735,
         
     | 
| 159 | 
         
            +
                  "eval_mean_token_accuracy": 0.9691641559471955,
         
     | 
| 160 | 
         
             
                  "eval_num_tokens": 2223513.0,
         
     | 
| 161 | 
         
            +
                  "eval_runtime": 60.5832,
         
     | 
| 162 | 
         
            +
                  "eval_samples_per_second": 6.091,
         
     | 
| 163 | 
         
            +
                  "eval_steps_per_second": 3.054,
         
     | 
| 164 | 
         
             
                  "step": 415
         
     | 
| 165 | 
         
             
                },
         
     | 
| 166 | 
         
             
                {
         
     | 
| 167 | 
         
             
                  "epoch": 1.024140012070006,
         
     | 
| 168 | 
         
            +
                  "grad_norm": 0.20096616446971893,
         
     | 
| 169 | 
         
            +
                  "learning_rate": 0.0002767945589408217,
         
     | 
| 170 | 
         
            +
                  "loss": 0.122,
         
     | 
| 171 | 
         
            +
                  "mean_token_accuracy": 0.9680000224064306,
         
     | 
| 172 | 
         
            +
                  "num_tokens": 2291746.0,
         
     | 
| 173 | 
         
             
                  "step": 425
         
     | 
| 174 | 
         
             
                },
         
     | 
| 175 | 
         
             
                {
         
     | 
| 176 | 
         
             
                  "epoch": 1.0844900422450212,
         
     | 
| 177 | 
         
            +
                  "grad_norm": 0.34665247797966003,
         
     | 
| 178 | 
         
            +
                  "learning_rate": 0.0002735702659026533,
         
     | 
| 179 | 
         
            +
                  "loss": 0.0836,
         
     | 
| 180 | 
         
            +
                  "mean_token_accuracy": 0.9780776232481003,
         
     | 
| 181 | 
         
            +
                  "num_tokens": 2424528.0,
         
     | 
| 182 | 
         
             
                  "step": 450
         
     | 
| 183 | 
         
             
                },
         
     | 
| 184 | 
         
             
                {
         
     | 
| 185 | 
         
             
                  "epoch": 1.1448400724200363,
         
     | 
| 186 | 
         
            +
                  "grad_norm": 0.30349963903427124,
         
     | 
| 187 | 
         
            +
                  "learning_rate": 0.0002701577022054515,
         
     | 
| 188 | 
         
            +
                  "loss": 0.1019,
         
     | 
| 189 | 
         
            +
                  "mean_token_accuracy": 0.9732917118072509,
         
     | 
| 190 | 
         
            +
                  "num_tokens": 2557091.0,
         
     | 
| 191 | 
         
             
                  "step": 475
         
     | 
| 192 | 
         
             
                },
         
     | 
| 193 | 
         
             
                {
         
     | 
| 194 | 
         
             
                  "epoch": 1.2051901025950513,
         
     | 
| 195 | 
         
            +
                  "grad_norm": 0.3892677426338196,
         
     | 
| 196 | 
         
            +
                  "learning_rate": 0.0002665620672037014,
         
     | 
| 197 | 
         
            +
                  "loss": 0.0831,
         
     | 
| 198 | 
         
            +
                  "mean_token_accuracy": 0.9782004028558731,
         
     | 
| 199 | 
         
            +
                  "num_tokens": 2691527.0,
         
     | 
| 200 | 
         
             
                  "step": 500
         
     | 
| 201 | 
         
             
                },
         
     | 
| 202 | 
         
             
                {
         
     | 
| 203 | 
         
             
                  "epoch": 1.2655401327700664,
         
     | 
| 204 | 
         
            +
                  "grad_norm": 0.29889699816703796,
         
     | 
| 205 | 
         
            +
                  "learning_rate": 0.0002627888391778493,
         
     | 
| 206 | 
         
            +
                  "loss": 0.1023,
         
     | 
| 207 | 
         
            +
                  "mean_token_accuracy": 0.9729781967401504,
         
     | 
| 208 | 
         
            +
                  "num_tokens": 2824699.0,
         
     | 
| 209 | 
         
             
                  "step": 525
         
     | 
| 210 | 
         
             
                },
         
     | 
| 211 | 
         
             
                {
         
     | 
| 212 | 
         
             
                  "epoch": 1.3258901629450814,
         
     | 
| 213 | 
         
            +
                  "grad_norm": 0.393573522567749,
         
     | 
| 214 | 
         
            +
                  "learning_rate": 0.0002588437669876384,
         
     | 
| 215 | 
         
            +
                  "loss": 0.0779,
         
     | 
| 216 | 
         
            +
                  "mean_token_accuracy": 0.9795191860198975,
         
     | 
| 217 | 
         
            +
                  "num_tokens": 2958826.0,
         
     | 
| 218 | 
         
             
                  "step": 550
         
     | 
| 219 | 
         
             
                },
         
     | 
| 220 | 
         
             
                {
         
     | 
| 221 | 
         
             
                  "epoch": 1.3862401931200965,
         
     | 
| 222 | 
         
            +
                  "grad_norm": 0.26299118995666504,
         
     | 
| 223 | 
         
            +
                  "learning_rate": 0.00025473286131319283,
         
     | 
| 224 | 
         
            +
                  "loss": 0.0988,
         
     | 
| 225 | 
         
            +
                  "mean_token_accuracy": 0.9739746767282486,
         
     | 
| 226 | 
         
            +
                  "num_tokens": 3092320.0,
         
     | 
| 227 | 
         
             
                  "step": 575
         
     | 
| 228 | 
         
             
                },
         
     | 
| 229 | 
         
             
                {
         
     | 
| 230 | 
         
             
                  "epoch": 1.4465902232951118,
         
     | 
| 231 | 
         
            +
                  "grad_norm": 0.3649594783782959,
         
     | 
| 232 | 
         
            +
                  "learning_rate": 0.0002504623854971937,
         
     | 
| 233 | 
         
            +
                  "loss": 0.0729,
         
     | 
| 234 | 
         
            +
                  "mean_token_accuracy": 0.9814109367132187,
         
     | 
| 235 | 
         
            +
                  "num_tokens": 3227452.0,
         
     | 
| 236 | 
         
             
                  "step": 600
         
     | 
| 237 | 
         
             
                },
         
     | 
| 238 | 
         
             
                {
         
     | 
| 239 | 
         
             
                  "epoch": 1.5069402534701268,
         
     | 
| 240 | 
         
            +
                  "grad_norm": 0.28632357716560364,
         
     | 
| 241 | 
         
            +
                  "learning_rate": 0.00024603884600210097,
         
     | 
| 242 | 
         
            +
                  "loss": 0.0957,
         
     | 
| 243 | 
         
            +
                  "mean_token_accuracy": 0.9748889011144638,
         
     | 
| 244 | 
         
            +
                  "num_tokens": 3361210.0,
         
     | 
| 245 | 
         
             
                  "step": 625
         
     | 
| 246 | 
         
             
                },
         
     | 
| 247 | 
         
             
                {
         
     | 
| 248 | 
         
             
                  "epoch": 1.567290283645142,
         
     | 
| 249 | 
         
            +
                  "grad_norm": 0.25492990016937256,
         
     | 
| 250 | 
         
            +
                  "learning_rate": 0.00024146898249695974,
         
     | 
| 251 | 
         
            +
                  "loss": 0.075,
         
     | 
| 252 | 
         
            +
                  "mean_token_accuracy": 0.9806595808267593,
         
     | 
| 253 | 
         
            +
                  "num_tokens": 3497177.0,
         
     | 
| 254 | 
         
             
                  "step": 650
         
     | 
| 255 | 
         
             
                },
         
     | 
| 256 | 
         
             
                {
         
     | 
| 257 | 
         
             
                  "epoch": 1.627640313820157,
         
     | 
| 258 | 
         
            +
                  "grad_norm": 0.37043872475624084,
         
     | 
| 259 | 
         
            +
                  "learning_rate": 0.00023675975758889506,
         
     | 
| 260 | 
         
            +
                  "loss": 0.0918,
         
     | 
| 261 | 
         
            +
                  "mean_token_accuracy": 0.9762868732213974,
         
     | 
| 262 | 
         
            +
                  "num_tokens": 3630834.0,
         
     | 
| 263 | 
         
             
                  "step": 675
         
     | 
| 264 | 
         
             
                },
         
     | 
| 265 | 
         
             
                {
         
     | 
| 266 | 
         
             
                  "epoch": 1.687990343995172,
         
     | 
| 267 | 
         
            +
                  "grad_norm": 0.26372411847114563,
         
     | 
| 268 | 
         
            +
                  "learning_rate": 0.00023191834621493968,
         
     | 
| 269 | 
         
            +
                  "loss": 0.0674,
         
     | 
| 270 | 
         
            +
                  "mean_token_accuracy": 0.9826526433229447,
         
     | 
| 271 | 
         
            +
                  "num_tokens": 3766598.0,
         
     | 
| 272 | 
         
             
                  "step": 700
         
     | 
| 273 | 
         
             
                },
         
     | 
| 274 | 
         
             
                {
         
     | 
| 275 | 
         
             
                  "epoch": 1.748340374170187,
         
     | 
| 276 | 
         
            +
                  "grad_norm": 0.2400335669517517,
         
     | 
| 277 | 
         
            +
                  "learning_rate": 0.00022695212471035816,
         
     | 
| 278 | 
         
            +
                  "loss": 0.0807,
         
     | 
| 279 | 
         
            +
                  "mean_token_accuracy": 0.9793906199932099,
         
     | 
| 280 | 
         
            +
                  "num_tokens": 3899644.0,
         
     | 
| 281 | 
         
             
                  "step": 725
         
     | 
| 282 | 
         
             
                },
         
     | 
| 283 | 
         
             
                {
         
     | 
| 284 | 
         
             
                  "epoch": 1.8086904043452021,
         
     | 
| 285 | 
         
            +
                  "grad_norm": 0.19833268225193024,
         
     | 
| 286 | 
         
            +
                  "learning_rate": 0.0002218686595701219,
         
     | 
| 287 | 
         
            +
                  "loss": 0.0655,
         
     | 
| 288 | 
         
            +
                  "mean_token_accuracy": 0.9832920217514038,
         
     | 
| 289 | 
         
            +
                  "num_tokens": 4036037.0,
         
     | 
| 290 | 
         
             
                  "step": 750
         
     | 
| 291 | 
         
             
                },
         
     | 
| 292 | 
         
             
                {
         
     | 
| 293 | 
         
             
                  "epoch": 1.8690404345202172,
         
     | 
| 294 | 
         
            +
                  "grad_norm": 0.17969554662704468,
         
     | 
| 295 | 
         
            +
                  "learning_rate": 0.0002166756959206587,
         
     | 
| 296 | 
         
            +
                  "loss": 0.0831,
         
     | 
| 297 | 
         
            +
                  "mean_token_accuracy": 0.9791438663005829,
         
     | 
| 298 | 
         
            +
                  "num_tokens": 4168035.0,
         
     | 
| 299 | 
         
             
                  "step": 775
         
     | 
| 300 | 
         
             
                },
         
     | 
| 301 | 
         
             
                {
         
     | 
| 302 | 
         
             
                  "epoch": 1.9293904646952322,
         
     | 
| 303 | 
         
            +
                  "grad_norm": 0.3069966733455658,
         
     | 
| 304 | 
         
            +
                  "learning_rate": 0.00021138114571944054,
         
     | 
| 305 | 
         
            +
                  "loss": 0.0624,
         
     | 
| 306 | 
         
            +
                  "mean_token_accuracy": 0.9839604765176773,
         
     | 
| 307 | 
         
            +
                  "num_tokens": 4302324.0,
         
     | 
| 308 | 
         
             
                  "step": 800
         
     | 
| 309 | 
         
             
                },
         
     | 
| 310 | 
         
             
                {
         
     | 
| 311 | 
         
             
                  "epoch": 1.9897404948702473,
         
     | 
| 312 | 
         
            +
                  "grad_norm": 0.26080530881881714,
         
     | 
| 313 | 
         
            +
                  "learning_rate": 0.000205993075700389,
         
     | 
| 314 | 
         
            +
                  "loss": 0.0728,
         
     | 
| 315 | 
         
            +
                  "mean_token_accuracy": 0.9816776049137116,
         
     | 
| 316 | 
         
            +
                  "num_tokens": 4428521.0,
         
     | 
| 317 | 
         
             
                  "step": 825
         
     | 
| 318 | 
         
             
                },
         
     | 
| 319 | 
         
             
                {
         
     | 
| 320 | 
         
             
                  "epoch": 2.0,
         
     | 
| 321 | 
         
            +
                  "eval_loss": 0.07739538699388504,
         
     | 
| 322 | 
         
            +
                  "eval_mean_token_accuracy": 0.9806474750106399,
         
     | 
| 323 | 
         
             
                  "eval_num_tokens": 4447026.0,
         
     | 
| 324 | 
         
            +
                  "eval_runtime": 60.6735,
         
     | 
| 325 | 
         
            +
                  "eval_samples_per_second": 6.082,
         
     | 
| 326 | 
         
            +
                  "eval_steps_per_second": 3.049,
         
     | 
| 327 | 
         
             
                  "step": 830
         
     | 
| 328 | 
         
             
                },
         
     | 
| 329 | 
         
             
                {
         
     | 
| 330 | 
         
             
                  "epoch": 2.048280024140012,
         
     | 
| 331 | 
         
            +
                  "grad_norm": 0.32912909984588623,
         
     | 
| 332 | 
         
            +
                  "learning_rate": 0.00020051969508346498,
         
     | 
| 333 | 
         
            +
                  "loss": 0.0624,
         
     | 
| 334 | 
         
            +
                  "mean_token_accuracy": 0.98369190680612,
         
     | 
| 335 | 
         
            +
                  "num_tokens": 4571335.0,
         
     | 
| 336 | 
         
             
                  "step": 850
         
     | 
| 337 | 
         
             
                },
         
     | 
| 338 | 
         
             
                {
         
     | 
| 339 | 
         
             
                  "epoch": 2.1086300543150274,
         
     | 
| 340 | 
         
            +
                  "grad_norm": 0.22884123027324677,
         
     | 
| 341 | 
         
            +
                  "learning_rate": 0.00019496934306716706,
         
     | 
| 342 | 
         
            +
                  "loss": 0.0543,
         
     | 
| 343 | 
         
            +
                  "mean_token_accuracy": 0.9862597143650055,
         
     | 
| 344 | 
         
            +
                  "num_tokens": 4694373.0,
         
     | 
| 345 | 
         
             
                  "step": 875
         
     | 
| 346 | 
         
             
                },
         
     | 
| 347 | 
         
             
                {
         
     | 
| 348 | 
         
             
                  "epoch": 2.1689800844900424,
         
     | 
| 349 | 
         
            +
                  "grad_norm": 0.15646718442440033,
         
     | 
| 350 | 
         
            +
                  "learning_rate": 0.00018935047612299625,
         
     | 
| 351 | 
         
            +
                  "loss": 0.0683,
         
     | 
| 352 | 
         
            +
                  "mean_token_accuracy": 0.9817469125986099,
         
     | 
| 353 | 
         
            +
                  "num_tokens": 4840032.0,
         
     | 
| 354 | 
         
             
                  "step": 900
         
     | 
| 355 | 
         
             
                },
         
     | 
| 356 | 
         
             
                {
         
     | 
| 357 | 
         
             
                  "epoch": 2.2293301146650575,
         
     | 
| 358 | 
         
            +
                  "grad_norm": 0.32684165239334106,
         
     | 
| 359 | 
         
            +
                  "learning_rate": 0.00018367165511124414,
         
     | 
| 360 | 
         
            +
                  "loss": 0.0558,
         
     | 
| 361 | 
         
            +
                  "mean_token_accuracy": 0.9862085193395614,
         
     | 
| 362 | 
         
            +
                  "num_tokens": 4962900.0,
         
     | 
| 363 | 
         
             
                  "step": 925
         
     | 
| 364 | 
         
             
                },
         
     | 
| 365 | 
         
             
                {
         
     | 
| 366 | 
         
             
                  "epoch": 2.2896801448400725,
         
     | 
| 367 | 
         
            +
                  "grad_norm": 0.15353620052337646,
         
     | 
| 368 | 
         
            +
                  "learning_rate": 0.00017794153223773558,
         
     | 
| 369 | 
         
            +
                  "loss": 0.0649,
         
     | 
| 370 | 
         
            +
                  "mean_token_accuracy": 0.9830775827169418,
         
     | 
| 371 | 
         
            +
                  "num_tokens": 5107775.0,
         
     | 
| 372 | 
         
             
                  "step": 950
         
     | 
| 373 | 
         
             
                },
         
     | 
| 374 | 
         
             
                {
         
     | 
| 375 | 
         
             
                  "epoch": 2.3500301750150876,
         
     | 
| 376 | 
         
            +
                  "grad_norm": 0.13864906132221222,
         
     | 
| 377 | 
         
            +
                  "learning_rate": 0.00017216883787139772,
         
     | 
| 378 | 
         
            +
                  "loss": 0.0513,
         
     | 
| 379 | 
         
            +
                  "mean_token_accuracy": 0.9871918082237243,
         
     | 
| 380 | 
         
            +
                  "num_tokens": 5231159.0,
         
     | 
| 381 | 
         
             
                  "step": 975
         
     | 
| 382 | 
         
             
                },
         
     | 
| 383 | 
         
             
                {
         
     | 
| 384 | 
         
             
                  "epoch": 2.4103802051901027,
         
     | 
| 385 | 
         
            +
                  "grad_norm": 0.18856066465377808,
         
     | 
| 386 | 
         
            +
                  "learning_rate": 0.00016636236724274,
         
     | 
| 387 | 
         
            +
                  "loss": 0.0653,
         
     | 
| 388 | 
         
            +
                  "mean_token_accuracy": 0.9824860644340515,
         
     | 
| 389 | 
         
            +
                  "num_tokens": 5375658.0,
         
     | 
| 390 | 
         
             
                  "step": 1000
         
     | 
| 391 | 
         
             
                },
         
     | 
| 392 | 
         
             
                {
         
     | 
| 393 | 
         
             
                  "epoch": 2.4707302353651177,
         
     | 
| 394 | 
         
            +
                  "grad_norm": 0.1747666597366333,
         
     | 
| 395 | 
         
            +
                  "learning_rate": 0.00016053096704351255,
         
     | 
| 396 | 
         
            +
                  "loss": 0.0536,
         
     | 
| 397 | 
         
            +
                  "mean_token_accuracy": 0.9870379114151001,
         
     | 
| 398 | 
         
            +
                  "num_tokens": 5498792.0,
         
     | 
| 399 | 
         
             
                  "step": 1025
         
     | 
| 400 | 
         
             
                },
         
     | 
| 401 | 
         
             
                {
         
     | 
| 402 | 
         
             
                  "epoch": 2.5310802655401328,
         
     | 
| 403 | 
         
            +
                  "grad_norm": 0.08616527169942856,
         
     | 
| 404 | 
         
            +
                  "learning_rate": 0.00015468352194795791,
         
     | 
| 405 | 
         
            +
                  "loss": 0.0605,
         
     | 
| 406 | 
         
            +
                  "mean_token_accuracy": 0.9837486296892166,
         
     | 
| 407 | 
         
            +
                  "num_tokens": 5644155.0,
         
     | 
| 408 | 
         
             
                  "step": 1050
         
     | 
| 409 | 
         
             
                },
         
     | 
| 410 | 
         
             
                {
         
     | 
| 411 | 
         
             
                  "epoch": 2.591430295715148,
         
     | 
| 412 | 
         
            +
                  "grad_norm": 0.21047131717205048,
         
     | 
| 413 | 
         
            +
                  "learning_rate": 0.00014882894107619277,
         
     | 
| 414 | 
         
            +
                  "loss": 0.0502,
         
     | 
| 415 | 
         
            +
                  "mean_token_accuracy": 0.9874639976024627,
         
     | 
| 416 | 
         
            +
                  "num_tokens": 5768255.0,
         
     | 
| 417 | 
         
             
                  "step": 1075
         
     | 
| 418 | 
         
             
                },
         
     | 
| 419 | 
         
             
                {
         
     | 
| 420 | 
         
             
                  "epoch": 2.651780325890163,
         
     | 
| 421 | 
         
            +
                  "grad_norm": 0.09520892798900604,
         
     | 
| 422 | 
         
            +
                  "learning_rate": 0.00014297614442034518,
         
     | 
| 423 | 
         
            +
                  "loss": 0.0568,
         
     | 
| 424 | 
         
            +
                  "mean_token_accuracy": 0.9851021945476532,
         
     | 
| 425 | 
         
            +
                  "num_tokens": 5913228.0,
         
     | 
| 426 | 
         
             
                  "step": 1100
         
     | 
| 427 | 
         
             
                },
         
     | 
| 428 | 
         
             
                {
         
     | 
| 429 | 
         
             
                  "epoch": 2.712130356065178,
         
     | 
| 430 | 
         
            +
                  "grad_norm": 0.11644323915243149,
         
     | 
| 431 | 
         
            +
                  "learning_rate": 0.000137134049254126,
         
     | 
| 432 | 
         
            +
                  "loss": 0.0523,
         
     | 
| 433 | 
         
            +
                  "mean_token_accuracy": 0.9867914581298828,
         
     | 
| 434 | 
         
            +
                  "num_tokens": 6037285.0,
         
     | 
| 435 | 
         
             
                  "step": 1125
         
     | 
| 436 | 
         
             
                },
         
     | 
| 437 | 
         
             
                {
         
     | 
| 438 | 
         
             
                  "epoch": 2.772480386240193,
         
     | 
| 439 | 
         
            +
                  "grad_norm": 0.12872624397277832,
         
     | 
| 440 | 
         
            +
                  "learning_rate": 0.000131311556546543,
         
     | 
| 441 | 
         
            +
                  "loss": 0.0563,
         
     | 
| 442 | 
         
            +
                  "mean_token_accuracy": 0.9849929654598236,
         
     | 
| 443 | 
         
            +
                  "num_tokens": 6183361.0,
         
     | 
| 444 | 
         
             
                  "step": 1150
         
     | 
| 445 | 
         
             
                },
         
     | 
| 446 | 
         
             
                {
         
     | 
| 447 | 
         
             
                  "epoch": 2.832830416415208,
         
     | 
| 448 | 
         
            +
                  "grad_norm": 0.10195529460906982,
         
     | 
| 449 | 
         
            +
                  "learning_rate": 0.0001255175374004563,
         
     | 
| 450 | 
         
            +
                  "loss": 0.0501,
         
     | 
| 451 | 
         
            +
                  "mean_token_accuracy": 0.9871714848279953,
         
     | 
| 452 | 
         
            +
                  "num_tokens": 6305713.0,
         
     | 
| 453 | 
         
             
                  "step": 1175
         
     | 
| 454 | 
         
             
                },
         
     | 
| 455 | 
         
             
                {
         
     | 
| 456 | 
         
             
                  "epoch": 2.8931804465902236,
         
     | 
| 457 | 
         
            +
                  "grad_norm": 0.09452041983604431,
         
     | 
| 458 | 
         
            +
                  "learning_rate": 0.0001197608195366377,
         
     | 
| 459 | 
         
            +
                  "loss": 0.0581,
         
     | 
| 460 | 
         
            +
                  "mean_token_accuracy": 0.9840293884277344,
         
     | 
| 461 | 
         
            +
                  "num_tokens": 6451719.0,
         
     | 
| 462 | 
         
             
                  "step": 1200
         
     | 
| 463 | 
         
             
                },
         
     | 
| 464 | 
         
             
                {
         
     | 
| 465 | 
         
             
                  "epoch": 2.9535304767652386,
         
     | 
| 466 | 
         
            +
                  "grad_norm": 0.17165224254131317,
         
     | 
| 467 | 
         
            +
                  "learning_rate": 0.00011405017384392655,
         
     | 
| 468 | 
         
            +
                  "loss": 0.049,
         
     | 
| 469 | 
         
            +
                  "mean_token_accuracy": 0.9875269651412963,
         
     | 
| 470 | 
         
            +
                  "num_tokens": 6575211.0,
         
     | 
| 471 | 
         
             
                  "step": 1225
         
     | 
| 472 | 
         
             
                },
         
     | 
| 473 | 
         
             
                {
         
     | 
| 474 | 
         
             
                  "epoch": 3.0,
         
     | 
| 475 | 
         
            +
                  "eval_loss": 0.06446010619401932,
         
     | 
| 476 | 
         
            +
                  "eval_mean_token_accuracy": 0.9841987928828677,
         
     | 
| 477 | 
         
             
                  "eval_num_tokens": 6670539.0,
         
     | 
| 478 | 
         
            +
                  "eval_runtime": 60.4296,
         
     | 
| 479 | 
         
            +
                  "eval_samples_per_second": 6.106,
         
     | 
| 480 | 
         
            +
                  "eval_steps_per_second": 3.061,
         
     | 
| 481 | 
         
             
                  "step": 1245
         
     | 
| 482 | 
         
             
                },
         
     | 
| 483 | 
         
             
                {
         
     | 
| 484 | 
         
             
                  "epoch": 3.012070006035003,
         
     | 
| 485 | 
         
            +
                  "grad_norm": 0.08178732544183731,
         
     | 
| 486 | 
         
            +
                  "learning_rate": 0.00010839430101597464,
         
     | 
| 487 | 
         
            +
                  "loss": 0.0535,
         
     | 
| 488 | 
         
            +
                  "mean_token_accuracy": 0.9864560107594913,
         
     | 
| 489 | 
         
            +
                  "num_tokens": 6706527.0,
         
     | 
| 490 | 
         
             
                  "step": 1250
         
     | 
| 491 | 
         
             
                },
         
     | 
| 492 | 
         
             
                {
         
     | 
| 493 | 
         
             
                  "epoch": 3.0724200362100182,
         
     | 
| 494 | 
         
            +
                  "grad_norm": 0.0654640942811966,
         
     | 
| 495 | 
         
            +
                  "learning_rate": 0.00010280181829493925,
         
     | 
| 496 | 
         
            +
                  "loss": 0.042,
         
     | 
| 497 | 
         
            +
                  "mean_token_accuracy": 0.9891558569669724,
         
     | 
| 498 | 
         
            +
                  "num_tokens": 6845866.0,
         
     | 
| 499 | 
         
             
                  "step": 1275
         
     | 
| 500 | 
         
             
                },
         
     | 
| 501 | 
         
             
                {
         
     | 
| 502 | 
         
             
                  "epoch": 3.1327700663850333,
         
     | 
| 503 | 
         
            +
                  "grad_norm": 0.13900737464427948,
         
     | 
| 504 | 
         
            +
                  "learning_rate": 9.728124634232282e-05,
         
     | 
| 505 | 
         
            +
                  "loss": 0.0496,
         
     | 
| 506 | 
         
            +
                  "mean_token_accuracy": 0.9874085110425949,
         
     | 
| 507 | 
         
            +
                  "num_tokens": 6972947.0,
         
     | 
| 508 | 
         
             
                  "step": 1300
         
     | 
| 509 | 
         
             
                },
         
     | 
| 510 | 
         
             
                {
         
     | 
| 511 | 
         
             
                  "epoch": 3.1931200965600484,
         
     | 
| 512 | 
         
            +
                  "grad_norm": 0.05376769229769707,
         
     | 
| 513 | 
         
            +
                  "learning_rate": 9.184099625696183e-05,
         
     | 
| 514 | 
         
            +
                  "loss": 0.0415,
         
     | 
| 515 | 
         
            +
                  "mean_token_accuracy": 0.9890899294614792,
         
     | 
| 516 | 
         
            +
                  "num_tokens": 7115975.0,
         
     | 
| 517 | 
         
             
                  "step": 1325
         
     | 
| 518 | 
         
             
                },
         
     | 
| 519 | 
         
             
                {
         
     | 
| 520 | 
         
             
                  "epoch": 3.2534701267350634,
         
     | 
| 521 | 
         
            +
                  "grad_norm": 0.12443197518587112,
         
     | 
| 522 | 
         
            +
                  "learning_rate": 8.648935675994459e-05,
         
     | 
| 523 | 
         
            +
                  "loss": 0.0484,
         
     | 
| 524 | 
         
            +
                  "mean_token_accuracy": 0.987565501332283,
         
     | 
| 525 | 
         
            +
                  "num_tokens": 7243324.0,
         
     | 
| 526 | 
         
             
                  "step": 1350
         
     | 
| 527 | 
         
             
                },
         
     | 
| 528 | 
         
             
                {
         
     | 
| 529 | 
         
             
                  "epoch": 3.3138201569100785,
         
     | 
| 530 | 
         
            +
                  "grad_norm": 0.0667525976896286,
         
     | 
| 531 | 
         
            +
                  "learning_rate": 8.123448156598283e-05,
         
     | 
| 532 | 
         
            +
                  "loss": 0.0415,
         
     | 
| 533 | 
         
            +
                  "mean_token_accuracy": 0.9890210199356079,
         
     | 
| 534 | 
         
            +
                  "num_tokens": 7385182.0,
         
     | 
| 535 | 
         
             
                  "step": 1375
         
     | 
| 536 | 
         
             
                },
         
     | 
| 537 | 
         
             
                {
         
     | 
| 538 | 
         
             
                  "epoch": 3.3741701870850935,
         
     | 
| 539 | 
         
            +
                  "grad_norm": 0.12773087620735168,
         
     | 
| 540 | 
         
            +
                  "learning_rate": 7.608437696047756e-05,
         
     | 
| 541 | 
         
            +
                  "loss": 0.0487,
         
     | 
| 542 | 
         
            +
                  "mean_token_accuracy": 0.9873174405097962,
         
     | 
| 543 | 
         
            +
                  "num_tokens": 7509648.0,
         
     | 
| 544 | 
         
             
                  "step": 1400
         
     | 
| 545 | 
         
             
                },
         
     | 
| 546 | 
         
             
                {
         
     | 
| 547 | 
         
             
                  "epoch": 3.4345202172601086,
         
     | 
| 548 | 
         
            +
                  "grad_norm": 0.07510969042778015,
         
     | 
| 549 | 
         
            +
                  "learning_rate": 7.104688960120769e-05,
         
     | 
| 550 | 
         
            +
                  "loss": 0.0403,
         
     | 
| 551 | 
         
            +
                  "mean_token_accuracy": 0.989400810599327,
         
     | 
| 552 | 
         
            +
                  "num_tokens": 7650532.0,
         
     | 
| 553 | 
         
             
                  "step": 1425
         
     | 
| 554 | 
         
             
                },
         
     | 
| 555 | 
         
             
                {
         
     | 
| 556 | 
         
             
                  "epoch": 3.4948702474351236,
         
     | 
| 557 | 
         
            +
                  "grad_norm": 0.24315868318080902,
         
     | 
| 558 | 
         
            +
                  "learning_rate": 6.612969456322507e-05,
         
     | 
| 559 | 
         
            +
                  "loss": 0.0493,
         
     | 
| 560 | 
         
            +
                  "mean_token_accuracy": 0.987003293633461,
         
     | 
| 561 | 
         
            +
                  "num_tokens": 7779847.0,
         
     | 
| 562 | 
         
             
                  "step": 1450
         
     | 
| 563 | 
         
             
                },
         
     | 
| 564 | 
         
             
                {
         
     | 
| 565 | 
         
             
                  "epoch": 3.5552202776101387,
         
     | 
| 566 | 
         
            +
                  "grad_norm": 0.0974864810705185,
         
     | 
| 567 | 
         
            +
                  "learning_rate": 6.134028364517273e-05,
         
     | 
| 568 | 
         
            +
                  "loss": 0.0405,
         
     | 
| 569 | 
         
            +
                  "mean_token_accuracy": 0.9892659622430802,
         
     | 
| 570 | 
         
            +
                  "num_tokens": 7922087.0,
         
     | 
| 571 | 
         
             
                  "step": 1475
         
     | 
| 572 | 
         
             
                },
         
     | 
| 573 | 
         
             
                {
         
     | 
| 574 | 
         
             
                  "epoch": 3.6155703077851538,
         
     | 
| 575 | 
         
            +
                  "grad_norm": 0.112852543592453,
         
     | 
| 576 | 
         
            +
                  "learning_rate": 5.6685953954840553e-05,
         
     | 
| 577 | 
         
            +
                  "loss": 0.0476,
         
     | 
| 578 | 
         
            +
                  "mean_token_accuracy": 0.9879545611143112,
         
     | 
| 579 | 
         
            +
                  "num_tokens": 8049661.0,
         
     | 
| 580 | 
         
             
                  "step": 1500
         
     | 
| 581 | 
         
             
                },
         
     | 
| 582 | 
         
             
                {
         
     | 
| 583 | 
         
             
                  "epoch": 3.675920337960169,
         
     | 
| 584 | 
         
            +
                  "grad_norm": 0.09587077796459198,
         
     | 
| 585 | 
         
            +
                  "learning_rate": 5.2173796791351116e-05,
         
     | 
| 586 | 
         
            +
                  "loss": 0.0399,
         
     | 
| 587 | 
         
            +
                  "mean_token_accuracy": 0.9899050652980804,
         
     | 
| 588 | 
         
            +
                  "num_tokens": 8191357.0,
         
     | 
| 589 | 
         
             
                  "step": 1525
         
     | 
| 590 | 
         
             
                },
         
     | 
| 591 | 
         
             
                {
         
     | 
| 592 | 
         
             
                  "epoch": 3.736270368135184,
         
     | 
| 593 | 
         
            +
                  "grad_norm": 0.15348604321479797,
         
     | 
| 594 | 
         
            +
                  "learning_rate": 4.781068684091327e-05,
         
     | 
| 595 | 
         
            +
                  "loss": 0.047,
         
     | 
| 596 | 
         
            +
                  "mean_token_accuracy": 0.9878348118066788,
         
     | 
| 597 | 
         
            +
                  "num_tokens": 8317709.0,
         
     | 
| 598 | 
         
             
                  "step": 1550
         
     | 
| 599 | 
         
             
                },
         
     | 
| 600 | 
         
             
                {
         
     | 
| 601 | 
         
             
                  "epoch": 3.796620398310199,
         
     | 
| 602 | 
         
            +
                  "grad_norm": 0.10841736942529678,
         
     | 
| 603 | 
         
            +
                  "learning_rate": 4.360327170260604e-05,
         
     | 
| 604 | 
         
            +
                  "loss": 0.0398,
         
     | 
| 605 | 
         
            +
                  "mean_token_accuracy": 0.9894819515943527,
         
     | 
| 606 | 
         
            +
                  "num_tokens": 8460448.0,
         
     | 
| 607 | 
         
             
                  "step": 1575
         
     | 
| 608 | 
         
             
                },
         
     | 
| 609 | 
         
             
                {
         
     | 
| 610 | 
         
             
                  "epoch": 3.856970428485214,
         
     | 
| 611 | 
         
            +
                  "grad_norm": 0.10409346967935562,
         
     | 
| 612 | 
         
            +
                  "learning_rate": 3.955796176015015e-05,
         
     | 
| 613 | 
         
            +
                  "loss": 0.0467,
         
     | 
| 614 | 
         
            +
                  "mean_token_accuracy": 0.9879930222034454,
         
     | 
| 615 | 
         
            +
                  "num_tokens": 8587426.0,
         
     | 
| 616 | 
         
             
                  "step": 1600
         
     | 
| 617 | 
         
             
                },
         
     | 
| 618 | 
         
             
                {
         
     | 
| 619 | 
         
             
                  "epoch": 3.9173204586602295,
         
     | 
| 620 | 
         
            +
                  "grad_norm": 0.08520140498876572,
         
     | 
| 621 | 
         
            +
                  "learning_rate": 3.5680920415099366e-05,
         
     | 
| 622 | 
         
            +
                  "loss": 0.0406,
         
     | 
| 623 | 
         
            +
                  "mean_token_accuracy": 0.9894054895639419,
         
     | 
| 624 | 
         
            +
                  "num_tokens": 8728471.0,
         
     | 
| 625 | 
         
             
                  "step": 1625
         
     | 
| 626 | 
         
             
                },
         
     | 
| 627 | 
         
             
                {
         
     | 
| 628 | 
         
             
                  "epoch": 3.9776704888352445,
         
     | 
| 629 | 
         
            +
                  "grad_norm": 0.10589835047721863,
         
     | 
| 630 | 
         
            +
                  "learning_rate": 3.197805469633152e-05,
         
     | 
| 631 | 
         
            +
                  "loss": 0.0458,
         
     | 
| 632 | 
         
            +
                  "mean_token_accuracy": 0.9883326524496079,
         
     | 
| 633 | 
         
            +
                  "num_tokens": 8850332.0,
         
     | 
| 634 | 
         
             
                  "step": 1650
         
     | 
| 635 | 
         
             
                },
         
     | 
| 636 | 
         
             
                {
         
     | 
| 637 | 
         
             
                  "epoch": 4.0,
         
     | 
| 638 | 
         
            +
                  "eval_loss": 0.06247411295771599,
         
     | 
| 639 | 
         
            +
                  "eval_mean_token_accuracy": 0.9854503702472996,
         
     | 
| 640 | 
         
             
                  "eval_num_tokens": 8894052.0,
         
     | 
| 641 | 
         
            +
                  "eval_runtime": 60.5535,
         
     | 
| 642 | 
         
            +
                  "eval_samples_per_second": 6.094,
         
     | 
| 643 | 
         
            +
                  "eval_steps_per_second": 3.055,
         
     | 
| 644 | 
         
             
                  "step": 1660
         
     | 
| 645 | 
         
             
                }
         
     | 
| 646 | 
         
             
              ],
         
     | 
| 647 | 
         
             
              "logging_steps": 25,
         
     | 
| 648 | 
         
            +
              "max_steps": 2075,
         
     | 
| 649 | 
         
             
              "num_input_tokens_seen": 0,
         
     | 
| 650 | 
         
            +
              "num_train_epochs": 5,
         
     | 
| 651 | 
         
             
              "save_steps": 500,
         
     | 
| 652 | 
         
             
              "stateful_callbacks": {
         
     | 
| 653 | 
         
             
                "TrainerControl": {
         
     | 
| 
         | 
|
| 661 | 
         
             
                  "attributes": {}
         
     | 
| 662 | 
         
             
                }
         
     | 
| 663 | 
         
             
              },
         
     | 
| 664 | 
         
            +
              "total_flos": 3.8465425707075994e+17,
         
     | 
| 665 | 
         
             
              "train_batch_size": 2,
         
     | 
| 666 | 
         
             
              "trial_name": null,
         
     | 
| 667 | 
         
             
              "trial_params": null
         
     | 
    	
        checkpoint-1660/training_args.bin
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 6033
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:51ece4ed6b1462de05ca804e04b783f884883c31cae5c545b5f19f6192d34a62
         
     | 
| 3 | 
         
             
            size 6033
         
     | 
    	
        checkpoint-2075/adapter_config.json
    CHANGED
    
    | 
         @@ -25,12 +25,12 @@ 
     | 
|
| 25 | 
         
             
              "revision": null,
         
     | 
| 26 | 
         
             
              "target_modules": [
         
     | 
| 27 | 
         
             
                "gate_proj",
         
     | 
| 28 | 
         
            -
                " 
     | 
| 
         | 
|
| 29 | 
         
             
                "down_proj",
         
     | 
| 30 | 
         
             
                "o_proj",
         
     | 
| 31 | 
         
            -
                " 
     | 
| 32 | 
         
            -
                "up_proj" 
     | 
| 33 | 
         
            -
                "q_proj"
         
     | 
| 34 | 
         
             
              ],
         
     | 
| 35 | 
         
             
              "task_type": "CAUSAL_LM",
         
     | 
| 36 | 
         
             
              "trainable_token_indices": null,
         
     | 
| 
         | 
|
| 25 | 
         
             
              "revision": null,
         
     | 
| 26 | 
         
             
              "target_modules": [
         
     | 
| 27 | 
         
             
                "gate_proj",
         
     | 
| 28 | 
         
            +
                "v_proj",
         
     | 
| 29 | 
         
            +
                "q_proj",
         
     | 
| 30 | 
         
             
                "down_proj",
         
     | 
| 31 | 
         
             
                "o_proj",
         
     | 
| 32 | 
         
            +
                "k_proj",
         
     | 
| 33 | 
         
            +
                "up_proj"
         
     | 
| 
         | 
|
| 34 | 
         
             
              ],
         
     | 
| 35 | 
         
             
              "task_type": "CAUSAL_LM",
         
     | 
| 36 | 
         
             
              "trainable_token_indices": null,
         
     | 
    	
        checkpoint-2075/adapter_model.safetensors
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 335604696
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:9f5defc89229b64935bf3a4cdd33bed60f970fb87012f2182df603c88c1df0f6
         
     | 
| 3 | 
         
             
            size 335604696
         
     | 
    	
        checkpoint-2075/optimizer.pt
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 671365003
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:4788991da9dc64100ff7a138b64cd2735fb818098ab3f26ce615ca8384ed5ce0
         
     | 
| 3 | 
         
             
            size 671365003
         
     | 
    	
        checkpoint-2075/rng_state.pth
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 14645
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:a84f518cefb4f56ec5c311005d8468b4b70f531937c727c948a9343bb611eb36
         
     | 
| 3 | 
         
             
            size 14645
         
     | 
    	
        checkpoint-2075/scheduler.pt
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 1465
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:9800b41968ee3ebe37d1b494ee7ae782a6ac9a2a1c60981b79b83606f190f395
         
     | 
| 3 | 
         
             
            size 1465
         
     | 
    	
        checkpoint-2075/trainer_state.json
    CHANGED
    
    | 
         @@ -11,806 +11,806 @@ 
     | 
|
| 11 | 
         
             
              "log_history": [
         
     | 
| 12 | 
         
             
                {
         
     | 
| 13 | 
         
             
                  "epoch": 0.060350030175015085,
         
     | 
| 14 | 
         
            -
                  "grad_norm": 0. 
     | 
| 15 | 
         
            -
                  "learning_rate":  
     | 
| 16 | 
         
            -
                  "loss": 1. 
     | 
| 17 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 18 | 
         
            -
                  "num_tokens":  
     | 
| 19 | 
         
             
                  "step": 25
         
     | 
| 20 | 
         
             
                },
         
     | 
| 21 | 
         
             
                {
         
     | 
| 22 | 
         
             
                  "epoch": 0.12070006035003017,
         
     | 
| 23 | 
         
            -
                  "grad_norm": 0. 
     | 
| 24 | 
         
            -
                  "learning_rate": 0. 
     | 
| 25 | 
         
            -
                  "loss": 0. 
     | 
| 26 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 27 | 
         
            -
                  "num_tokens":  
     | 
| 28 | 
         
             
                  "step": 50
         
     | 
| 29 | 
         
             
                },
         
     | 
| 30 | 
         
             
                {
         
     | 
| 31 | 
         
             
                  "epoch": 0.18105009052504525,
         
     | 
| 32 | 
         
            -
                  "grad_norm": 0. 
     | 
| 33 | 
         
            -
                  "learning_rate": 0. 
     | 
| 34 | 
         
            -
                  "loss": 0. 
     | 
| 35 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 36 | 
         
            -
                  "num_tokens":  
     | 
| 37 | 
         
             
                  "step": 75
         
     | 
| 38 | 
         
             
                },
         
     | 
| 39 | 
         
             
                {
         
     | 
| 40 | 
         
             
                  "epoch": 0.24140012070006034,
         
     | 
| 41 | 
         
            -
                  "grad_norm": 0. 
     | 
| 42 | 
         
            -
                  "learning_rate": 0. 
     | 
| 43 | 
         
            -
                  "loss": 0. 
     | 
| 44 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 45 | 
         
            -
                  "num_tokens":  
     | 
| 46 | 
         
             
                  "step": 100
         
     | 
| 47 | 
         
             
                },
         
     | 
| 48 | 
         
             
                {
         
     | 
| 49 | 
         
             
                  "epoch": 0.30175015087507545,
         
     | 
| 50 | 
         
            -
                  "grad_norm": 0. 
     | 
| 51 | 
         
            -
                  "learning_rate": 0. 
     | 
| 52 | 
         
            -
                  "loss": 0. 
     | 
| 53 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 54 | 
         
            -
                  "num_tokens":  
     | 
| 55 | 
         
             
                  "step": 125
         
     | 
| 56 | 
         
             
                },
         
     | 
| 57 | 
         
             
                {
         
     | 
| 58 | 
         
             
                  "epoch": 0.3621001810500905,
         
     | 
| 59 | 
         
            -
                  "grad_norm":  
     | 
| 60 | 
         
            -
                  "learning_rate": 0. 
     | 
| 61 | 
         
            -
                  "loss": 0. 
     | 
| 62 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 63 | 
         
            -
                  "num_tokens":  
     | 
| 64 | 
         
             
                  "step": 150
         
     | 
| 65 | 
         
             
                },
         
     | 
| 66 | 
         
             
                {
         
     | 
| 67 | 
         
             
                  "epoch": 0.4224502112251056,
         
     | 
| 68 | 
         
            -
                  "grad_norm": 0. 
     | 
| 69 | 
         
            -
                  "learning_rate": 0. 
     | 
| 70 | 
         
            -
                  "loss": 0. 
     | 
| 71 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 72 | 
         
            -
                  "num_tokens":  
     | 
| 73 | 
         
             
                  "step": 175
         
     | 
| 74 | 
         
             
                },
         
     | 
| 75 | 
         
             
                {
         
     | 
| 76 | 
         
             
                  "epoch": 0.4828002414001207,
         
     | 
| 77 | 
         
            -
                  "grad_norm": 0. 
     | 
| 78 | 
         
            -
                  "learning_rate": 0. 
     | 
| 79 | 
         
            -
                  "loss": 0. 
     | 
| 80 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 81 | 
         
            -
                  "num_tokens":  
     | 
| 82 | 
         
             
                  "step": 200
         
     | 
| 83 | 
         
             
                },
         
     | 
| 84 | 
         
             
                {
         
     | 
| 85 | 
         
             
                  "epoch": 0.5431502715751357,
         
     | 
| 86 | 
         
            -
                  "grad_norm": 0. 
     | 
| 87 | 
         
            -
                  "learning_rate": 0. 
     | 
| 88 | 
         
            -
                  "loss": 0. 
     | 
| 89 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 90 | 
         
            -
                  "num_tokens":  
     | 
| 91 | 
         
             
                  "step": 225
         
     | 
| 92 | 
         
             
                },
         
     | 
| 93 | 
         
             
                {
         
     | 
| 94 | 
         
             
                  "epoch": 0.6035003017501509,
         
     | 
| 95 | 
         
            -
                  "grad_norm": 0. 
     | 
| 96 | 
         
            -
                  "learning_rate": 0. 
     | 
| 97 | 
         
            -
                  "loss": 0. 
     | 
| 98 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 99 | 
         
            -
                  "num_tokens":  
     | 
| 100 | 
         
             
                  "step": 250
         
     | 
| 101 | 
         
             
                },
         
     | 
| 102 | 
         
             
                {
         
     | 
| 103 | 
         
             
                  "epoch": 0.663850331925166,
         
     | 
| 104 | 
         
            -
                  "grad_norm": 0. 
     | 
| 105 | 
         
            -
                  "learning_rate": 0. 
     | 
| 106 | 
         
            -
                  "loss": 0. 
     | 
| 107 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 108 | 
         
            -
                  "num_tokens":  
     | 
| 109 | 
         
             
                  "step": 275
         
     | 
| 110 | 
         
             
                },
         
     | 
| 111 | 
         
             
                {
         
     | 
| 112 | 
         
             
                  "epoch": 0.724200362100181,
         
     | 
| 113 | 
         
            -
                  "grad_norm": 0. 
     | 
| 114 | 
         
            -
                  "learning_rate": 0. 
     | 
| 115 | 
         
            -
                  "loss": 0. 
     | 
| 116 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 117 | 
         
            -
                  "num_tokens":  
     | 
| 118 | 
         
             
                  "step": 300
         
     | 
| 119 | 
         
             
                },
         
     | 
| 120 | 
         
             
                {
         
     | 
| 121 | 
         
             
                  "epoch": 0.7845503922751962,
         
     | 
| 122 | 
         
            -
                  "grad_norm": 0. 
     | 
| 123 | 
         
            -
                  "learning_rate": 0. 
     | 
| 124 | 
         
            -
                  "loss": 0. 
     | 
| 125 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 126 | 
         
            -
                  "num_tokens":  
     | 
| 127 | 
         
             
                  "step": 325
         
     | 
| 128 | 
         
             
                },
         
     | 
| 129 | 
         
             
                {
         
     | 
| 130 | 
         
             
                  "epoch": 0.8449004224502112,
         
     | 
| 131 | 
         
            -
                  "grad_norm": 0. 
     | 
| 132 | 
         
            -
                  "learning_rate": 0. 
     | 
| 133 | 
         
            -
                  "loss": 0. 
     | 
| 134 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 135 | 
         
            -
                  "num_tokens":  
     | 
| 136 | 
         
             
                  "step": 350
         
     | 
| 137 | 
         
             
                },
         
     | 
| 138 | 
         
             
                {
         
     | 
| 139 | 
         
             
                  "epoch": 0.9052504526252263,
         
     | 
| 140 | 
         
            -
                  "grad_norm": 0. 
     | 
| 141 | 
         
            -
                  "learning_rate": 0. 
     | 
| 142 | 
         
            -
                  "loss": 0. 
     | 
| 143 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 144 | 
         
            -
                  "num_tokens":  
     | 
| 145 | 
         
             
                  "step": 375
         
     | 
| 146 | 
         
             
                },
         
     | 
| 147 | 
         
             
                {
         
     | 
| 148 | 
         
             
                  "epoch": 0.9656004828002414,
         
     | 
| 149 | 
         
            -
                  "grad_norm": 0. 
     | 
| 150 | 
         
            -
                  "learning_rate": 0. 
     | 
| 151 | 
         
            -
                  "loss": 0. 
     | 
| 152 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 153 | 
         
            -
                  "num_tokens":  
     | 
| 154 | 
         
             
                  "step": 400
         
     | 
| 155 | 
         
             
                },
         
     | 
| 156 | 
         
             
                {
         
     | 
| 157 | 
         
             
                  "epoch": 1.0,
         
     | 
| 158 | 
         
            -
                  "eval_loss": 0. 
     | 
| 159 | 
         
            -
                  "eval_mean_token_accuracy": 0. 
     | 
| 160 | 
         
             
                  "eval_num_tokens": 2223513.0,
         
     | 
| 161 | 
         
            -
                  "eval_runtime": 60. 
     | 
| 162 | 
         
            -
                  "eval_samples_per_second": 6. 
     | 
| 163 | 
         
            -
                  "eval_steps_per_second": 3. 
     | 
| 164 | 
         
             
                  "step": 415
         
     | 
| 165 | 
         
             
                },
         
     | 
| 166 | 
         
             
                {
         
     | 
| 167 | 
         
             
                  "epoch": 1.024140012070006,
         
     | 
| 168 | 
         
            -
                  "grad_norm": 0. 
     | 
| 169 | 
         
            -
                  "learning_rate": 0. 
     | 
| 170 | 
         
            -
                  "loss": 0. 
     | 
| 171 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 172 | 
         
            -
                  "num_tokens":  
     | 
| 173 | 
         
             
                  "step": 425
         
     | 
| 174 | 
         
             
                },
         
     | 
| 175 | 
         
             
                {
         
     | 
| 176 | 
         
             
                  "epoch": 1.0844900422450212,
         
     | 
| 177 | 
         
            -
                  "grad_norm": 0. 
     | 
| 178 | 
         
            -
                  "learning_rate": 0. 
     | 
| 179 | 
         
            -
                  "loss": 0. 
     | 
| 180 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 181 | 
         
            -
                  "num_tokens":  
     | 
| 182 | 
         
             
                  "step": 450
         
     | 
| 183 | 
         
             
                },
         
     | 
| 184 | 
         
             
                {
         
     | 
| 185 | 
         
             
                  "epoch": 1.1448400724200363,
         
     | 
| 186 | 
         
            -
                  "grad_norm": 0. 
     | 
| 187 | 
         
            -
                  "learning_rate": 0. 
     | 
| 188 | 
         
            -
                  "loss": 0. 
     | 
| 189 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 190 | 
         
            -
                  "num_tokens":  
     | 
| 191 | 
         
             
                  "step": 475
         
     | 
| 192 | 
         
             
                },
         
     | 
| 193 | 
         
             
                {
         
     | 
| 194 | 
         
             
                  "epoch": 1.2051901025950513,
         
     | 
| 195 | 
         
            -
                  "grad_norm": 0. 
     | 
| 196 | 
         
            -
                  "learning_rate": 0. 
     | 
| 197 | 
         
            -
                  "loss": 0. 
     | 
| 198 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 199 | 
         
            -
                  "num_tokens":  
     | 
| 200 | 
         
             
                  "step": 500
         
     | 
| 201 | 
         
             
                },
         
     | 
| 202 | 
         
             
                {
         
     | 
| 203 | 
         
             
                  "epoch": 1.2655401327700664,
         
     | 
| 204 | 
         
            -
                  "grad_norm": 0. 
     | 
| 205 | 
         
            -
                  "learning_rate": 0. 
     | 
| 206 | 
         
            -
                  "loss": 0. 
     | 
| 207 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 208 | 
         
            -
                  "num_tokens":  
     | 
| 209 | 
         
             
                  "step": 525
         
     | 
| 210 | 
         
             
                },
         
     | 
| 211 | 
         
             
                {
         
     | 
| 212 | 
         
             
                  "epoch": 1.3258901629450814,
         
     | 
| 213 | 
         
            -
                  "grad_norm": 0. 
     | 
| 214 | 
         
            -
                  "learning_rate": 0. 
     | 
| 215 | 
         
            -
                  "loss": 0. 
     | 
| 216 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 217 | 
         
            -
                  "num_tokens":  
     | 
| 218 | 
         
             
                  "step": 550
         
     | 
| 219 | 
         
             
                },
         
     | 
| 220 | 
         
             
                {
         
     | 
| 221 | 
         
             
                  "epoch": 1.3862401931200965,
         
     | 
| 222 | 
         
            -
                  "grad_norm": 0. 
     | 
| 223 | 
         
            -
                  "learning_rate": 0. 
     | 
| 224 | 
         
            -
                  "loss": 0. 
     | 
| 225 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 226 | 
         
            -
                  "num_tokens":  
     | 
| 227 | 
         
             
                  "step": 575
         
     | 
| 228 | 
         
             
                },
         
     | 
| 229 | 
         
             
                {
         
     | 
| 230 | 
         
             
                  "epoch": 1.4465902232951118,
         
     | 
| 231 | 
         
            -
                  "grad_norm": 0. 
     | 
| 232 | 
         
            -
                  "learning_rate": 0. 
     | 
| 233 | 
         
            -
                  "loss": 0. 
     | 
| 234 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 235 | 
         
            -
                  "num_tokens":  
     | 
| 236 | 
         
             
                  "step": 600
         
     | 
| 237 | 
         
             
                },
         
     | 
| 238 | 
         
             
                {
         
     | 
| 239 | 
         
             
                  "epoch": 1.5069402534701268,
         
     | 
| 240 | 
         
            -
                  "grad_norm": 0. 
     | 
| 241 | 
         
            -
                  "learning_rate": 0. 
     | 
| 242 | 
         
            -
                  "loss": 0. 
     | 
| 243 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 244 | 
         
            -
                  "num_tokens":  
     | 
| 245 | 
         
             
                  "step": 625
         
     | 
| 246 | 
         
             
                },
         
     | 
| 247 | 
         
             
                {
         
     | 
| 248 | 
         
             
                  "epoch": 1.567290283645142,
         
     | 
| 249 | 
         
            -
                  "grad_norm": 0. 
     | 
| 250 | 
         
            -
                  "learning_rate": 0. 
     | 
| 251 | 
         
            -
                  "loss": 0. 
     | 
| 252 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 253 | 
         
            -
                  "num_tokens":  
     | 
| 254 | 
         
             
                  "step": 650
         
     | 
| 255 | 
         
             
                },
         
     | 
| 256 | 
         
             
                {
         
     | 
| 257 | 
         
             
                  "epoch": 1.627640313820157,
         
     | 
| 258 | 
         
            -
                  "grad_norm": 0. 
     | 
| 259 | 
         
            -
                  "learning_rate": 0. 
     | 
| 260 | 
         
            -
                  "loss": 0. 
     | 
| 261 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 262 | 
         
            -
                  "num_tokens":  
     | 
| 263 | 
         
             
                  "step": 675
         
     | 
| 264 | 
         
             
                },
         
     | 
| 265 | 
         
             
                {
         
     | 
| 266 | 
         
             
                  "epoch": 1.687990343995172,
         
     | 
| 267 | 
         
            -
                  "grad_norm": 0. 
     | 
| 268 | 
         
            -
                  "learning_rate": 0. 
     | 
| 269 | 
         
            -
                  "loss": 0. 
     | 
| 270 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 271 | 
         
            -
                  "num_tokens":  
     | 
| 272 | 
         
             
                  "step": 700
         
     | 
| 273 | 
         
             
                },
         
     | 
| 274 | 
         
             
                {
         
     | 
| 275 | 
         
             
                  "epoch": 1.748340374170187,
         
     | 
| 276 | 
         
            -
                  "grad_norm": 0. 
     | 
| 277 | 
         
            -
                  "learning_rate": 0. 
     | 
| 278 | 
         
            -
                  "loss": 0. 
     | 
| 279 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 280 | 
         
            -
                  "num_tokens":  
     | 
| 281 | 
         
             
                  "step": 725
         
     | 
| 282 | 
         
             
                },
         
     | 
| 283 | 
         
             
                {
         
     | 
| 284 | 
         
             
                  "epoch": 1.8086904043452021,
         
     | 
| 285 | 
         
            -
                  "grad_norm": 0. 
     | 
| 286 | 
         
            -
                  "learning_rate": 0. 
     | 
| 287 | 
         
            -
                  "loss": 0. 
     | 
| 288 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 289 | 
         
            -
                  "num_tokens":  
     | 
| 290 | 
         
             
                  "step": 750
         
     | 
| 291 | 
         
             
                },
         
     | 
| 292 | 
         
             
                {
         
     | 
| 293 | 
         
             
                  "epoch": 1.8690404345202172,
         
     | 
| 294 | 
         
            -
                  "grad_norm": 0. 
     | 
| 295 | 
         
            -
                  "learning_rate": 0. 
     | 
| 296 | 
         
            -
                  "loss": 0. 
     | 
| 297 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 298 | 
         
            -
                  "num_tokens":  
     | 
| 299 | 
         
             
                  "step": 775
         
     | 
| 300 | 
         
             
                },
         
     | 
| 301 | 
         
             
                {
         
     | 
| 302 | 
         
             
                  "epoch": 1.9293904646952322,
         
     | 
| 303 | 
         
            -
                  "grad_norm": 0. 
     | 
| 304 | 
         
            -
                  "learning_rate": 0. 
     | 
| 305 | 
         
            -
                  "loss": 0. 
     | 
| 306 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 307 | 
         
            -
                  "num_tokens":  
     | 
| 308 | 
         
             
                  "step": 800
         
     | 
| 309 | 
         
             
                },
         
     | 
| 310 | 
         
             
                {
         
     | 
| 311 | 
         
             
                  "epoch": 1.9897404948702473,
         
     | 
| 312 | 
         
            -
                  "grad_norm": 0. 
     | 
| 313 | 
         
            -
                  "learning_rate": 0. 
     | 
| 314 | 
         
            -
                  "loss": 0. 
     | 
| 315 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 316 | 
         
            -
                  "num_tokens":  
     | 
| 317 | 
         
             
                  "step": 825
         
     | 
| 318 | 
         
             
                },
         
     | 
| 319 | 
         
             
                {
         
     | 
| 320 | 
         
             
                  "epoch": 2.0,
         
     | 
| 321 | 
         
            -
                  "eval_loss": 0. 
     | 
| 322 | 
         
            -
                  "eval_mean_token_accuracy": 0. 
     | 
| 323 | 
         
             
                  "eval_num_tokens": 4447026.0,
         
     | 
| 324 | 
         
            -
                  "eval_runtime": 60. 
     | 
| 325 | 
         
            -
                  "eval_samples_per_second": 6. 
     | 
| 326 | 
         
            -
                  "eval_steps_per_second": 3. 
     | 
| 327 | 
         
             
                  "step": 830
         
     | 
| 328 | 
         
             
                },
         
     | 
| 329 | 
         
             
                {
         
     | 
| 330 | 
         
             
                  "epoch": 2.048280024140012,
         
     | 
| 331 | 
         
            -
                  "grad_norm": 0. 
     | 
| 332 | 
         
            -
                  "learning_rate": 0. 
     | 
| 333 | 
         
            -
                  "loss": 0. 
     | 
| 334 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 335 | 
         
            -
                  "num_tokens":  
     | 
| 336 | 
         
             
                  "step": 850
         
     | 
| 337 | 
         
             
                },
         
     | 
| 338 | 
         
             
                {
         
     | 
| 339 | 
         
             
                  "epoch": 2.1086300543150274,
         
     | 
| 340 | 
         
            -
                  "grad_norm": 0. 
     | 
| 341 | 
         
            -
                  "learning_rate": 0. 
     | 
| 342 | 
         
            -
                  "loss": 0. 
     | 
| 343 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 344 | 
         
            -
                  "num_tokens":  
     | 
| 345 | 
         
             
                  "step": 875
         
     | 
| 346 | 
         
             
                },
         
     | 
| 347 | 
         
             
                {
         
     | 
| 348 | 
         
             
                  "epoch": 2.1689800844900424,
         
     | 
| 349 | 
         
            -
                  "grad_norm": 0. 
     | 
| 350 | 
         
            -
                  "learning_rate": 0. 
     | 
| 351 | 
         
            -
                  "loss": 0. 
     | 
| 352 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 353 | 
         
            -
                  "num_tokens":  
     | 
| 354 | 
         
             
                  "step": 900
         
     | 
| 355 | 
         
             
                },
         
     | 
| 356 | 
         
             
                {
         
     | 
| 357 | 
         
             
                  "epoch": 2.2293301146650575,
         
     | 
| 358 | 
         
            -
                  "grad_norm": 0. 
     | 
| 359 | 
         
            -
                  "learning_rate": 0. 
     | 
| 360 | 
         
            -
                  "loss": 0. 
     | 
| 361 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 362 | 
         
            -
                  "num_tokens":  
     | 
| 363 | 
         
             
                  "step": 925
         
     | 
| 364 | 
         
             
                },
         
     | 
| 365 | 
         
             
                {
         
     | 
| 366 | 
         
             
                  "epoch": 2.2896801448400725,
         
     | 
| 367 | 
         
            -
                  "grad_norm": 0. 
     | 
| 368 | 
         
            -
                  "learning_rate": 0. 
     | 
| 369 | 
         
            -
                  "loss": 0. 
     | 
| 370 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 371 | 
         
            -
                  "num_tokens":  
     | 
| 372 | 
         
             
                  "step": 950
         
     | 
| 373 | 
         
             
                },
         
     | 
| 374 | 
         
             
                {
         
     | 
| 375 | 
         
             
                  "epoch": 2.3500301750150876,
         
     | 
| 376 | 
         
            -
                  "grad_norm": 0. 
     | 
| 377 | 
         
            -
                  "learning_rate": 0. 
     | 
| 378 | 
         
            -
                  "loss": 0. 
     | 
| 379 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 380 | 
         
            -
                  "num_tokens":  
     | 
| 381 | 
         
             
                  "step": 975
         
     | 
| 382 | 
         
             
                },
         
     | 
| 383 | 
         
             
                {
         
     | 
| 384 | 
         
             
                  "epoch": 2.4103802051901027,
         
     | 
| 385 | 
         
            -
                  "grad_norm": 0. 
     | 
| 386 | 
         
            -
                  "learning_rate": 0. 
     | 
| 387 | 
         
            -
                  "loss": 0. 
     | 
| 388 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 389 | 
         
            -
                  "num_tokens":  
     | 
| 390 | 
         
             
                  "step": 1000
         
     | 
| 391 | 
         
             
                },
         
     | 
| 392 | 
         
             
                {
         
     | 
| 393 | 
         
             
                  "epoch": 2.4707302353651177,
         
     | 
| 394 | 
         
            -
                  "grad_norm": 0. 
     | 
| 395 | 
         
            -
                  "learning_rate": 0. 
     | 
| 396 | 
         
            -
                  "loss": 0. 
     | 
| 397 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 398 | 
         
            -
                  "num_tokens":  
     | 
| 399 | 
         
             
                  "step": 1025
         
     | 
| 400 | 
         
             
                },
         
     | 
| 401 | 
         
             
                {
         
     | 
| 402 | 
         
             
                  "epoch": 2.5310802655401328,
         
     | 
| 403 | 
         
            -
                  "grad_norm": 0. 
     | 
| 404 | 
         
            -
                  "learning_rate": 0. 
     | 
| 405 | 
         
            -
                  "loss": 0. 
     | 
| 406 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 407 | 
         
            -
                  "num_tokens":  
     | 
| 408 | 
         
             
                  "step": 1050
         
     | 
| 409 | 
         
             
                },
         
     | 
| 410 | 
         
             
                {
         
     | 
| 411 | 
         
             
                  "epoch": 2.591430295715148,
         
     | 
| 412 | 
         
            -
                  "grad_norm": 0. 
     | 
| 413 | 
         
            -
                  "learning_rate": 0. 
     | 
| 414 | 
         
            -
                  "loss": 0. 
     | 
| 415 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 416 | 
         
            -
                  "num_tokens":  
     | 
| 417 | 
         
             
                  "step": 1075
         
     | 
| 418 | 
         
             
                },
         
     | 
| 419 | 
         
             
                {
         
     | 
| 420 | 
         
             
                  "epoch": 2.651780325890163,
         
     | 
| 421 | 
         
            -
                  "grad_norm": 0. 
     | 
| 422 | 
         
            -
                  "learning_rate": 0. 
     | 
| 423 | 
         
            -
                  "loss": 0. 
     | 
| 424 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 425 | 
         
            -
                  "num_tokens":  
     | 
| 426 | 
         
             
                  "step": 1100
         
     | 
| 427 | 
         
             
                },
         
     | 
| 428 | 
         
             
                {
         
     | 
| 429 | 
         
             
                  "epoch": 2.712130356065178,
         
     | 
| 430 | 
         
            -
                  "grad_norm": 0. 
     | 
| 431 | 
         
            -
                  "learning_rate": 0. 
     | 
| 432 | 
         
            -
                  "loss": 0. 
     | 
| 433 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 434 | 
         
            -
                  "num_tokens":  
     | 
| 435 | 
         
             
                  "step": 1125
         
     | 
| 436 | 
         
             
                },
         
     | 
| 437 | 
         
             
                {
         
     | 
| 438 | 
         
             
                  "epoch": 2.772480386240193,
         
     | 
| 439 | 
         
            -
                  "grad_norm": 0. 
     | 
| 440 | 
         
            -
                  "learning_rate": 0. 
     | 
| 441 | 
         
            -
                  "loss": 0. 
     | 
| 442 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 443 | 
         
            -
                  "num_tokens":  
     | 
| 444 | 
         
             
                  "step": 1150
         
     | 
| 445 | 
         
             
                },
         
     | 
| 446 | 
         
             
                {
         
     | 
| 447 | 
         
             
                  "epoch": 2.832830416415208,
         
     | 
| 448 | 
         
            -
                  "grad_norm": 0. 
     | 
| 449 | 
         
            -
                  "learning_rate": 0. 
     | 
| 450 | 
         
            -
                  "loss": 0. 
     | 
| 451 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 452 | 
         
            -
                  "num_tokens":  
     | 
| 453 | 
         
             
                  "step": 1175
         
     | 
| 454 | 
         
             
                },
         
     | 
| 455 | 
         
             
                {
         
     | 
| 456 | 
         
             
                  "epoch": 2.8931804465902236,
         
     | 
| 457 | 
         
            -
                  "grad_norm": 0. 
     | 
| 458 | 
         
            -
                  "learning_rate": 0. 
     | 
| 459 | 
         
            -
                  "loss": 0. 
     | 
| 460 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 461 | 
         
            -
                  "num_tokens":  
     | 
| 462 | 
         
             
                  "step": 1200
         
     | 
| 463 | 
         
             
                },
         
     | 
| 464 | 
         
             
                {
         
     | 
| 465 | 
         
             
                  "epoch": 2.9535304767652386,
         
     | 
| 466 | 
         
            -
                  "grad_norm": 0. 
     | 
| 467 | 
         
            -
                  "learning_rate": 0. 
     | 
| 468 | 
         
            -
                  "loss": 0. 
     | 
| 469 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 470 | 
         
            -
                  "num_tokens":  
     | 
| 471 | 
         
             
                  "step": 1225
         
     | 
| 472 | 
         
             
                },
         
     | 
| 473 | 
         
             
                {
         
     | 
| 474 | 
         
             
                  "epoch": 3.0,
         
     | 
| 475 | 
         
            -
                  "eval_loss": 0. 
     | 
| 476 | 
         
            -
                  "eval_mean_token_accuracy": 0. 
     | 
| 477 | 
         
             
                  "eval_num_tokens": 6670539.0,
         
     | 
| 478 | 
         
            -
                  "eval_runtime": 60. 
     | 
| 479 | 
         
            -
                  "eval_samples_per_second": 6. 
     | 
| 480 | 
         
            -
                  "eval_steps_per_second": 3. 
     | 
| 481 | 
         
             
                  "step": 1245
         
     | 
| 482 | 
         
             
                },
         
     | 
| 483 | 
         
             
                {
         
     | 
| 484 | 
         
             
                  "epoch": 3.012070006035003,
         
     | 
| 485 | 
         
            -
                  "grad_norm": 0. 
     | 
| 486 | 
         
            -
                  "learning_rate": 0. 
     | 
| 487 | 
         
            -
                  "loss": 0. 
     | 
| 488 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 489 | 
         
            -
                  "num_tokens":  
     | 
| 490 | 
         
             
                  "step": 1250
         
     | 
| 491 | 
         
             
                },
         
     | 
| 492 | 
         
             
                {
         
     | 
| 493 | 
         
             
                  "epoch": 3.0724200362100182,
         
     | 
| 494 | 
         
            -
                  "grad_norm": 0. 
     | 
| 495 | 
         
            -
                  "learning_rate": 0. 
     | 
| 496 | 
         
            -
                  "loss": 0. 
     | 
| 497 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 498 | 
         
            -
                  "num_tokens":  
     | 
| 499 | 
         
             
                  "step": 1275
         
     | 
| 500 | 
         
             
                },
         
     | 
| 501 | 
         
             
                {
         
     | 
| 502 | 
         
             
                  "epoch": 3.1327700663850333,
         
     | 
| 503 | 
         
            -
                  "grad_norm": 0. 
     | 
| 504 | 
         
            -
                  "learning_rate":  
     | 
| 505 | 
         
            -
                  "loss": 0. 
     | 
| 506 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 507 | 
         
            -
                  "num_tokens":  
     | 
| 508 | 
         
             
                  "step": 1300
         
     | 
| 509 | 
         
             
                },
         
     | 
| 510 | 
         
             
                {
         
     | 
| 511 | 
         
             
                  "epoch": 3.1931200965600484,
         
     | 
| 512 | 
         
            -
                  "grad_norm": 0. 
     | 
| 513 | 
         
            -
                  "learning_rate":  
     | 
| 514 | 
         
            -
                  "loss": 0. 
     | 
| 515 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 516 | 
         
            -
                  "num_tokens":  
     | 
| 517 | 
         
             
                  "step": 1325
         
     | 
| 518 | 
         
             
                },
         
     | 
| 519 | 
         
             
                {
         
     | 
| 520 | 
         
             
                  "epoch": 3.2534701267350634,
         
     | 
| 521 | 
         
            -
                  "grad_norm": 0. 
     | 
| 522 | 
         
            -
                  "learning_rate":  
     | 
| 523 | 
         
            -
                  "loss": 0. 
     | 
| 524 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 525 | 
         
            -
                  "num_tokens":  
     | 
| 526 | 
         
             
                  "step": 1350
         
     | 
| 527 | 
         
             
                },
         
     | 
| 528 | 
         
             
                {
         
     | 
| 529 | 
         
             
                  "epoch": 3.3138201569100785,
         
     | 
| 530 | 
         
            -
                  "grad_norm": 0. 
     | 
| 531 | 
         
            -
                  "learning_rate":  
     | 
| 532 | 
         
            -
                  "loss": 0. 
     | 
| 533 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 534 | 
         
            -
                  "num_tokens":  
     | 
| 535 | 
         
             
                  "step": 1375
         
     | 
| 536 | 
         
             
                },
         
     | 
| 537 | 
         
             
                {
         
     | 
| 538 | 
         
             
                  "epoch": 3.3741701870850935,
         
     | 
| 539 | 
         
            -
                  "grad_norm": 0. 
     | 
| 540 | 
         
            -
                  "learning_rate":  
     | 
| 541 | 
         
            -
                  "loss": 0. 
     | 
| 542 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 543 | 
         
            -
                  "num_tokens":  
     | 
| 544 | 
         
             
                  "step": 1400
         
     | 
| 545 | 
         
             
                },
         
     | 
| 546 | 
         
             
                {
         
     | 
| 547 | 
         
             
                  "epoch": 3.4345202172601086,
         
     | 
| 548 | 
         
            -
                  "grad_norm": 0. 
     | 
| 549 | 
         
            -
                  "learning_rate":  
     | 
| 550 | 
         
            -
                  "loss": 0. 
     | 
| 551 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 552 | 
         
            -
                  "num_tokens":  
     | 
| 553 | 
         
             
                  "step": 1425
         
     | 
| 554 | 
         
             
                },
         
     | 
| 555 | 
         
             
                {
         
     | 
| 556 | 
         
             
                  "epoch": 3.4948702474351236,
         
     | 
| 557 | 
         
            -
                  "grad_norm": 0. 
     | 
| 558 | 
         
            -
                  "learning_rate":  
     | 
| 559 | 
         
            -
                  "loss": 0. 
     | 
| 560 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 561 | 
         
            -
                  "num_tokens":  
     | 
| 562 | 
         
             
                  "step": 1450
         
     | 
| 563 | 
         
             
                },
         
     | 
| 564 | 
         
             
                {
         
     | 
| 565 | 
         
             
                  "epoch": 3.5552202776101387,
         
     | 
| 566 | 
         
            -
                  "grad_norm": 0. 
     | 
| 567 | 
         
            -
                  "learning_rate":  
     | 
| 568 | 
         
            -
                  "loss": 0. 
     | 
| 569 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 570 | 
         
            -
                  "num_tokens":  
     | 
| 571 | 
         
             
                  "step": 1475
         
     | 
| 572 | 
         
             
                },
         
     | 
| 573 | 
         
             
                {
         
     | 
| 574 | 
         
             
                  "epoch": 3.6155703077851538,
         
     | 
| 575 | 
         
            -
                  "grad_norm": 0. 
     | 
| 576 | 
         
            -
                  "learning_rate":  
     | 
| 577 | 
         
            -
                  "loss": 0. 
     | 
| 578 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 579 | 
         
            -
                  "num_tokens":  
     | 
| 580 | 
         
             
                  "step": 1500
         
     | 
| 581 | 
         
             
                },
         
     | 
| 582 | 
         
             
                {
         
     | 
| 583 | 
         
             
                  "epoch": 3.675920337960169,
         
     | 
| 584 | 
         
            -
                  "grad_norm": 0. 
     | 
| 585 | 
         
            -
                  "learning_rate":  
     | 
| 586 | 
         
            -
                  "loss": 0. 
     | 
| 587 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 588 | 
         
            -
                  "num_tokens":  
     | 
| 589 | 
         
             
                  "step": 1525
         
     | 
| 590 | 
         
             
                },
         
     | 
| 591 | 
         
             
                {
         
     | 
| 592 | 
         
             
                  "epoch": 3.736270368135184,
         
     | 
| 593 | 
         
            -
                  "grad_norm": 0. 
     | 
| 594 | 
         
            -
                  "learning_rate":  
     | 
| 595 | 
         
            -
                  "loss": 0. 
     | 
| 596 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 597 | 
         
            -
                  "num_tokens":  
     | 
| 598 | 
         
             
                  "step": 1550
         
     | 
| 599 | 
         
             
                },
         
     | 
| 600 | 
         
             
                {
         
     | 
| 601 | 
         
             
                  "epoch": 3.796620398310199,
         
     | 
| 602 | 
         
            -
                  "grad_norm": 0. 
     | 
| 603 | 
         
            -
                  "learning_rate":  
     | 
| 604 | 
         
            -
                  "loss": 0. 
     | 
| 605 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 606 | 
         
            -
                  "num_tokens":  
     | 
| 607 | 
         
             
                  "step": 1575
         
     | 
| 608 | 
         
             
                },
         
     | 
| 609 | 
         
             
                {
         
     | 
| 610 | 
         
             
                  "epoch": 3.856970428485214,
         
     | 
| 611 | 
         
            -
                  "grad_norm": 0. 
     | 
| 612 | 
         
            -
                  "learning_rate":  
     | 
| 613 | 
         
            -
                  "loss": 0. 
     | 
| 614 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 615 | 
         
            -
                  "num_tokens":  
     | 
| 616 | 
         
             
                  "step": 1600
         
     | 
| 617 | 
         
             
                },
         
     | 
| 618 | 
         
             
                {
         
     | 
| 619 | 
         
             
                  "epoch": 3.9173204586602295,
         
     | 
| 620 | 
         
            -
                  "grad_norm": 0. 
     | 
| 621 | 
         
            -
                  "learning_rate":  
     | 
| 622 | 
         
            -
                  "loss": 0. 
     | 
| 623 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 624 | 
         
            -
                  "num_tokens":  
     | 
| 625 | 
         
             
                  "step": 1625
         
     | 
| 626 | 
         
             
                },
         
     | 
| 627 | 
         
             
                {
         
     | 
| 628 | 
         
             
                  "epoch": 3.9776704888352445,
         
     | 
| 629 | 
         
            -
                  "grad_norm": 0. 
     | 
| 630 | 
         
            -
                  "learning_rate":  
     | 
| 631 | 
         
            -
                  "loss": 0. 
     | 
| 632 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 633 | 
         
            -
                  "num_tokens":  
     | 
| 634 | 
         
             
                  "step": 1650
         
     | 
| 635 | 
         
             
                },
         
     | 
| 636 | 
         
             
                {
         
     | 
| 637 | 
         
             
                  "epoch": 4.0,
         
     | 
| 638 | 
         
            -
                  "eval_loss": 0. 
     | 
| 639 | 
         
            -
                  "eval_mean_token_accuracy": 0. 
     | 
| 640 | 
         
             
                  "eval_num_tokens": 8894052.0,
         
     | 
| 641 | 
         
            -
                  "eval_runtime": 60. 
     | 
| 642 | 
         
            -
                  "eval_samples_per_second": 6. 
     | 
| 643 | 
         
            -
                  "eval_steps_per_second": 3. 
     | 
| 644 | 
         
             
                  "step": 1660
         
     | 
| 645 | 
         
             
                },
         
     | 
| 646 | 
         
             
                {
         
     | 
| 647 | 
         
             
                  "epoch": 4.036210018105009,
         
     | 
| 648 | 
         
            -
                  "grad_norm": 0. 
     | 
| 649 | 
         
            -
                  "learning_rate":  
     | 
| 650 | 
         
            -
                  "loss": 0. 
     | 
| 651 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 652 | 
         
            -
                  "num_tokens":  
     | 
| 653 | 
         
             
                  "step": 1675
         
     | 
| 654 | 
         
             
                },
         
     | 
| 655 | 
         
             
                {
         
     | 
| 656 | 
         
             
                  "epoch": 4.096560048280024,
         
     | 
| 657 | 
         
            -
                  "grad_norm": 0. 
     | 
| 658 | 
         
            -
                  "learning_rate":  
     | 
| 659 | 
         
            -
                  "loss": 0. 
     | 
| 660 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 661 | 
         
            -
                  "num_tokens":  
     | 
| 662 | 
         
             
                  "step": 1700
         
     | 
| 663 | 
         
             
                },
         
     | 
| 664 | 
         
             
                {
         
     | 
| 665 | 
         
             
                  "epoch": 4.15691007845504,
         
     | 
| 666 | 
         
            -
                  "grad_norm": 0. 
     | 
| 667 | 
         
            -
                  "learning_rate":  
     | 
| 668 | 
         
            -
                  "loss": 0. 
     | 
| 669 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 670 | 
         
            -
                  "num_tokens":  
     | 
| 671 | 
         
             
                  "step": 1725
         
     | 
| 672 | 
         
             
                },
         
     | 
| 673 | 
         
             
                {
         
     | 
| 674 | 
         
             
                  "epoch": 4.217260108630055,
         
     | 
| 675 | 
         
            -
                  "grad_norm": 0. 
     | 
| 676 | 
         
            -
                  "learning_rate":  
     | 
| 677 | 
         
            -
                  "loss": 0. 
     | 
| 678 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 679 | 
         
            -
                  "num_tokens":  
     | 
| 680 | 
         
             
                  "step": 1750
         
     | 
| 681 | 
         
             
                },
         
     | 
| 682 | 
         
             
                {
         
     | 
| 683 | 
         
             
                  "epoch": 4.27761013880507,
         
     | 
| 684 | 
         
            -
                  "grad_norm": 0. 
     | 
| 685 | 
         
            -
                  "learning_rate":  
     | 
| 686 | 
         
            -
                  "loss": 0. 
     | 
| 687 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 688 | 
         
            -
                  "num_tokens":  
     | 
| 689 | 
         
             
                  "step": 1775
         
     | 
| 690 | 
         
             
                },
         
     | 
| 691 | 
         
             
                {
         
     | 
| 692 | 
         
             
                  "epoch": 4.337960168980085,
         
     | 
| 693 | 
         
            -
                  "grad_norm": 0. 
     | 
| 694 | 
         
            -
                  "learning_rate":  
     | 
| 695 | 
         
            -
                  "loss": 0. 
     | 
| 696 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 697 | 
         
            -
                  "num_tokens":  
     | 
| 698 | 
         
             
                  "step": 1800
         
     | 
| 699 | 
         
             
                },
         
     | 
| 700 | 
         
             
                {
         
     | 
| 701 | 
         
             
                  "epoch": 4.3983101991551,
         
     | 
| 702 | 
         
            -
                  "grad_norm": 0. 
     | 
| 703 | 
         
            -
                  "learning_rate":  
     | 
| 704 | 
         
            -
                  "loss": 0. 
     | 
| 705 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 706 | 
         
            -
                  "num_tokens":  
     | 
| 707 | 
         
             
                  "step": 1825
         
     | 
| 708 | 
         
             
                },
         
     | 
| 709 | 
         
             
                {
         
     | 
| 710 | 
         
             
                  "epoch": 4.458660229330115,
         
     | 
| 711 | 
         
            -
                  "grad_norm": 0. 
     | 
| 712 | 
         
            -
                  "learning_rate": 9. 
     | 
| 713 | 
         
            -
                  "loss": 0. 
     | 
| 714 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 715 | 
         
            -
                  "num_tokens":  
     | 
| 716 | 
         
             
                  "step": 1850
         
     | 
| 717 | 
         
             
                },
         
     | 
| 718 | 
         
             
                {
         
     | 
| 719 | 
         
             
                  "epoch": 4.51901025950513,
         
     | 
| 720 | 
         
            -
                  "grad_norm": 0. 
     | 
| 721 | 
         
            -
                  "learning_rate":  
     | 
| 722 | 
         
            -
                  "loss": 0. 
     | 
| 723 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 724 | 
         
            -
                  "num_tokens":  
     | 
| 725 | 
         
             
                  "step": 1875
         
     | 
| 726 | 
         
             
                },
         
     | 
| 727 | 
         
             
                {
         
     | 
| 728 | 
         
             
                  "epoch": 4.579360289680145,
         
     | 
| 729 | 
         
            -
                  "grad_norm": 0. 
     | 
| 730 | 
         
            -
                  "learning_rate":  
     | 
| 731 | 
         
            -
                  "loss": 0. 
     | 
| 732 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 733 | 
         
            -
                  "num_tokens":  
     | 
| 734 | 
         
             
                  "step": 1900
         
     | 
| 735 | 
         
             
                },
         
     | 
| 736 | 
         
             
                {
         
     | 
| 737 | 
         
             
                  "epoch": 4.63971031985516,
         
     | 
| 738 | 
         
            -
                  "grad_norm": 0. 
     | 
| 739 | 
         
            -
                  "learning_rate":  
     | 
| 740 | 
         
            -
                  "loss": 0. 
     | 
| 741 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 742 | 
         
            -
                  "num_tokens":  
     | 
| 743 | 
         
             
                  "step": 1925
         
     | 
| 744 | 
         
             
                },
         
     | 
| 745 | 
         
             
                {
         
     | 
| 746 | 
         
             
                  "epoch": 4.700060350030175,
         
     | 
| 747 | 
         
            -
                  "grad_norm": 0. 
     | 
| 748 | 
         
            -
                  "learning_rate":  
     | 
| 749 | 
         
            -
                  "loss": 0. 
     | 
| 750 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 751 | 
         
            -
                  "num_tokens":  
     | 
| 752 | 
         
             
                  "step": 1950
         
     | 
| 753 | 
         
             
                },
         
     | 
| 754 | 
         
             
                {
         
     | 
| 755 | 
         
             
                  "epoch": 4.76041038020519,
         
     | 
| 756 | 
         
            -
                  "grad_norm": 0. 
     | 
| 757 | 
         
            -
                  "learning_rate":  
     | 
| 758 | 
         
            -
                  "loss": 0. 
     | 
| 759 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 760 | 
         
            -
                  "num_tokens":  
     | 
| 761 | 
         
             
                  "step": 1975
         
     | 
| 762 | 
         
             
                },
         
     | 
| 763 | 
         
             
                {
         
     | 
| 764 | 
         
             
                  "epoch": 4.820760410380205,
         
     | 
| 765 | 
         
            -
                  "grad_norm": 0. 
     | 
| 766 | 
         
            -
                  "learning_rate":  
     | 
| 767 | 
         
            -
                  "loss": 0. 
     | 
| 768 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 769 | 
         
            -
                  "num_tokens":  
     | 
| 770 | 
         
             
                  "step": 2000
         
     | 
| 771 | 
         
             
                },
         
     | 
| 772 | 
         
             
                {
         
     | 
| 773 | 
         
             
                  "epoch": 4.88111044055522,
         
     | 
| 774 | 
         
            -
                  "grad_norm": 0. 
     | 
| 775 | 
         
            -
                  "learning_rate":  
     | 
| 776 | 
         
            -
                  "loss": 0. 
     | 
| 777 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 778 | 
         
            -
                  "num_tokens":  
     | 
| 779 | 
         
             
                  "step": 2025
         
     | 
| 780 | 
         
             
                },
         
     | 
| 781 | 
         
             
                {
         
     | 
| 782 | 
         
             
                  "epoch": 4.941460470730235,
         
     | 
| 783 | 
         
            -
                  "grad_norm": 0. 
     | 
| 784 | 
         
            -
                  "learning_rate":  
     | 
| 785 | 
         
            -
                  "loss": 0. 
     | 
| 786 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 787 | 
         
            -
                  "num_tokens":  
     | 
| 788 | 
         
             
                  "step": 2050
         
     | 
| 789 | 
         
             
                },
         
     | 
| 790 | 
         
             
                {
         
     | 
| 791 | 
         
             
                  "epoch": 5.0,
         
     | 
| 792 | 
         
            -
                  "grad_norm": 0. 
     | 
| 793 | 
         
            -
                  "learning_rate":  
     | 
| 794 | 
         
            -
                  "loss": 0. 
     | 
| 795 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 796 | 
         
             
                  "num_tokens": 11117565.0,
         
     | 
| 797 | 
         
             
                  "step": 2075
         
     | 
| 798 | 
         
             
                },
         
     | 
| 799 | 
         
             
                {
         
     | 
| 800 | 
         
             
                  "epoch": 5.0,
         
     | 
| 801 | 
         
            -
                  "eval_loss": 0. 
     | 
| 802 | 
         
            -
                  "eval_mean_token_accuracy": 0. 
     | 
| 803 | 
         
             
                  "eval_num_tokens": 11117565.0,
         
     | 
| 804 | 
         
            -
                  "eval_runtime": 60. 
     | 
| 805 | 
         
            -
                  "eval_samples_per_second": 6. 
     | 
| 806 | 
         
            -
                  "eval_steps_per_second": 3. 
     | 
| 807 | 
         
             
                  "step": 2075
         
     | 
| 808 | 
         
             
                }
         
     | 
| 809 | 
         
             
              ],
         
     | 
| 810 | 
         
             
              "logging_steps": 25,
         
     | 
| 811 | 
         
            -
              "max_steps":  
     | 
| 812 | 
         
             
              "num_input_tokens_seen": 0,
         
     | 
| 813 | 
         
            -
              "num_train_epochs":  
     | 
| 814 | 
         
             
              "save_steps": 500,
         
     | 
| 815 | 
         
             
              "stateful_callbacks": {
         
     | 
| 816 | 
         
             
                "TrainerControl": {
         
     | 
| 
         @@ -819,12 +819,12 @@ 
     | 
|
| 819 | 
         
             
                    "should_evaluate": false,
         
     | 
| 820 | 
         
             
                    "should_log": false,
         
     | 
| 821 | 
         
             
                    "should_save": true,
         
     | 
| 822 | 
         
            -
                    "should_training_stop":  
     | 
| 823 | 
         
             
                  },
         
     | 
| 824 | 
         
             
                  "attributes": {}
         
     | 
| 825 | 
         
             
                }
         
     | 
| 826 | 
         
             
              },
         
     | 
| 827 | 
         
            -
              "total_flos": 4. 
     | 
| 828 | 
         
             
              "train_batch_size": 2,
         
     | 
| 829 | 
         
             
              "trial_name": null,
         
     | 
| 830 | 
         
             
              "trial_params": null
         
     | 
| 
         | 
|
| 11 | 
         
             
              "log_history": [
         
     | 
| 12 | 
         
             
                {
         
     | 
| 13 | 
         
             
                  "epoch": 0.060350030175015085,
         
     | 
| 14 | 
         
            +
                  "grad_norm": 0.7244853377342224,
         
     | 
| 15 | 
         
            +
                  "learning_rate": 0.00011428571428571427,
         
     | 
| 16 | 
         
            +
                  "loss": 1.5091,
         
     | 
| 17 | 
         
            +
                  "mean_token_accuracy": 0.6793113535642624,
         
     | 
| 18 | 
         
            +
                  "num_tokens": 152165.0,
         
     | 
| 19 | 
         
             
                  "step": 25
         
     | 
| 20 | 
         
             
                },
         
     | 
| 21 | 
         
             
                {
         
     | 
| 22 | 
         
             
                  "epoch": 0.12070006035003017,
         
     | 
| 23 | 
         
            +
                  "grad_norm": 0.8389242887496948,
         
     | 
| 24 | 
         
            +
                  "learning_rate": 0.0002333333333333333,
         
     | 
| 25 | 
         
            +
                  "loss": 0.8436,
         
     | 
| 26 | 
         
            +
                  "mean_token_accuracy": 0.7881802421808243,
         
     | 
| 27 | 
         
            +
                  "num_tokens": 267390.0,
         
     | 
| 28 | 
         
             
                  "step": 50
         
     | 
| 29 | 
         
             
                },
         
     | 
| 30 | 
         
             
                {
         
     | 
| 31 | 
         
             
                  "epoch": 0.18105009052504525,
         
     | 
| 32 | 
         
            +
                  "grad_norm": 0.6344988942146301,
         
     | 
| 33 | 
         
            +
                  "learning_rate": 0.00029997787517981614,
         
     | 
| 34 | 
         
            +
                  "loss": 0.5527,
         
     | 
| 35 | 
         
            +
                  "mean_token_accuracy": 0.8469069242477417,
         
     | 
| 36 | 
         
            +
                  "num_tokens": 420975.0,
         
     | 
| 37 | 
         
             
                  "step": 75
         
     | 
| 38 | 
         
             
                },
         
     | 
| 39 | 
         
             
                {
         
     | 
| 40 | 
         
             
                  "epoch": 0.24140012070006034,
         
     | 
| 41 | 
         
            +
                  "grad_norm": 0.7947192192077637,
         
     | 
| 42 | 
         
            +
                  "learning_rate": 0.0002997630832860032,
         
     | 
| 43 | 
         
            +
                  "loss": 0.4522,
         
     | 
| 44 | 
         
            +
                  "mean_token_accuracy": 0.870941441655159,
         
     | 
| 45 | 
         
            +
                  "num_tokens": 538380.0,
         
     | 
| 46 | 
         
             
                  "step": 100
         
     | 
| 47 | 
         
             
                },
         
     | 
| 48 | 
         
             
                {
         
     | 
| 49 | 
         
             
                  "epoch": 0.30175015087507545,
         
     | 
| 50 | 
         
            +
                  "grad_norm": 0.43716728687286377,
         
     | 
| 51 | 
         
            +
                  "learning_rate": 0.0002993201135681549,
         
     | 
| 52 | 
         
            +
                  "loss": 0.3049,
         
     | 
| 53 | 
         
            +
                  "mean_token_accuracy": 0.9136220461130142,
         
     | 
| 54 | 
         
            +
                  "num_tokens": 690650.0,
         
     | 
| 55 | 
         
             
                  "step": 125
         
     | 
| 56 | 
         
             
                },
         
     | 
| 57 | 
         
             
                {
         
     | 
| 58 | 
         
             
                  "epoch": 0.3621001810500905,
         
     | 
| 59 | 
         
            +
                  "grad_norm": 1.09097421169281,
         
     | 
| 60 | 
         
            +
                  "learning_rate": 0.0002986496409313553,
         
     | 
| 61 | 
         
            +
                  "loss": 0.3172,
         
     | 
| 62 | 
         
            +
                  "mean_token_accuracy": 0.91127048432827,
         
     | 
| 63 | 
         
            +
                  "num_tokens": 806066.0,
         
     | 
| 64 | 
         
             
                  "step": 150
         
     | 
| 65 | 
         
             
                },
         
     | 
| 66 | 
         
             
                {
         
     | 
| 67 | 
         
             
                  "epoch": 0.4224502112251056,
         
     | 
| 68 | 
         
            +
                  "grad_norm": 0.3773705065250397,
         
     | 
| 69 | 
         
            +
                  "learning_rate": 0.0002977526869022985,
         
     | 
| 70 | 
         
            +
                  "loss": 0.2029,
         
     | 
| 71 | 
         
            +
                  "mean_token_accuracy": 0.9433162885904313,
         
     | 
| 72 | 
         
            +
                  "num_tokens": 960853.0,
         
     | 
| 73 | 
         
             
                  "step": 175
         
     | 
| 74 | 
         
             
                },
         
     | 
| 75 | 
         
             
                {
         
     | 
| 76 | 
         
             
                  "epoch": 0.4828002414001207,
         
     | 
| 77 | 
         
            +
                  "grad_norm": 0.8292771577835083,
         
     | 
| 78 | 
         
            +
                  "learning_rate": 0.0002966306180728982,
         
     | 
| 79 | 
         
            +
                  "loss": 0.2274,
         
     | 
| 80 | 
         
            +
                  "mean_token_accuracy": 0.9385988712310791,
         
     | 
| 81 | 
         
            +
                  "num_tokens": 1077726.0,
         
     | 
| 82 | 
         
             
                  "step": 200
         
     | 
| 83 | 
         
             
                },
         
     | 
| 84 | 
         
             
                {
         
     | 
| 85 | 
         
             
                  "epoch": 0.5431502715751357,
         
     | 
| 86 | 
         
            +
                  "grad_norm": 0.4765889346599579,
         
     | 
| 87 | 
         
            +
                  "learning_rate": 0.0002952851440181598,
         
     | 
| 88 | 
         
            +
                  "loss": 0.19,
         
     | 
| 89 | 
         
            +
                  "mean_token_accuracy": 0.9479016721248626,
         
     | 
| 90 | 
         
            +
                  "num_tokens": 1232263.0,
         
     | 
| 91 | 
         
             
                  "step": 225
         
     | 
| 92 | 
         
             
                },
         
     | 
| 93 | 
         
             
                {
         
     | 
| 94 | 
         
             
                  "epoch": 0.6035003017501509,
         
     | 
| 95 | 
         
            +
                  "grad_norm": 0.9254749417304993,
         
     | 
| 96 | 
         
            +
                  "learning_rate": 0.0002937183146914856,
         
     | 
| 97 | 
         
            +
                  "loss": 0.1826,
         
     | 
| 98 | 
         
            +
                  "mean_token_accuracy": 0.9498224484920502,
         
     | 
| 99 | 
         
            +
                  "num_tokens": 1349057.0,
         
     | 
| 100 | 
         
             
                  "step": 250
         
     | 
| 101 | 
         
             
                },
         
     | 
| 102 | 
         
             
                {
         
     | 
| 103 | 
         
             
                  "epoch": 0.663850331925166,
         
     | 
| 104 | 
         
            +
                  "grad_norm": 0.4938018023967743,
         
     | 
| 105 | 
         
            +
                  "learning_rate": 0.000291932517301382,
         
     | 
| 106 | 
         
            +
                  "loss": 0.1497,
         
     | 
| 107 | 
         
            +
                  "mean_token_accuracy": 0.9588899296522141,
         
     | 
| 108 | 
         
            +
                  "num_tokens": 1496867.0,
         
     | 
| 109 | 
         
             
                  "step": 275
         
     | 
| 110 | 
         
             
                },
         
     | 
| 111 | 
         
             
                {
         
     | 
| 112 | 
         
             
                  "epoch": 0.724200362100181,
         
     | 
| 113 | 
         
            +
                  "grad_norm": 0.6995358467102051,
         
     | 
| 114 | 
         
            +
                  "learning_rate": 0.00028993047267432864,
         
     | 
| 115 | 
         
            +
                  "loss": 0.1578,
         
     | 
| 116 | 
         
            +
                  "mean_token_accuracy": 0.9568761509656906,
         
     | 
| 117 | 
         
            +
                  "num_tokens": 1610727.0,
         
     | 
| 118 | 
         
             
                  "step": 300
         
     | 
| 119 | 
         
             
                },
         
     | 
| 120 | 
         
             
                {
         
     | 
| 121 | 
         
             
                  "epoch": 0.7845503922751962,
         
     | 
| 122 | 
         
            +
                  "grad_norm": 0.46799567341804504,
         
     | 
| 123 | 
         
            +
                  "learning_rate": 0.0002877152311093483,
         
     | 
| 124 | 
         
            +
                  "loss": 0.1351,
         
     | 
| 125 | 
         
            +
                  "mean_token_accuracy": 0.9633717983961105,
         
     | 
| 126 | 
         
            +
                  "num_tokens": 1762041.0,
         
     | 
| 127 | 
         
             
                  "step": 325
         
     | 
| 128 | 
         
             
                },
         
     | 
| 129 | 
         
             
                {
         
     | 
| 130 | 
         
             
                  "epoch": 0.8449004224502112,
         
     | 
| 131 | 
         
            +
                  "grad_norm": 0.6729409098625183,
         
     | 
| 132 | 
         
            +
                  "learning_rate": 0.00028529016773059656,
         
     | 
| 133 | 
         
            +
                  "loss": 0.1206,
         
     | 
| 134 | 
         
            +
                  "mean_token_accuracy": 0.9687577307224273,
         
     | 
| 135 | 
         
            +
                  "num_tokens": 1877965.0,
         
     | 
| 136 | 
         
             
                  "step": 350
         
     | 
| 137 | 
         
             
                },
         
     | 
| 138 | 
         
             
                {
         
     | 
| 139 | 
         
             
                  "epoch": 0.9052504526252263,
         
     | 
| 140 | 
         
            +
                  "grad_norm": 0.5820412635803223,
         
     | 
| 141 | 
         
            +
                  "learning_rate": 0.00028265897734504976,
         
     | 
| 142 | 
         
            +
                  "loss": 0.1183,
         
     | 
| 143 | 
         
            +
                  "mean_token_accuracy": 0.96822787463665,
         
     | 
| 144 | 
         
            +
                  "num_tokens": 2028343.0,
         
     | 
| 145 | 
         
             
                  "step": 375
         
     | 
| 146 | 
         
             
                },
         
     | 
| 147 | 
         
             
                {
         
     | 
| 148 | 
         
             
                  "epoch": 0.9656004828002414,
         
     | 
| 149 | 
         
            +
                  "grad_norm": 0.8604497909545898,
         
     | 
| 150 | 
         
            +
                  "learning_rate": 0.0002798256688131267,
         
     | 
| 151 | 
         
            +
                  "loss": 0.1159,
         
     | 
| 152 | 
         
            +
                  "mean_token_accuracy": 0.9700725018978119,
         
     | 
| 153 | 
         
            +
                  "num_tokens": 2145044.0,
         
     | 
| 154 | 
         
             
                  "step": 400
         
     | 
| 155 | 
         
             
                },
         
     | 
| 156 | 
         
             
                {
         
     | 
| 157 | 
         
             
                  "epoch": 1.0,
         
     | 
| 158 | 
         
            +
                  "eval_loss": 0.1169130727648735,
         
     | 
| 159 | 
         
            +
                  "eval_mean_token_accuracy": 0.9691641559471955,
         
     | 
| 160 | 
         
             
                  "eval_num_tokens": 2223513.0,
         
     | 
| 161 | 
         
            +
                  "eval_runtime": 60.5832,
         
     | 
| 162 | 
         
            +
                  "eval_samples_per_second": 6.091,
         
     | 
| 163 | 
         
            +
                  "eval_steps_per_second": 3.054,
         
     | 
| 164 | 
         
             
                  "step": 415
         
     | 
| 165 | 
         
             
                },
         
     | 
| 166 | 
         
             
                {
         
     | 
| 167 | 
         
             
                  "epoch": 1.024140012070006,
         
     | 
| 168 | 
         
            +
                  "grad_norm": 0.20096616446971893,
         
     | 
| 169 | 
         
            +
                  "learning_rate": 0.0002767945589408217,
         
     | 
| 170 | 
         
            +
                  "loss": 0.122,
         
     | 
| 171 | 
         
            +
                  "mean_token_accuracy": 0.9680000224064306,
         
     | 
| 172 | 
         
            +
                  "num_tokens": 2291746.0,
         
     | 
| 173 | 
         
             
                  "step": 425
         
     | 
| 174 | 
         
             
                },
         
     | 
| 175 | 
         
             
                {
         
     | 
| 176 | 
         
             
                  "epoch": 1.0844900422450212,
         
     | 
| 177 | 
         
            +
                  "grad_norm": 0.34665247797966003,
         
     | 
| 178 | 
         
            +
                  "learning_rate": 0.0002735702659026533,
         
     | 
| 179 | 
         
            +
                  "loss": 0.0836,
         
     | 
| 180 | 
         
            +
                  "mean_token_accuracy": 0.9780776232481003,
         
     | 
| 181 | 
         
            +
                  "num_tokens": 2424528.0,
         
     | 
| 182 | 
         
             
                  "step": 450
         
     | 
| 183 | 
         
             
                },
         
     | 
| 184 | 
         
             
                {
         
     | 
| 185 | 
         
             
                  "epoch": 1.1448400724200363,
         
     | 
| 186 | 
         
            +
                  "grad_norm": 0.30349963903427124,
         
     | 
| 187 | 
         
            +
                  "learning_rate": 0.0002701577022054515,
         
     | 
| 188 | 
         
            +
                  "loss": 0.1019,
         
     | 
| 189 | 
         
            +
                  "mean_token_accuracy": 0.9732917118072509,
         
     | 
| 190 | 
         
            +
                  "num_tokens": 2557091.0,
         
     | 
| 191 | 
         
             
                  "step": 475
         
     | 
| 192 | 
         
             
                },
         
     | 
| 193 | 
         
             
                {
         
     | 
| 194 | 
         
             
                  "epoch": 1.2051901025950513,
         
     | 
| 195 | 
         
            +
                  "grad_norm": 0.3892677426338196,
         
     | 
| 196 | 
         
            +
                  "learning_rate": 0.0002665620672037014,
         
     | 
| 197 | 
         
            +
                  "loss": 0.0831,
         
     | 
| 198 | 
         
            +
                  "mean_token_accuracy": 0.9782004028558731,
         
     | 
| 199 | 
         
            +
                  "num_tokens": 2691527.0,
         
     | 
| 200 | 
         
             
                  "step": 500
         
     | 
| 201 | 
         
             
                },
         
     | 
| 202 | 
         
             
                {
         
     | 
| 203 | 
         
             
                  "epoch": 1.2655401327700664,
         
     | 
| 204 | 
         
            +
                  "grad_norm": 0.29889699816703796,
         
     | 
| 205 | 
         
            +
                  "learning_rate": 0.0002627888391778493,
         
     | 
| 206 | 
         
            +
                  "loss": 0.1023,
         
     | 
| 207 | 
         
            +
                  "mean_token_accuracy": 0.9729781967401504,
         
     | 
| 208 | 
         
            +
                  "num_tokens": 2824699.0,
         
     | 
| 209 | 
         
             
                  "step": 525
         
     | 
| 210 | 
         
             
                },
         
     | 
| 211 | 
         
             
                {
         
     | 
| 212 | 
         
             
                  "epoch": 1.3258901629450814,
         
     | 
| 213 | 
         
            +
                  "grad_norm": 0.393573522567749,
         
     | 
| 214 | 
         
            +
                  "learning_rate": 0.0002588437669876384,
         
     | 
| 215 | 
         
            +
                  "loss": 0.0779,
         
     | 
| 216 | 
         
            +
                  "mean_token_accuracy": 0.9795191860198975,
         
     | 
| 217 | 
         
            +
                  "num_tokens": 2958826.0,
         
     | 
| 218 | 
         
             
                  "step": 550
         
     | 
| 219 | 
         
             
                },
         
     | 
| 220 | 
         
             
                {
         
     | 
| 221 | 
         
             
                  "epoch": 1.3862401931200965,
         
     | 
| 222 | 
         
            +
                  "grad_norm": 0.26299118995666504,
         
     | 
| 223 | 
         
            +
                  "learning_rate": 0.00025473286131319283,
         
     | 
| 224 | 
         
            +
                  "loss": 0.0988,
         
     | 
| 225 | 
         
            +
                  "mean_token_accuracy": 0.9739746767282486,
         
     | 
| 226 | 
         
            +
                  "num_tokens": 3092320.0,
         
     | 
| 227 | 
         
             
                  "step": 575
         
     | 
| 228 | 
         
             
                },
         
     | 
| 229 | 
         
             
                {
         
     | 
| 230 | 
         
             
                  "epoch": 1.4465902232951118,
         
     | 
| 231 | 
         
            +
                  "grad_norm": 0.3649594783782959,
         
     | 
| 232 | 
         
            +
                  "learning_rate": 0.0002504623854971937,
         
     | 
| 233 | 
         
            +
                  "loss": 0.0729,
         
     | 
| 234 | 
         
            +
                  "mean_token_accuracy": 0.9814109367132187,
         
     | 
| 235 | 
         
            +
                  "num_tokens": 3227452.0,
         
     | 
| 236 | 
         
             
                  "step": 600
         
     | 
| 237 | 
         
             
                },
         
     | 
| 238 | 
         
             
                {
         
     | 
| 239 | 
         
             
                  "epoch": 1.5069402534701268,
         
     | 
| 240 | 
         
            +
                  "grad_norm": 0.28632357716560364,
         
     | 
| 241 | 
         
            +
                  "learning_rate": 0.00024603884600210097,
         
     | 
| 242 | 
         
            +
                  "loss": 0.0957,
         
     | 
| 243 | 
         
            +
                  "mean_token_accuracy": 0.9748889011144638,
         
     | 
| 244 | 
         
            +
                  "num_tokens": 3361210.0,
         
     | 
| 245 | 
         
             
                  "step": 625
         
     | 
| 246 | 
         
             
                },
         
     | 
| 247 | 
         
             
                {
         
     | 
| 248 | 
         
             
                  "epoch": 1.567290283645142,
         
     | 
| 249 | 
         
            +
                  "grad_norm": 0.25492990016937256,
         
     | 
| 250 | 
         
            +
                  "learning_rate": 0.00024146898249695974,
         
     | 
| 251 | 
         
            +
                  "loss": 0.075,
         
     | 
| 252 | 
         
            +
                  "mean_token_accuracy": 0.9806595808267593,
         
     | 
| 253 | 
         
            +
                  "num_tokens": 3497177.0,
         
     | 
| 254 | 
         
             
                  "step": 650
         
     | 
| 255 | 
         
             
                },
         
     | 
| 256 | 
         
             
                {
         
     | 
| 257 | 
         
             
                  "epoch": 1.627640313820157,
         
     | 
| 258 | 
         
            +
                  "grad_norm": 0.37043872475624084,
         
     | 
| 259 | 
         
            +
                  "learning_rate": 0.00023675975758889506,
         
     | 
| 260 | 
         
            +
                  "loss": 0.0918,
         
     | 
| 261 | 
         
            +
                  "mean_token_accuracy": 0.9762868732213974,
         
     | 
| 262 | 
         
            +
                  "num_tokens": 3630834.0,
         
     | 
| 263 | 
         
             
                  "step": 675
         
     | 
| 264 | 
         
             
                },
         
     | 
| 265 | 
         
             
                {
         
     | 
| 266 | 
         
             
                  "epoch": 1.687990343995172,
         
     | 
| 267 | 
         
            +
                  "grad_norm": 0.26372411847114563,
         
     | 
| 268 | 
         
            +
                  "learning_rate": 0.00023191834621493968,
         
     | 
| 269 | 
         
            +
                  "loss": 0.0674,
         
     | 
| 270 | 
         
            +
                  "mean_token_accuracy": 0.9826526433229447,
         
     | 
| 271 | 
         
            +
                  "num_tokens": 3766598.0,
         
     | 
| 272 | 
         
             
                  "step": 700
         
     | 
| 273 | 
         
             
                },
         
     | 
| 274 | 
         
             
                {
         
     | 
| 275 | 
         
             
                  "epoch": 1.748340374170187,
         
     | 
| 276 | 
         
            +
                  "grad_norm": 0.2400335669517517,
         
     | 
| 277 | 
         
            +
                  "learning_rate": 0.00022695212471035816,
         
     | 
| 278 | 
         
            +
                  "loss": 0.0807,
         
     | 
| 279 | 
         
            +
                  "mean_token_accuracy": 0.9793906199932099,
         
     | 
| 280 | 
         
            +
                  "num_tokens": 3899644.0,
         
     | 
| 281 | 
         
             
                  "step": 725
         
     | 
| 282 | 
         
             
                },
         
     | 
| 283 | 
         
             
                {
         
     | 
| 284 | 
         
             
                  "epoch": 1.8086904043452021,
         
     | 
| 285 | 
         
            +
                  "grad_norm": 0.19833268225193024,
         
     | 
| 286 | 
         
            +
                  "learning_rate": 0.0002218686595701219,
         
     | 
| 287 | 
         
            +
                  "loss": 0.0655,
         
     | 
| 288 | 
         
            +
                  "mean_token_accuracy": 0.9832920217514038,
         
     | 
| 289 | 
         
            +
                  "num_tokens": 4036037.0,
         
     | 
| 290 | 
         
             
                  "step": 750
         
     | 
| 291 | 
         
             
                },
         
     | 
| 292 | 
         
             
                {
         
     | 
| 293 | 
         
             
                  "epoch": 1.8690404345202172,
         
     | 
| 294 | 
         
            +
                  "grad_norm": 0.17969554662704468,
         
     | 
| 295 | 
         
            +
                  "learning_rate": 0.0002166756959206587,
         
     | 
| 296 | 
         
            +
                  "loss": 0.0831,
         
     | 
| 297 | 
         
            +
                  "mean_token_accuracy": 0.9791438663005829,
         
     | 
| 298 | 
         
            +
                  "num_tokens": 4168035.0,
         
     | 
| 299 | 
         
             
                  "step": 775
         
     | 
| 300 | 
         
             
                },
         
     | 
| 301 | 
         
             
                {
         
     | 
| 302 | 
         
             
                  "epoch": 1.9293904646952322,
         
     | 
| 303 | 
         
            +
                  "grad_norm": 0.3069966733455658,
         
     | 
| 304 | 
         
            +
                  "learning_rate": 0.00021138114571944054,
         
     | 
| 305 | 
         
            +
                  "loss": 0.0624,
         
     | 
| 306 | 
         
            +
                  "mean_token_accuracy": 0.9839604765176773,
         
     | 
| 307 | 
         
            +
                  "num_tokens": 4302324.0,
         
     | 
| 308 | 
         
             
                  "step": 800
         
     | 
| 309 | 
         
             
                },
         
     | 
| 310 | 
         
             
                {
         
     | 
| 311 | 
         
             
                  "epoch": 1.9897404948702473,
         
     | 
| 312 | 
         
            +
                  "grad_norm": 0.26080530881881714,
         
     | 
| 313 | 
         
            +
                  "learning_rate": 0.000205993075700389,
         
     | 
| 314 | 
         
            +
                  "loss": 0.0728,
         
     | 
| 315 | 
         
            +
                  "mean_token_accuracy": 0.9816776049137116,
         
     | 
| 316 | 
         
            +
                  "num_tokens": 4428521.0,
         
     | 
| 317 | 
         
             
                  "step": 825
         
     | 
| 318 | 
         
             
                },
         
     | 
| 319 | 
         
             
                {
         
     | 
| 320 | 
         
             
                  "epoch": 2.0,
         
     | 
| 321 | 
         
            +
                  "eval_loss": 0.07739538699388504,
         
     | 
| 322 | 
         
            +
                  "eval_mean_token_accuracy": 0.9806474750106399,
         
     | 
| 323 | 
         
             
                  "eval_num_tokens": 4447026.0,
         
     | 
| 324 | 
         
            +
                  "eval_runtime": 60.6735,
         
     | 
| 325 | 
         
            +
                  "eval_samples_per_second": 6.082,
         
     | 
| 326 | 
         
            +
                  "eval_steps_per_second": 3.049,
         
     | 
| 327 | 
         
             
                  "step": 830
         
     | 
| 328 | 
         
             
                },
         
     | 
| 329 | 
         
             
                {
         
     | 
| 330 | 
         
             
                  "epoch": 2.048280024140012,
         
     | 
| 331 | 
         
            +
                  "grad_norm": 0.32912909984588623,
         
     | 
| 332 | 
         
            +
                  "learning_rate": 0.00020051969508346498,
         
     | 
| 333 | 
         
            +
                  "loss": 0.0624,
         
     | 
| 334 | 
         
            +
                  "mean_token_accuracy": 0.98369190680612,
         
     | 
| 335 | 
         
            +
                  "num_tokens": 4571335.0,
         
     | 
| 336 | 
         
             
                  "step": 850
         
     | 
| 337 | 
         
             
                },
         
     | 
| 338 | 
         
             
                {
         
     | 
| 339 | 
         
             
                  "epoch": 2.1086300543150274,
         
     | 
| 340 | 
         
            +
                  "grad_norm": 0.22884123027324677,
         
     | 
| 341 | 
         
            +
                  "learning_rate": 0.00019496934306716706,
         
     | 
| 342 | 
         
            +
                  "loss": 0.0543,
         
     | 
| 343 | 
         
            +
                  "mean_token_accuracy": 0.9862597143650055,
         
     | 
| 344 | 
         
            +
                  "num_tokens": 4694373.0,
         
     | 
| 345 | 
         
             
                  "step": 875
         
     | 
| 346 | 
         
             
                },
         
     | 
| 347 | 
         
             
                {
         
     | 
| 348 | 
         
             
                  "epoch": 2.1689800844900424,
         
     | 
| 349 | 
         
            +
                  "grad_norm": 0.15646718442440033,
         
     | 
| 350 | 
         
            +
                  "learning_rate": 0.00018935047612299625,
         
     | 
| 351 | 
         
            +
                  "loss": 0.0683,
         
     | 
| 352 | 
         
            +
                  "mean_token_accuracy": 0.9817469125986099,
         
     | 
| 353 | 
         
            +
                  "num_tokens": 4840032.0,
         
     | 
| 354 | 
         
             
                  "step": 900
         
     | 
| 355 | 
         
             
                },
         
     | 
| 356 | 
         
             
                {
         
     | 
| 357 | 
         
             
                  "epoch": 2.2293301146650575,
         
     | 
| 358 | 
         
            +
                  "grad_norm": 0.32684165239334106,
         
     | 
| 359 | 
         
            +
                  "learning_rate": 0.00018367165511124414,
         
     | 
| 360 | 
         
            +
                  "loss": 0.0558,
         
     | 
| 361 | 
         
            +
                  "mean_token_accuracy": 0.9862085193395614,
         
     | 
| 362 | 
         
            +
                  "num_tokens": 4962900.0,
         
     | 
| 363 | 
         
             
                  "step": 925
         
     | 
| 364 | 
         
             
                },
         
     | 
| 365 | 
         
             
                {
         
     | 
| 366 | 
         
             
                  "epoch": 2.2896801448400725,
         
     | 
| 367 | 
         
            +
                  "grad_norm": 0.15353620052337646,
         
     | 
| 368 | 
         
            +
                  "learning_rate": 0.00017794153223773558,
         
     | 
| 369 | 
         
            +
                  "loss": 0.0649,
         
     | 
| 370 | 
         
            +
                  "mean_token_accuracy": 0.9830775827169418,
         
     | 
| 371 | 
         
            +
                  "num_tokens": 5107775.0,
         
     | 
| 372 | 
         
             
                  "step": 950
         
     | 
| 373 | 
         
             
                },
         
     | 
| 374 | 
         
             
                {
         
     | 
| 375 | 
         
             
                  "epoch": 2.3500301750150876,
         
     | 
| 376 | 
         
            +
                  "grad_norm": 0.13864906132221222,
         
     | 
| 377 | 
         
            +
                  "learning_rate": 0.00017216883787139772,
         
     | 
| 378 | 
         
            +
                  "loss": 0.0513,
         
     | 
| 379 | 
         
            +
                  "mean_token_accuracy": 0.9871918082237243,
         
     | 
| 380 | 
         
            +
                  "num_tokens": 5231159.0,
         
     | 
| 381 | 
         
             
                  "step": 975
         
     | 
| 382 | 
         
             
                },
         
     | 
| 383 | 
         
             
                {
         
     | 
| 384 | 
         
             
                  "epoch": 2.4103802051901027,
         
     | 
| 385 | 
         
            +
                  "grad_norm": 0.18856066465377808,
         
     | 
| 386 | 
         
            +
                  "learning_rate": 0.00016636236724274,
         
     | 
| 387 | 
         
            +
                  "loss": 0.0653,
         
     | 
| 388 | 
         
            +
                  "mean_token_accuracy": 0.9824860644340515,
         
     | 
| 389 | 
         
            +
                  "num_tokens": 5375658.0,
         
     | 
| 390 | 
         
             
                  "step": 1000
         
     | 
| 391 | 
         
             
                },
         
     | 
| 392 | 
         
             
                {
         
     | 
| 393 | 
         
             
                  "epoch": 2.4707302353651177,
         
     | 
| 394 | 
         
            +
                  "grad_norm": 0.1747666597366333,
         
     | 
| 395 | 
         
            +
                  "learning_rate": 0.00016053096704351255,
         
     | 
| 396 | 
         
            +
                  "loss": 0.0536,
         
     | 
| 397 | 
         
            +
                  "mean_token_accuracy": 0.9870379114151001,
         
     | 
| 398 | 
         
            +
                  "num_tokens": 5498792.0,
         
     | 
| 399 | 
         
             
                  "step": 1025
         
     | 
| 400 | 
         
             
                },
         
     | 
| 401 | 
         
             
                {
         
     | 
| 402 | 
         
             
                  "epoch": 2.5310802655401328,
         
     | 
| 403 | 
         
            +
                  "grad_norm": 0.08616527169942856,
         
     | 
| 404 | 
         
            +
                  "learning_rate": 0.00015468352194795791,
         
     | 
| 405 | 
         
            +
                  "loss": 0.0605,
         
     | 
| 406 | 
         
            +
                  "mean_token_accuracy": 0.9837486296892166,
         
     | 
| 407 | 
         
            +
                  "num_tokens": 5644155.0,
         
     | 
| 408 | 
         
             
                  "step": 1050
         
     | 
| 409 | 
         
             
                },
         
     | 
| 410 | 
         
             
                {
         
     | 
| 411 | 
         
             
                  "epoch": 2.591430295715148,
         
     | 
| 412 | 
         
            +
                  "grad_norm": 0.21047131717205048,
         
     | 
| 413 | 
         
            +
                  "learning_rate": 0.00014882894107619277,
         
     | 
| 414 | 
         
            +
                  "loss": 0.0502,
         
     | 
| 415 | 
         
            +
                  "mean_token_accuracy": 0.9874639976024627,
         
     | 
| 416 | 
         
            +
                  "num_tokens": 5768255.0,
         
     | 
| 417 | 
         
             
                  "step": 1075
         
     | 
| 418 | 
         
             
                },
         
     | 
| 419 | 
         
             
                {
         
     | 
| 420 | 
         
             
                  "epoch": 2.651780325890163,
         
     | 
| 421 | 
         
            +
                  "grad_norm": 0.09520892798900604,
         
     | 
| 422 | 
         
            +
                  "learning_rate": 0.00014297614442034518,
         
     | 
| 423 | 
         
            +
                  "loss": 0.0568,
         
     | 
| 424 | 
         
            +
                  "mean_token_accuracy": 0.9851021945476532,
         
     | 
| 425 | 
         
            +
                  "num_tokens": 5913228.0,
         
     | 
| 426 | 
         
             
                  "step": 1100
         
     | 
| 427 | 
         
             
                },
         
     | 
| 428 | 
         
             
                {
         
     | 
| 429 | 
         
             
                  "epoch": 2.712130356065178,
         
     | 
| 430 | 
         
            +
                  "grad_norm": 0.11644323915243149,
         
     | 
| 431 | 
         
            +
                  "learning_rate": 0.000137134049254126,
         
     | 
| 432 | 
         
            +
                  "loss": 0.0523,
         
     | 
| 433 | 
         
            +
                  "mean_token_accuracy": 0.9867914581298828,
         
     | 
| 434 | 
         
            +
                  "num_tokens": 6037285.0,
         
     | 
| 435 | 
         
             
                  "step": 1125
         
     | 
| 436 | 
         
             
                },
         
     | 
| 437 | 
         
             
                {
         
     | 
| 438 | 
         
             
                  "epoch": 2.772480386240193,
         
     | 
| 439 | 
         
            +
                  "grad_norm": 0.12872624397277832,
         
     | 
| 440 | 
         
            +
                  "learning_rate": 0.000131311556546543,
         
     | 
| 441 | 
         
            +
                  "loss": 0.0563,
         
     | 
| 442 | 
         
            +
                  "mean_token_accuracy": 0.9849929654598236,
         
     | 
| 443 | 
         
            +
                  "num_tokens": 6183361.0,
         
     | 
| 444 | 
         
             
                  "step": 1150
         
     | 
| 445 | 
         
             
                },
         
     | 
| 446 | 
         
             
                {
         
     | 
| 447 | 
         
             
                  "epoch": 2.832830416415208,
         
     | 
| 448 | 
         
            +
                  "grad_norm": 0.10195529460906982,
         
     | 
| 449 | 
         
            +
                  "learning_rate": 0.0001255175374004563,
         
     | 
| 450 | 
         
            +
                  "loss": 0.0501,
         
     | 
| 451 | 
         
            +
                  "mean_token_accuracy": 0.9871714848279953,
         
     | 
| 452 | 
         
            +
                  "num_tokens": 6305713.0,
         
     | 
| 453 | 
         
             
                  "step": 1175
         
     | 
| 454 | 
         
             
                },
         
     | 
| 455 | 
         
             
                {
         
     | 
| 456 | 
         
             
                  "epoch": 2.8931804465902236,
         
     | 
| 457 | 
         
            +
                  "grad_norm": 0.09452041983604431,
         
     | 
| 458 | 
         
            +
                  "learning_rate": 0.0001197608195366377,
         
     | 
| 459 | 
         
            +
                  "loss": 0.0581,
         
     | 
| 460 | 
         
            +
                  "mean_token_accuracy": 0.9840293884277344,
         
     | 
| 461 | 
         
            +
                  "num_tokens": 6451719.0,
         
     | 
| 462 | 
         
             
                  "step": 1200
         
     | 
| 463 | 
         
             
                },
         
     | 
| 464 | 
         
             
                {
         
     | 
| 465 | 
         
             
                  "epoch": 2.9535304767652386,
         
     | 
| 466 | 
         
            +
                  "grad_norm": 0.17165224254131317,
         
     | 
| 467 | 
         
            +
                  "learning_rate": 0.00011405017384392655,
         
     | 
| 468 | 
         
            +
                  "loss": 0.049,
         
     | 
| 469 | 
         
            +
                  "mean_token_accuracy": 0.9875269651412963,
         
     | 
| 470 | 
         
            +
                  "num_tokens": 6575211.0,
         
     | 
| 471 | 
         
             
                  "step": 1225
         
     | 
| 472 | 
         
             
                },
         
     | 
| 473 | 
         
             
                {
         
     | 
| 474 | 
         
             
                  "epoch": 3.0,
         
     | 
| 475 | 
         
            +
                  "eval_loss": 0.06446010619401932,
         
     | 
| 476 | 
         
            +
                  "eval_mean_token_accuracy": 0.9841987928828677,
         
     | 
| 477 | 
         
             
                  "eval_num_tokens": 6670539.0,
         
     | 
| 478 | 
         
            +
                  "eval_runtime": 60.4296,
         
     | 
| 479 | 
         
            +
                  "eval_samples_per_second": 6.106,
         
     | 
| 480 | 
         
            +
                  "eval_steps_per_second": 3.061,
         
     | 
| 481 | 
         
             
                  "step": 1245
         
     | 
| 482 | 
         
             
                },
         
     | 
| 483 | 
         
             
                {
         
     | 
| 484 | 
         
             
                  "epoch": 3.012070006035003,
         
     | 
| 485 | 
         
            +
                  "grad_norm": 0.08178732544183731,
         
     | 
| 486 | 
         
            +
                  "learning_rate": 0.00010839430101597464,
         
     | 
| 487 | 
         
            +
                  "loss": 0.0535,
         
     | 
| 488 | 
         
            +
                  "mean_token_accuracy": 0.9864560107594913,
         
     | 
| 489 | 
         
            +
                  "num_tokens": 6706527.0,
         
     | 
| 490 | 
         
             
                  "step": 1250
         
     | 
| 491 | 
         
             
                },
         
     | 
| 492 | 
         
             
                {
         
     | 
| 493 | 
         
             
                  "epoch": 3.0724200362100182,
         
     | 
| 494 | 
         
            +
                  "grad_norm": 0.0654640942811966,
         
     | 
| 495 | 
         
            +
                  "learning_rate": 0.00010280181829493925,
         
     | 
| 496 | 
         
            +
                  "loss": 0.042,
         
     | 
| 497 | 
         
            +
                  "mean_token_accuracy": 0.9891558569669724,
         
     | 
| 498 | 
         
            +
                  "num_tokens": 6845866.0,
         
     | 
| 499 | 
         
             
                  "step": 1275
         
     | 
| 500 | 
         
             
                },
         
     | 
| 501 | 
         
             
                {
         
     | 
| 502 | 
         
             
                  "epoch": 3.1327700663850333,
         
     | 
| 503 | 
         
            +
                  "grad_norm": 0.13900737464427948,
         
     | 
| 504 | 
         
            +
                  "learning_rate": 9.728124634232282e-05,
         
     | 
| 505 | 
         
            +
                  "loss": 0.0496,
         
     | 
| 506 | 
         
            +
                  "mean_token_accuracy": 0.9874085110425949,
         
     | 
| 507 | 
         
            +
                  "num_tokens": 6972947.0,
         
     | 
| 508 | 
         
             
                  "step": 1300
         
     | 
| 509 | 
         
             
                },
         
     | 
| 510 | 
         
             
                {
         
     | 
| 511 | 
         
             
                  "epoch": 3.1931200965600484,
         
     | 
| 512 | 
         
            +
                  "grad_norm": 0.05376769229769707,
         
     | 
| 513 | 
         
            +
                  "learning_rate": 9.184099625696183e-05,
         
     | 
| 514 | 
         
            +
                  "loss": 0.0415,
         
     | 
| 515 | 
         
            +
                  "mean_token_accuracy": 0.9890899294614792,
         
     | 
| 516 | 
         
            +
                  "num_tokens": 7115975.0,
         
     | 
| 517 | 
         
             
                  "step": 1325
         
     | 
| 518 | 
         
             
                },
         
     | 
| 519 | 
         
             
                {
         
     | 
| 520 | 
         
             
                  "epoch": 3.2534701267350634,
         
     | 
| 521 | 
         
            +
                  "grad_norm": 0.12443197518587112,
         
     | 
| 522 | 
         
            +
                  "learning_rate": 8.648935675994459e-05,
         
     | 
| 523 | 
         
            +
                  "loss": 0.0484,
         
     | 
| 524 | 
         
            +
                  "mean_token_accuracy": 0.987565501332283,
         
     | 
| 525 | 
         
            +
                  "num_tokens": 7243324.0,
         
     | 
| 526 | 
         
             
                  "step": 1350
         
     | 
| 527 | 
         
             
                },
         
     | 
| 528 | 
         
             
                {
         
     | 
| 529 | 
         
             
                  "epoch": 3.3138201569100785,
         
     | 
| 530 | 
         
            +
                  "grad_norm": 0.0667525976896286,
         
     | 
| 531 | 
         
            +
                  "learning_rate": 8.123448156598283e-05,
         
     | 
| 532 | 
         
            +
                  "loss": 0.0415,
         
     | 
| 533 | 
         
            +
                  "mean_token_accuracy": 0.9890210199356079,
         
     | 
| 534 | 
         
            +
                  "num_tokens": 7385182.0,
         
     | 
| 535 | 
         
             
                  "step": 1375
         
     | 
| 536 | 
         
             
                },
         
     | 
| 537 | 
         
             
                {
         
     | 
| 538 | 
         
             
                  "epoch": 3.3741701870850935,
         
     | 
| 539 | 
         
            +
                  "grad_norm": 0.12773087620735168,
         
     | 
| 540 | 
         
            +
                  "learning_rate": 7.608437696047756e-05,
         
     | 
| 541 | 
         
            +
                  "loss": 0.0487,
         
     | 
| 542 | 
         
            +
                  "mean_token_accuracy": 0.9873174405097962,
         
     | 
| 543 | 
         
            +
                  "num_tokens": 7509648.0,
         
     | 
| 544 | 
         
             
                  "step": 1400
         
     | 
| 545 | 
         
             
                },
         
     | 
| 546 | 
         
             
                {
         
     | 
| 547 | 
         
             
                  "epoch": 3.4345202172601086,
         
     | 
| 548 | 
         
            +
                  "grad_norm": 0.07510969042778015,
         
     | 
| 549 | 
         
            +
                  "learning_rate": 7.104688960120769e-05,
         
     | 
| 550 | 
         
            +
                  "loss": 0.0403,
         
     | 
| 551 | 
         
            +
                  "mean_token_accuracy": 0.989400810599327,
         
     | 
| 552 | 
         
            +
                  "num_tokens": 7650532.0,
         
     | 
| 553 | 
         
             
                  "step": 1425
         
     | 
| 554 | 
         
             
                },
         
     | 
| 555 | 
         
             
                {
         
     | 
| 556 | 
         
             
                  "epoch": 3.4948702474351236,
         
     | 
| 557 | 
         
            +
                  "grad_norm": 0.24315868318080902,
         
     | 
| 558 | 
         
            +
                  "learning_rate": 6.612969456322507e-05,
         
     | 
| 559 | 
         
            +
                  "loss": 0.0493,
         
     | 
| 560 | 
         
            +
                  "mean_token_accuracy": 0.987003293633461,
         
     | 
| 561 | 
         
            +
                  "num_tokens": 7779847.0,
         
     | 
| 562 | 
         
             
                  "step": 1450
         
     | 
| 563 | 
         
             
                },
         
     | 
| 564 | 
         
             
                {
         
     | 
| 565 | 
         
             
                  "epoch": 3.5552202776101387,
         
     | 
| 566 | 
         
            +
                  "grad_norm": 0.0974864810705185,
         
     | 
| 567 | 
         
            +
                  "learning_rate": 6.134028364517273e-05,
         
     | 
| 568 | 
         
            +
                  "loss": 0.0405,
         
     | 
| 569 | 
         
            +
                  "mean_token_accuracy": 0.9892659622430802,
         
     | 
| 570 | 
         
            +
                  "num_tokens": 7922087.0,
         
     | 
| 571 | 
         
             
                  "step": 1475
         
     | 
| 572 | 
         
             
                },
         
     | 
| 573 | 
         
             
                {
         
     | 
| 574 | 
         
             
                  "epoch": 3.6155703077851538,
         
     | 
| 575 | 
         
            +
                  "grad_norm": 0.112852543592453,
         
     | 
| 576 | 
         
            +
                  "learning_rate": 5.6685953954840553e-05,
         
     | 
| 577 | 
         
            +
                  "loss": 0.0476,
         
     | 
| 578 | 
         
            +
                  "mean_token_accuracy": 0.9879545611143112,
         
     | 
| 579 | 
         
            +
                  "num_tokens": 8049661.0,
         
     | 
| 580 | 
         
             
                  "step": 1500
         
     | 
| 581 | 
         
             
                },
         
     | 
| 582 | 
         
             
                {
         
     | 
| 583 | 
         
             
                  "epoch": 3.675920337960169,
         
     | 
| 584 | 
         
            +
                  "grad_norm": 0.09587077796459198,
         
     | 
| 585 | 
         
            +
                  "learning_rate": 5.2173796791351116e-05,
         
     | 
| 586 | 
         
            +
                  "loss": 0.0399,
         
     | 
| 587 | 
         
            +
                  "mean_token_accuracy": 0.9899050652980804,
         
     | 
| 588 | 
         
            +
                  "num_tokens": 8191357.0,
         
     | 
| 589 | 
         
             
                  "step": 1525
         
     | 
| 590 | 
         
             
                },
         
     | 
| 591 | 
         
             
                {
         
     | 
| 592 | 
         
             
                  "epoch": 3.736270368135184,
         
     | 
| 593 | 
         
            +
                  "grad_norm": 0.15348604321479797,
         
     | 
| 594 | 
         
            +
                  "learning_rate": 4.781068684091327e-05,
         
     | 
| 595 | 
         
            +
                  "loss": 0.047,
         
     | 
| 596 | 
         
            +
                  "mean_token_accuracy": 0.9878348118066788,
         
     | 
| 597 | 
         
            +
                  "num_tokens": 8317709.0,
         
     | 
| 598 | 
         
             
                  "step": 1550
         
     | 
| 599 | 
         
             
                },
         
     | 
| 600 | 
         
             
                {
         
     | 
| 601 | 
         
             
                  "epoch": 3.796620398310199,
         
     | 
| 602 | 
         
            +
                  "grad_norm": 0.10841736942529678,
         
     | 
| 603 | 
         
            +
                  "learning_rate": 4.360327170260604e-05,
         
     | 
| 604 | 
         
            +
                  "loss": 0.0398,
         
     | 
| 605 | 
         
            +
                  "mean_token_accuracy": 0.9894819515943527,
         
     | 
| 606 | 
         
            +
                  "num_tokens": 8460448.0,
         
     | 
| 607 | 
         
             
                  "step": 1575
         
     | 
| 608 | 
         
             
                },
         
     | 
| 609 | 
         
             
                {
         
     | 
| 610 | 
         
             
                  "epoch": 3.856970428485214,
         
     | 
| 611 | 
         
            +
                  "grad_norm": 0.10409346967935562,
         
     | 
| 612 | 
         
            +
                  "learning_rate": 3.955796176015015e-05,
         
     | 
| 613 | 
         
            +
                  "loss": 0.0467,
         
     | 
| 614 | 
         
            +
                  "mean_token_accuracy": 0.9879930222034454,
         
     | 
| 615 | 
         
            +
                  "num_tokens": 8587426.0,
         
     | 
| 616 | 
         
             
                  "step": 1600
         
     | 
| 617 | 
         
             
                },
         
     | 
| 618 | 
         
             
                {
         
     | 
| 619 | 
         
             
                  "epoch": 3.9173204586602295,
         
     | 
| 620 | 
         
            +
                  "grad_norm": 0.08520140498876572,
         
     | 
| 621 | 
         
            +
                  "learning_rate": 3.5680920415099366e-05,
         
     | 
| 622 | 
         
            +
                  "loss": 0.0406,
         
     | 
| 623 | 
         
            +
                  "mean_token_accuracy": 0.9894054895639419,
         
     | 
| 624 | 
         
            +
                  "num_tokens": 8728471.0,
         
     | 
| 625 | 
         
             
                  "step": 1625
         
     | 
| 626 | 
         
             
                },
         
     | 
| 627 | 
         
             
                {
         
     | 
| 628 | 
         
             
                  "epoch": 3.9776704888352445,
         
     | 
| 629 | 
         
            +
                  "grad_norm": 0.10589835047721863,
         
     | 
| 630 | 
         
            +
                  "learning_rate": 3.197805469633152e-05,
         
     | 
| 631 | 
         
            +
                  "loss": 0.0458,
         
     | 
| 632 | 
         
            +
                  "mean_token_accuracy": 0.9883326524496079,
         
     | 
| 633 | 
         
            +
                  "num_tokens": 8850332.0,
         
     | 
| 634 | 
         
             
                  "step": 1650
         
     | 
| 635 | 
         
             
                },
         
     | 
| 636 | 
         
             
                {
         
     | 
| 637 | 
         
             
                  "epoch": 4.0,
         
     | 
| 638 | 
         
            +
                  "eval_loss": 0.06247411295771599,
         
     | 
| 639 | 
         
            +
                  "eval_mean_token_accuracy": 0.9854503702472996,
         
     | 
| 640 | 
         
             
                  "eval_num_tokens": 8894052.0,
         
     | 
| 641 | 
         
            +
                  "eval_runtime": 60.5535,
         
     | 
| 642 | 
         
            +
                  "eval_samples_per_second": 6.094,
         
     | 
| 643 | 
         
            +
                  "eval_steps_per_second": 3.055,
         
     | 
| 644 | 
         
             
                  "step": 1660
         
     | 
| 645 | 
         
             
                },
         
     | 
| 646 | 
         
             
                {
         
     | 
| 647 | 
         
             
                  "epoch": 4.036210018105009,
         
     | 
| 648 | 
         
            +
                  "grad_norm": 0.1120341494679451,
         
     | 
| 649 | 
         
            +
                  "learning_rate": 2.8455006260147228e-05,
         
     | 
| 650 | 
         
            +
                  "loss": 0.0407,
         
     | 
| 651 | 
         
            +
                  "mean_token_accuracy": 0.990142793385024,
         
     | 
| 652 | 
         
            +
                  "num_tokens": 8992280.0,
         
     | 
| 653 | 
         
             
                  "step": 1675
         
     | 
| 654 | 
         
             
                },
         
     | 
| 655 | 
         
             
                {
         
     | 
| 656 | 
         
             
                  "epoch": 4.096560048280024,
         
     | 
| 657 | 
         
            +
                  "grad_norm": 0.09372762590646744,
         
     | 
| 658 | 
         
            +
                  "learning_rate": 2.5117142794687618e-05,
         
     | 
| 659 | 
         
            +
                  "loss": 0.039,
         
     | 
| 660 | 
         
            +
                  "mean_token_accuracy": 0.990278902053833,
         
     | 
| 661 | 
         
            +
                  "num_tokens": 9121058.0,
         
     | 
| 662 | 
         
             
                  "step": 1700
         
     | 
| 663 | 
         
             
                },
         
     | 
| 664 | 
         
             
                {
         
     | 
| 665 | 
         
             
                  "epoch": 4.15691007845504,
         
     | 
| 666 | 
         
            +
                  "grad_norm": 0.15355007350444794,
         
     | 
| 667 | 
         
            +
                  "learning_rate": 2.1969549841768168e-05,
         
     | 
| 668 | 
         
            +
                  "loss": 0.0403,
         
     | 
| 669 | 
         
            +
                  "mean_token_accuracy": 0.9900617271661758,
         
     | 
| 670 | 
         
            +
                  "num_tokens": 9260440.0,
         
     | 
| 671 | 
         
             
                  "step": 1725
         
     | 
| 672 | 
         
             
                },
         
     | 
| 673 | 
         
             
                {
         
     | 
| 674 | 
         
             
                  "epoch": 4.217260108630055,
         
     | 
| 675 | 
         
            +
                  "grad_norm": 0.07104801386594772,
         
     | 
| 676 | 
         
            +
                  "learning_rate": 1.901702304858842e-05,
         
     | 
| 677 | 
         
            +
                  "loss": 0.0385,
         
     | 
| 678 | 
         
            +
                  "mean_token_accuracy": 0.9903567266464234,
         
     | 
| 679 | 
         
            +
                  "num_tokens": 9389750.0,
         
     | 
| 680 | 
         
             
                  "step": 1750
         
     | 
| 681 | 
         
             
                },
         
     | 
| 682 | 
         
             
                {
         
     | 
| 683 | 
         
             
                  "epoch": 4.27761013880507,
         
     | 
| 684 | 
         
            +
                  "grad_norm": 0.09815526008605957,
         
     | 
| 685 | 
         
            +
                  "learning_rate": 1.6264060861122442e-05,
         
     | 
| 686 | 
         
            +
                  "loss": 0.0406,
         
     | 
| 687 | 
         
            +
                  "mean_token_accuracy": 0.9897827422618866,
         
     | 
| 688 | 
         
            +
                  "num_tokens": 9529209.0,
         
     | 
| 689 | 
         
             
                  "step": 1775
         
     | 
| 690 | 
         
             
                },
         
     | 
| 691 | 
         
             
                {
         
     | 
| 692 | 
         
             
                  "epoch": 4.337960168980085,
         
     | 
| 693 | 
         
            +
                  "grad_norm": 0.07184392958879471,
         
     | 
| 694 | 
         
            +
                  "learning_rate": 1.3714857670322927e-05,
         
     | 
| 695 | 
         
            +
                  "loss": 0.0387,
         
     | 
| 696 | 
         
            +
                  "mean_token_accuracy": 0.9904780793190002,
         
     | 
| 697 | 
         
            +
                  "num_tokens": 9658250.0,
         
     | 
| 698 | 
         
             
                  "step": 1800
         
     | 
| 699 | 
         
             
                },
         
     | 
| 700 | 
         
             
                {
         
     | 
| 701 | 
         
             
                  "epoch": 4.3983101991551,
         
     | 
| 702 | 
         
            +
                  "grad_norm": 0.03805818408727646,
         
     | 
| 703 | 
         
            +
                  "learning_rate": 1.1373297421581129e-05,
         
     | 
| 704 | 
         
            +
                  "loss": 0.039,
         
     | 
| 705 | 
         
            +
                  "mean_token_accuracy": 0.9900361305475235,
         
     | 
| 706 | 
         
            +
                  "num_tokens": 9798481.0,
         
     | 
| 707 | 
         
             
                  "step": 1825
         
     | 
| 708 | 
         
             
                },
         
     | 
| 709 | 
         
             
                {
         
     | 
| 710 | 
         
             
                  "epoch": 4.458660229330115,
         
     | 
| 711 | 
         
            +
                  "grad_norm": 0.10860127955675125,
         
     | 
| 712 | 
         
            +
                  "learning_rate": 9.242947697178927e-06,
         
     | 
| 713 | 
         
            +
                  "loss": 0.0388,
         
     | 
| 714 | 
         
            +
                  "mean_token_accuracy": 0.9901992106437683,
         
     | 
| 715 | 
         
            +
                  "num_tokens": 9927421.0,
         
     | 
| 716 | 
         
             
                  "step": 1850
         
     | 
| 717 | 
         
             
                },
         
     | 
| 718 | 
         
             
                {
         
     | 
| 719 | 
         
             
                  "epoch": 4.51901025950513,
         
     | 
| 720 | 
         
            +
                  "grad_norm": 0.08905451744794846,
         
     | 
| 721 | 
         
            +
                  "learning_rate": 7.3270542807491675e-06,
         
     | 
| 722 | 
         
            +
                  "loss": 0.039,
         
     | 
| 723 | 
         
            +
                  "mean_token_accuracy": 0.9904217219352722,
         
     | 
| 724 | 
         
            +
                  "num_tokens": 10065813.0,
         
     | 
| 725 | 
         
             
                  "step": 1875
         
     | 
| 726 | 
         
             
                },
         
     | 
| 727 | 
         
             
                {
         
     | 
| 728 | 
         
             
                  "epoch": 4.579360289680145,
         
     | 
| 729 | 
         
            +
                  "grad_norm": 0.09814044833183289,
         
     | 
| 730 | 
         
            +
                  "learning_rate": 5.628536212026197e-06,
         
     | 
| 731 | 
         
            +
                  "loss": 0.0389,
         
     | 
| 732 | 
         
            +
                  "mean_token_accuracy": 0.9902842086553574,
         
     | 
| 733 | 
         
            +
                  "num_tokens": 10193933.0,
         
     | 
| 734 | 
         
             
                  "step": 1900
         
     | 
| 735 | 
         
             
                },
         
     | 
| 736 | 
         
             
                {
         
     | 
| 737 | 
         
             
                  "epoch": 4.63971031985516,
         
     | 
| 738 | 
         
            +
                  "grad_norm": 0.08526572585105896,
         
     | 
| 739 | 
         
            +
                  "learning_rate": 4.149981339420344e-06,
         
     | 
| 740 | 
         
            +
                  "loss": 0.0391,
         
     | 
| 741 | 
         
            +
                  "mean_token_accuracy": 0.9901294547319412,
         
     | 
| 742 | 
         
            +
                  "num_tokens": 10333978.0,
         
     | 
| 743 | 
         
             
                  "step": 1925
         
     | 
| 744 | 
         
             
                },
         
     | 
| 745 | 
         
             
                {
         
     | 
| 746 | 
         
             
                  "epoch": 4.700060350030175,
         
     | 
| 747 | 
         
            +
                  "grad_norm": 0.07220665365457535,
         
     | 
| 748 | 
         
            +
                  "learning_rate": 2.8936423771929897e-06,
         
     | 
| 749 | 
         
            +
                  "loss": 0.0388,
         
     | 
| 750 | 
         
            +
                  "mean_token_accuracy": 0.9902608853578567,
         
     | 
| 751 | 
         
            +
                  "num_tokens": 10462278.0,
         
     | 
| 752 | 
         
             
                  "step": 1950
         
     | 
| 753 | 
         
             
                },
         
     | 
| 754 | 
         
             
                {
         
     | 
| 755 | 
         
             
                  "epoch": 4.76041038020519,
         
     | 
| 756 | 
         
            +
                  "grad_norm": 0.06212342530488968,
         
     | 
| 757 | 
         
            +
                  "learning_rate": 1.8614334732393544e-06,
         
     | 
| 758 | 
         
            +
                  "loss": 0.0411,
         
     | 
| 759 | 
         
            +
                  "mean_token_accuracy": 0.9896285820007324,
         
     | 
| 760 | 
         
            +
                  "num_tokens": 10600837.0,
         
     | 
| 761 | 
         
             
                  "step": 1975
         
     | 
| 762 | 
         
             
                },
         
     | 
| 763 | 
         
             
                {
         
     | 
| 764 | 
         
             
                  "epoch": 4.820760410380205,
         
     | 
| 765 | 
         
            +
                  "grad_norm": 0.06338857114315033,
         
     | 
| 766 | 
         
            +
                  "learning_rate": 1.0549272927081964e-06,
         
     | 
| 767 | 
         
            +
                  "loss": 0.0387,
         
     | 
| 768 | 
         
            +
                  "mean_token_accuracy": 0.9904201912879944,
         
     | 
| 769 | 
         
            +
                  "num_tokens": 10730440.0,
         
     | 
| 770 | 
         
             
                  "step": 2000
         
     | 
| 771 | 
         
             
                },
         
     | 
| 772 | 
         
             
                {
         
     | 
| 773 | 
         
             
                  "epoch": 4.88111044055522,
         
     | 
| 774 | 
         
            +
                  "grad_norm": 0.07437903434038162,
         
     | 
| 775 | 
         
            +
                  "learning_rate": 4.753526219018755e-07,
         
     | 
| 776 | 
         
            +
                  "loss": 0.0394,
         
     | 
| 777 | 
         
            +
                  "mean_token_accuracy": 0.9902550059556962,
         
     | 
| 778 | 
         
            +
                  "num_tokens": 10868096.0,
         
     | 
| 779 | 
         
             
                  "step": 2025
         
     | 
| 780 | 
         
             
                },
         
     | 
| 781 | 
         
             
                {
         
     | 
| 782 | 
         
             
                  "epoch": 4.941460470730235,
         
     | 
| 783 | 
         
            +
                  "grad_norm": 0.04579373076558113,
         
     | 
| 784 | 
         
            +
                  "learning_rate": 1.235924961075496e-07,
         
     | 
| 785 | 
         
            +
                  "loss": 0.0376,
         
     | 
| 786 | 
         
            +
                  "mean_token_accuracy": 0.9905123418569565,
         
     | 
| 787 | 
         
            +
                  "num_tokens": 10997848.0,
         
     | 
| 788 | 
         
             
                  "step": 2050
         
     | 
| 789 | 
         
             
                },
         
     | 
| 790 | 
         
             
                {
         
     | 
| 791 | 
         
             
                  "epoch": 5.0,
         
     | 
| 792 | 
         
            +
                  "grad_norm": 0.4054584205150604,
         
     | 
| 793 | 
         
            +
                  "learning_rate": 1.8285421163888313e-10,
         
     | 
| 794 | 
         
            +
                  "loss": 0.0427,
         
     | 
| 795 | 
         
            +
                  "mean_token_accuracy": 0.989623335833402,
         
     | 
| 796 | 
         
             
                  "num_tokens": 11117565.0,
         
     | 
| 797 | 
         
             
                  "step": 2075
         
     | 
| 798 | 
         
             
                },
         
     | 
| 799 | 
         
             
                {
         
     | 
| 800 | 
         
             
                  "epoch": 5.0,
         
     | 
| 801 | 
         
            +
                  "eval_loss": 0.06487765908241272,
         
     | 
| 802 | 
         
            +
                  "eval_mean_token_accuracy": 0.9856334386645137,
         
     | 
| 803 | 
         
             
                  "eval_num_tokens": 11117565.0,
         
     | 
| 804 | 
         
            +
                  "eval_runtime": 60.3982,
         
     | 
| 805 | 
         
            +
                  "eval_samples_per_second": 6.109,
         
     | 
| 806 | 
         
            +
                  "eval_steps_per_second": 3.063,
         
     | 
| 807 | 
         
             
                  "step": 2075
         
     | 
| 808 | 
         
             
                }
         
     | 
| 809 | 
         
             
              ],
         
     | 
| 810 | 
         
             
              "logging_steps": 25,
         
     | 
| 811 | 
         
            +
              "max_steps": 2075,
         
     | 
| 812 | 
         
             
              "num_input_tokens_seen": 0,
         
     | 
| 813 | 
         
            +
              "num_train_epochs": 5,
         
     | 
| 814 | 
         
             
              "save_steps": 500,
         
     | 
| 815 | 
         
             
              "stateful_callbacks": {
         
     | 
| 816 | 
         
             
                "TrainerControl": {
         
     | 
| 
         | 
|
| 819 | 
         
             
                    "should_evaluate": false,
         
     | 
| 820 | 
         
             
                    "should_log": false,
         
     | 
| 821 | 
         
             
                    "should_save": true,
         
     | 
| 822 | 
         
            +
                    "should_training_stop": true
         
     | 
| 823 | 
         
             
                  },
         
     | 
| 824 | 
         
             
                  "attributes": {}
         
     | 
| 825 | 
         
             
                }
         
     | 
| 826 | 
         
             
              },
         
     | 
| 827 | 
         
            +
              "total_flos": 4.8081901975415194e+17,
         
     | 
| 828 | 
         
             
              "train_batch_size": 2,
         
     | 
| 829 | 
         
             
              "trial_name": null,
         
     | 
| 830 | 
         
             
              "trial_params": null
         
     | 
    	
        checkpoint-2075/training_args.bin
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 6033
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:51ece4ed6b1462de05ca804e04b783f884883c31cae5c545b5f19f6192d34a62
         
     | 
| 3 | 
         
             
            size 6033
         
     | 
    	
        checkpoint-415/adapter_config.json
    CHANGED
    
    | 
         @@ -25,12 +25,12 @@ 
     | 
|
| 25 | 
         
             
              "revision": null,
         
     | 
| 26 | 
         
             
              "target_modules": [
         
     | 
| 27 | 
         
             
                "gate_proj",
         
     | 
| 28 | 
         
            -
                " 
     | 
| 
         | 
|
| 29 | 
         
             
                "down_proj",
         
     | 
| 30 | 
         
             
                "o_proj",
         
     | 
| 31 | 
         
            -
                " 
     | 
| 32 | 
         
            -
                "up_proj" 
     | 
| 33 | 
         
            -
                "q_proj"
         
     | 
| 34 | 
         
             
              ],
         
     | 
| 35 | 
         
             
              "task_type": "CAUSAL_LM",
         
     | 
| 36 | 
         
             
              "trainable_token_indices": null,
         
     | 
| 
         | 
|
| 25 | 
         
             
              "revision": null,
         
     | 
| 26 | 
         
             
              "target_modules": [
         
     | 
| 27 | 
         
             
                "gate_proj",
         
     | 
| 28 | 
         
            +
                "v_proj",
         
     | 
| 29 | 
         
            +
                "q_proj",
         
     | 
| 30 | 
         
             
                "down_proj",
         
     | 
| 31 | 
         
             
                "o_proj",
         
     | 
| 32 | 
         
            +
                "k_proj",
         
     | 
| 33 | 
         
            +
                "up_proj"
         
     | 
| 
         | 
|
| 34 | 
         
             
              ],
         
     | 
| 35 | 
         
             
              "task_type": "CAUSAL_LM",
         
     | 
| 36 | 
         
             
              "trainable_token_indices": null,
         
     | 
    	
        checkpoint-415/adapter_model.safetensors
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 335604696
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:5bf5ae675205294eeba5d11ac731019e5b0219a04eafcb11d20c0499c6b861ee
         
     | 
| 3 | 
         
             
            size 335604696
         
     | 
    	
        checkpoint-415/optimizer.pt
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 671365003
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:183fae74a8296951631da0846dcb90052e47766f04a8cc0265ae7b68b64953b7
         
     | 
| 3 | 
         
             
            size 671365003
         
     | 
    	
        checkpoint-415/scheduler.pt
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 1465
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:311f04ff148a0643da9dd6ab46cae0944077a3236a48d5f9c8c146cef8cdb57f
         
     | 
| 3 | 
         
             
            size 1465
         
     | 
    	
        checkpoint-415/trainer_state.json
    CHANGED
    
    | 
         @@ -11,163 +11,163 @@ 
     | 
|
| 11 | 
         
             
              "log_history": [
         
     | 
| 12 | 
         
             
                {
         
     | 
| 13 | 
         
             
                  "epoch": 0.060350030175015085,
         
     | 
| 14 | 
         
            -
                  "grad_norm": 0. 
     | 
| 15 | 
         
            -
                  "learning_rate":  
     | 
| 16 | 
         
            -
                  "loss": 1. 
     | 
| 17 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 18 | 
         
            -
                  "num_tokens":  
     | 
| 19 | 
         
             
                  "step": 25
         
     | 
| 20 | 
         
             
                },
         
     | 
| 21 | 
         
             
                {
         
     | 
| 22 | 
         
             
                  "epoch": 0.12070006035003017,
         
     | 
| 23 | 
         
            -
                  "grad_norm": 0. 
     | 
| 24 | 
         
            -
                  "learning_rate": 0. 
     | 
| 25 | 
         
            -
                  "loss": 0. 
     | 
| 26 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 27 | 
         
            -
                  "num_tokens":  
     | 
| 28 | 
         
             
                  "step": 50
         
     | 
| 29 | 
         
             
                },
         
     | 
| 30 | 
         
             
                {
         
     | 
| 31 | 
         
             
                  "epoch": 0.18105009052504525,
         
     | 
| 32 | 
         
            -
                  "grad_norm": 0. 
     | 
| 33 | 
         
            -
                  "learning_rate": 0. 
     | 
| 34 | 
         
            -
                  "loss": 0. 
     | 
| 35 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 36 | 
         
            -
                  "num_tokens":  
     | 
| 37 | 
         
             
                  "step": 75
         
     | 
| 38 | 
         
             
                },
         
     | 
| 39 | 
         
             
                {
         
     | 
| 40 | 
         
             
                  "epoch": 0.24140012070006034,
         
     | 
| 41 | 
         
            -
                  "grad_norm": 0. 
     | 
| 42 | 
         
            -
                  "learning_rate": 0. 
     | 
| 43 | 
         
            -
                  "loss": 0. 
     | 
| 44 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 45 | 
         
            -
                  "num_tokens":  
     | 
| 46 | 
         
             
                  "step": 100
         
     | 
| 47 | 
         
             
                },
         
     | 
| 48 | 
         
             
                {
         
     | 
| 49 | 
         
             
                  "epoch": 0.30175015087507545,
         
     | 
| 50 | 
         
            -
                  "grad_norm": 0. 
     | 
| 51 | 
         
            -
                  "learning_rate": 0. 
     | 
| 52 | 
         
            -
                  "loss": 0. 
     | 
| 53 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 54 | 
         
            -
                  "num_tokens":  
     | 
| 55 | 
         
             
                  "step": 125
         
     | 
| 56 | 
         
             
                },
         
     | 
| 57 | 
         
             
                {
         
     | 
| 58 | 
         
             
                  "epoch": 0.3621001810500905,
         
     | 
| 59 | 
         
            -
                  "grad_norm":  
     | 
| 60 | 
         
            -
                  "learning_rate": 0. 
     | 
| 61 | 
         
            -
                  "loss": 0. 
     | 
| 62 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 63 | 
         
            -
                  "num_tokens":  
     | 
| 64 | 
         
             
                  "step": 150
         
     | 
| 65 | 
         
             
                },
         
     | 
| 66 | 
         
             
                {
         
     | 
| 67 | 
         
             
                  "epoch": 0.4224502112251056,
         
     | 
| 68 | 
         
            -
                  "grad_norm": 0. 
     | 
| 69 | 
         
            -
                  "learning_rate": 0. 
     | 
| 70 | 
         
            -
                  "loss": 0. 
     | 
| 71 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 72 | 
         
            -
                  "num_tokens":  
     | 
| 73 | 
         
             
                  "step": 175
         
     | 
| 74 | 
         
             
                },
         
     | 
| 75 | 
         
             
                {
         
     | 
| 76 | 
         
             
                  "epoch": 0.4828002414001207,
         
     | 
| 77 | 
         
            -
                  "grad_norm": 0. 
     | 
| 78 | 
         
            -
                  "learning_rate": 0. 
     | 
| 79 | 
         
            -
                  "loss": 0. 
     | 
| 80 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 81 | 
         
            -
                  "num_tokens":  
     | 
| 82 | 
         
             
                  "step": 200
         
     | 
| 83 | 
         
             
                },
         
     | 
| 84 | 
         
             
                {
         
     | 
| 85 | 
         
             
                  "epoch": 0.5431502715751357,
         
     | 
| 86 | 
         
            -
                  "grad_norm": 0. 
     | 
| 87 | 
         
            -
                  "learning_rate": 0. 
     | 
| 88 | 
         
            -
                  "loss": 0. 
     | 
| 89 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 90 | 
         
            -
                  "num_tokens":  
     | 
| 91 | 
         
             
                  "step": 225
         
     | 
| 92 | 
         
             
                },
         
     | 
| 93 | 
         
             
                {
         
     | 
| 94 | 
         
             
                  "epoch": 0.6035003017501509,
         
     | 
| 95 | 
         
            -
                  "grad_norm": 0. 
     | 
| 96 | 
         
            -
                  "learning_rate": 0. 
     | 
| 97 | 
         
            -
                  "loss": 0. 
     | 
| 98 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 99 | 
         
            -
                  "num_tokens":  
     | 
| 100 | 
         
             
                  "step": 250
         
     | 
| 101 | 
         
             
                },
         
     | 
| 102 | 
         
             
                {
         
     | 
| 103 | 
         
             
                  "epoch": 0.663850331925166,
         
     | 
| 104 | 
         
            -
                  "grad_norm": 0. 
     | 
| 105 | 
         
            -
                  "learning_rate": 0. 
     | 
| 106 | 
         
            -
                  "loss": 0. 
     | 
| 107 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 108 | 
         
            -
                  "num_tokens":  
     | 
| 109 | 
         
             
                  "step": 275
         
     | 
| 110 | 
         
             
                },
         
     | 
| 111 | 
         
             
                {
         
     | 
| 112 | 
         
             
                  "epoch": 0.724200362100181,
         
     | 
| 113 | 
         
            -
                  "grad_norm": 0. 
     | 
| 114 | 
         
            -
                  "learning_rate": 0. 
     | 
| 115 | 
         
            -
                  "loss": 0. 
     | 
| 116 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 117 | 
         
            -
                  "num_tokens":  
     | 
| 118 | 
         
             
                  "step": 300
         
     | 
| 119 | 
         
             
                },
         
     | 
| 120 | 
         
             
                {
         
     | 
| 121 | 
         
             
                  "epoch": 0.7845503922751962,
         
     | 
| 122 | 
         
            -
                  "grad_norm": 0. 
     | 
| 123 | 
         
            -
                  "learning_rate": 0. 
     | 
| 124 | 
         
            -
                  "loss": 0. 
     | 
| 125 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 126 | 
         
            -
                  "num_tokens":  
     | 
| 127 | 
         
             
                  "step": 325
         
     | 
| 128 | 
         
             
                },
         
     | 
| 129 | 
         
             
                {
         
     | 
| 130 | 
         
             
                  "epoch": 0.8449004224502112,
         
     | 
| 131 | 
         
            -
                  "grad_norm": 0. 
     | 
| 132 | 
         
            -
                  "learning_rate": 0. 
     | 
| 133 | 
         
            -
                  "loss": 0. 
     | 
| 134 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 135 | 
         
            -
                  "num_tokens":  
     | 
| 136 | 
         
             
                  "step": 350
         
     | 
| 137 | 
         
             
                },
         
     | 
| 138 | 
         
             
                {
         
     | 
| 139 | 
         
             
                  "epoch": 0.9052504526252263,
         
     | 
| 140 | 
         
            -
                  "grad_norm": 0. 
     | 
| 141 | 
         
            -
                  "learning_rate": 0. 
     | 
| 142 | 
         
            -
                  "loss": 0. 
     | 
| 143 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 144 | 
         
            -
                  "num_tokens":  
     | 
| 145 | 
         
             
                  "step": 375
         
     | 
| 146 | 
         
             
                },
         
     | 
| 147 | 
         
             
                {
         
     | 
| 148 | 
         
             
                  "epoch": 0.9656004828002414,
         
     | 
| 149 | 
         
            -
                  "grad_norm": 0. 
     | 
| 150 | 
         
            -
                  "learning_rate": 0. 
     | 
| 151 | 
         
            -
                  "loss": 0. 
     | 
| 152 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 153 | 
         
            -
                  "num_tokens":  
     | 
| 154 | 
         
             
                  "step": 400
         
     | 
| 155 | 
         
             
                },
         
     | 
| 156 | 
         
             
                {
         
     | 
| 157 | 
         
             
                  "epoch": 1.0,
         
     | 
| 158 | 
         
            -
                  "eval_loss": 0. 
     | 
| 159 | 
         
            -
                  "eval_mean_token_accuracy": 0. 
     | 
| 160 | 
         
             
                  "eval_num_tokens": 2223513.0,
         
     | 
| 161 | 
         
            -
                  "eval_runtime": 60. 
     | 
| 162 | 
         
            -
                  "eval_samples_per_second": 6. 
     | 
| 163 | 
         
            -
                  "eval_steps_per_second": 3. 
     | 
| 164 | 
         
             
                  "step": 415
         
     | 
| 165 | 
         
             
                }
         
     | 
| 166 | 
         
             
              ],
         
     | 
| 167 | 
         
             
              "logging_steps": 25,
         
     | 
| 168 | 
         
            -
              "max_steps":  
     | 
| 169 | 
         
             
              "num_input_tokens_seen": 0,
         
     | 
| 170 | 
         
            -
              "num_train_epochs":  
     | 
| 171 | 
         
             
              "save_steps": 500,
         
     | 
| 172 | 
         
             
              "stateful_callbacks": {
         
     | 
| 173 | 
         
             
                "TrainerControl": {
         
     | 
| 
         @@ -181,7 +181,7 @@ 
     | 
|
| 181 | 
         
             
                  "attributes": {}
         
     | 
| 182 | 
         
             
                }
         
     | 
| 183 | 
         
             
              },
         
     | 
| 184 | 
         
            -
              "total_flos": 9. 
     | 
| 185 | 
         
             
              "train_batch_size": 2,
         
     | 
| 186 | 
         
             
              "trial_name": null,
         
     | 
| 187 | 
         
             
              "trial_params": null
         
     | 
| 
         | 
|
| 11 | 
         
             
              "log_history": [
         
     | 
| 12 | 
         
             
                {
         
     | 
| 13 | 
         
             
                  "epoch": 0.060350030175015085,
         
     | 
| 14 | 
         
            +
                  "grad_norm": 0.7244853377342224,
         
     | 
| 15 | 
         
            +
                  "learning_rate": 0.00011428571428571427,
         
     | 
| 16 | 
         
            +
                  "loss": 1.5091,
         
     | 
| 17 | 
         
            +
                  "mean_token_accuracy": 0.6793113535642624,
         
     | 
| 18 | 
         
            +
                  "num_tokens": 152165.0,
         
     | 
| 19 | 
         
             
                  "step": 25
         
     | 
| 20 | 
         
             
                },
         
     | 
| 21 | 
         
             
                {
         
     | 
| 22 | 
         
             
                  "epoch": 0.12070006035003017,
         
     | 
| 23 | 
         
            +
                  "grad_norm": 0.8389242887496948,
         
     | 
| 24 | 
         
            +
                  "learning_rate": 0.0002333333333333333,
         
     | 
| 25 | 
         
            +
                  "loss": 0.8436,
         
     | 
| 26 | 
         
            +
                  "mean_token_accuracy": 0.7881802421808243,
         
     | 
| 27 | 
         
            +
                  "num_tokens": 267390.0,
         
     | 
| 28 | 
         
             
                  "step": 50
         
     | 
| 29 | 
         
             
                },
         
     | 
| 30 | 
         
             
                {
         
     | 
| 31 | 
         
             
                  "epoch": 0.18105009052504525,
         
     | 
| 32 | 
         
            +
                  "grad_norm": 0.6344988942146301,
         
     | 
| 33 | 
         
            +
                  "learning_rate": 0.00029997787517981614,
         
     | 
| 34 | 
         
            +
                  "loss": 0.5527,
         
     | 
| 35 | 
         
            +
                  "mean_token_accuracy": 0.8469069242477417,
         
     | 
| 36 | 
         
            +
                  "num_tokens": 420975.0,
         
     | 
| 37 | 
         
             
                  "step": 75
         
     | 
| 38 | 
         
             
                },
         
     | 
| 39 | 
         
             
                {
         
     | 
| 40 | 
         
             
                  "epoch": 0.24140012070006034,
         
     | 
| 41 | 
         
            +
                  "grad_norm": 0.7947192192077637,
         
     | 
| 42 | 
         
            +
                  "learning_rate": 0.0002997630832860032,
         
     | 
| 43 | 
         
            +
                  "loss": 0.4522,
         
     | 
| 44 | 
         
            +
                  "mean_token_accuracy": 0.870941441655159,
         
     | 
| 45 | 
         
            +
                  "num_tokens": 538380.0,
         
     | 
| 46 | 
         
             
                  "step": 100
         
     | 
| 47 | 
         
             
                },
         
     | 
| 48 | 
         
             
                {
         
     | 
| 49 | 
         
             
                  "epoch": 0.30175015087507545,
         
     | 
| 50 | 
         
            +
                  "grad_norm": 0.43716728687286377,
         
     | 
| 51 | 
         
            +
                  "learning_rate": 0.0002993201135681549,
         
     | 
| 52 | 
         
            +
                  "loss": 0.3049,
         
     | 
| 53 | 
         
            +
                  "mean_token_accuracy": 0.9136220461130142,
         
     | 
| 54 | 
         
            +
                  "num_tokens": 690650.0,
         
     | 
| 55 | 
         
             
                  "step": 125
         
     | 
| 56 | 
         
             
                },
         
     | 
| 57 | 
         
             
                {
         
     | 
| 58 | 
         
             
                  "epoch": 0.3621001810500905,
         
     | 
| 59 | 
         
            +
                  "grad_norm": 1.09097421169281,
         
     | 
| 60 | 
         
            +
                  "learning_rate": 0.0002986496409313553,
         
     | 
| 61 | 
         
            +
                  "loss": 0.3172,
         
     | 
| 62 | 
         
            +
                  "mean_token_accuracy": 0.91127048432827,
         
     | 
| 63 | 
         
            +
                  "num_tokens": 806066.0,
         
     | 
| 64 | 
         
             
                  "step": 150
         
     | 
| 65 | 
         
             
                },
         
     | 
| 66 | 
         
             
                {
         
     | 
| 67 | 
         
             
                  "epoch": 0.4224502112251056,
         
     | 
| 68 | 
         
            +
                  "grad_norm": 0.3773705065250397,
         
     | 
| 69 | 
         
            +
                  "learning_rate": 0.0002977526869022985,
         
     | 
| 70 | 
         
            +
                  "loss": 0.2029,
         
     | 
| 71 | 
         
            +
                  "mean_token_accuracy": 0.9433162885904313,
         
     | 
| 72 | 
         
            +
                  "num_tokens": 960853.0,
         
     | 
| 73 | 
         
             
                  "step": 175
         
     | 
| 74 | 
         
             
                },
         
     | 
| 75 | 
         
             
                {
         
     | 
| 76 | 
         
             
                  "epoch": 0.4828002414001207,
         
     | 
| 77 | 
         
            +
                  "grad_norm": 0.8292771577835083,
         
     | 
| 78 | 
         
            +
                  "learning_rate": 0.0002966306180728982,
         
     | 
| 79 | 
         
            +
                  "loss": 0.2274,
         
     | 
| 80 | 
         
            +
                  "mean_token_accuracy": 0.9385988712310791,
         
     | 
| 81 | 
         
            +
                  "num_tokens": 1077726.0,
         
     | 
| 82 | 
         
             
                  "step": 200
         
     | 
| 83 | 
         
             
                },
         
     | 
| 84 | 
         
             
                {
         
     | 
| 85 | 
         
             
                  "epoch": 0.5431502715751357,
         
     | 
| 86 | 
         
            +
                  "grad_norm": 0.4765889346599579,
         
     | 
| 87 | 
         
            +
                  "learning_rate": 0.0002952851440181598,
         
     | 
| 88 | 
         
            +
                  "loss": 0.19,
         
     | 
| 89 | 
         
            +
                  "mean_token_accuracy": 0.9479016721248626,
         
     | 
| 90 | 
         
            +
                  "num_tokens": 1232263.0,
         
     | 
| 91 | 
         
             
                  "step": 225
         
     | 
| 92 | 
         
             
                },
         
     | 
| 93 | 
         
             
                {
         
     | 
| 94 | 
         
             
                  "epoch": 0.6035003017501509,
         
     | 
| 95 | 
         
            +
                  "grad_norm": 0.9254749417304993,
         
     | 
| 96 | 
         
            +
                  "learning_rate": 0.0002937183146914856,
         
     | 
| 97 | 
         
            +
                  "loss": 0.1826,
         
     | 
| 98 | 
         
            +
                  "mean_token_accuracy": 0.9498224484920502,
         
     | 
| 99 | 
         
            +
                  "num_tokens": 1349057.0,
         
     | 
| 100 | 
         
             
                  "step": 250
         
     | 
| 101 | 
         
             
                },
         
     | 
| 102 | 
         
             
                {
         
     | 
| 103 | 
         
             
                  "epoch": 0.663850331925166,
         
     | 
| 104 | 
         
            +
                  "grad_norm": 0.4938018023967743,
         
     | 
| 105 | 
         
            +
                  "learning_rate": 0.000291932517301382,
         
     | 
| 106 | 
         
            +
                  "loss": 0.1497,
         
     | 
| 107 | 
         
            +
                  "mean_token_accuracy": 0.9588899296522141,
         
     | 
| 108 | 
         
            +
                  "num_tokens": 1496867.0,
         
     | 
| 109 | 
         
             
                  "step": 275
         
     | 
| 110 | 
         
             
                },
         
     | 
| 111 | 
         
             
                {
         
     | 
| 112 | 
         
             
                  "epoch": 0.724200362100181,
         
     | 
| 113 | 
         
            +
                  "grad_norm": 0.6995358467102051,
         
     | 
| 114 | 
         
            +
                  "learning_rate": 0.00028993047267432864,
         
     | 
| 115 | 
         
            +
                  "loss": 0.1578,
         
     | 
| 116 | 
         
            +
                  "mean_token_accuracy": 0.9568761509656906,
         
     | 
| 117 | 
         
            +
                  "num_tokens": 1610727.0,
         
     | 
| 118 | 
         
             
                  "step": 300
         
     | 
| 119 | 
         
             
                },
         
     | 
| 120 | 
         
             
                {
         
     | 
| 121 | 
         
             
                  "epoch": 0.7845503922751962,
         
     | 
| 122 | 
         
            +
                  "grad_norm": 0.46799567341804504,
         
     | 
| 123 | 
         
            +
                  "learning_rate": 0.0002877152311093483,
         
     | 
| 124 | 
         
            +
                  "loss": 0.1351,
         
     | 
| 125 | 
         
            +
                  "mean_token_accuracy": 0.9633717983961105,
         
     | 
| 126 | 
         
            +
                  "num_tokens": 1762041.0,
         
     | 
| 127 | 
         
             
                  "step": 325
         
     | 
| 128 | 
         
             
                },
         
     | 
| 129 | 
         
             
                {
         
     | 
| 130 | 
         
             
                  "epoch": 0.8449004224502112,
         
     | 
| 131 | 
         
            +
                  "grad_norm": 0.6729409098625183,
         
     | 
| 132 | 
         
            +
                  "learning_rate": 0.00028529016773059656,
         
     | 
| 133 | 
         
            +
                  "loss": 0.1206,
         
     | 
| 134 | 
         
            +
                  "mean_token_accuracy": 0.9687577307224273,
         
     | 
| 135 | 
         
            +
                  "num_tokens": 1877965.0,
         
     | 
| 136 | 
         
             
                  "step": 350
         
     | 
| 137 | 
         
             
                },
         
     | 
| 138 | 
         
             
                {
         
     | 
| 139 | 
         
             
                  "epoch": 0.9052504526252263,
         
     | 
| 140 | 
         
            +
                  "grad_norm": 0.5820412635803223,
         
     | 
| 141 | 
         
            +
                  "learning_rate": 0.00028265897734504976,
         
     | 
| 142 | 
         
            +
                  "loss": 0.1183,
         
     | 
| 143 | 
         
            +
                  "mean_token_accuracy": 0.96822787463665,
         
     | 
| 144 | 
         
            +
                  "num_tokens": 2028343.0,
         
     | 
| 145 | 
         
             
                  "step": 375
         
     | 
| 146 | 
         
             
                },
         
     | 
| 147 | 
         
             
                {
         
     | 
| 148 | 
         
             
                  "epoch": 0.9656004828002414,
         
     | 
| 149 | 
         
            +
                  "grad_norm": 0.8604497909545898,
         
     | 
| 150 | 
         
            +
                  "learning_rate": 0.0002798256688131267,
         
     | 
| 151 | 
         
            +
                  "loss": 0.1159,
         
     | 
| 152 | 
         
            +
                  "mean_token_accuracy": 0.9700725018978119,
         
     | 
| 153 | 
         
            +
                  "num_tokens": 2145044.0,
         
     | 
| 154 | 
         
             
                  "step": 400
         
     | 
| 155 | 
         
             
                },
         
     | 
| 156 | 
         
             
                {
         
     | 
| 157 | 
         
             
                  "epoch": 1.0,
         
     | 
| 158 | 
         
            +
                  "eval_loss": 0.1169130727648735,
         
     | 
| 159 | 
         
            +
                  "eval_mean_token_accuracy": 0.9691641559471955,
         
     | 
| 160 | 
         
             
                  "eval_num_tokens": 2223513.0,
         
     | 
| 161 | 
         
            +
                  "eval_runtime": 60.5832,
         
     | 
| 162 | 
         
            +
                  "eval_samples_per_second": 6.091,
         
     | 
| 163 | 
         
            +
                  "eval_steps_per_second": 3.054,
         
     | 
| 164 | 
         
             
                  "step": 415
         
     | 
| 165 | 
         
             
                }
         
     | 
| 166 | 
         
             
              ],
         
     | 
| 167 | 
         
             
              "logging_steps": 25,
         
     | 
| 168 | 
         
            +
              "max_steps": 2075,
         
     | 
| 169 | 
         
             
              "num_input_tokens_seen": 0,
         
     | 
| 170 | 
         
            +
              "num_train_epochs": 5,
         
     | 
| 171 | 
         
             
              "save_steps": 500,
         
     | 
| 172 | 
         
             
              "stateful_callbacks": {
         
     | 
| 173 | 
         
             
                "TrainerControl": {
         
     | 
| 
         | 
|
| 181 | 
         
             
                  "attributes": {}
         
     | 
| 182 | 
         
             
                }
         
     | 
| 183 | 
         
             
              },
         
     | 
| 184 | 
         
            +
              "total_flos": 9.616238744506368e+16,
         
     | 
| 185 | 
         
             
              "train_batch_size": 2,
         
     | 
| 186 | 
         
             
              "trial_name": null,
         
     | 
| 187 | 
         
             
              "trial_params": null
         
     | 
    	
        checkpoint-415/training_args.bin
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 6033
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:51ece4ed6b1462de05ca804e04b783f884883c31cae5c545b5f19f6192d34a62
         
     | 
| 3 | 
         
             
            size 6033
         
     | 
    	
        checkpoint-830/adapter_config.json
    CHANGED
    
    | 
         @@ -25,12 +25,12 @@ 
     | 
|
| 25 | 
         
             
              "revision": null,
         
     | 
| 26 | 
         
             
              "target_modules": [
         
     | 
| 27 | 
         
             
                "gate_proj",
         
     | 
| 28 | 
         
            -
                " 
     | 
| 
         | 
|
| 29 | 
         
             
                "down_proj",
         
     | 
| 30 | 
         
             
                "o_proj",
         
     | 
| 31 | 
         
            -
                " 
     | 
| 32 | 
         
            -
                "up_proj" 
     | 
| 33 | 
         
            -
                "q_proj"
         
     | 
| 34 | 
         
             
              ],
         
     | 
| 35 | 
         
             
              "task_type": "CAUSAL_LM",
         
     | 
| 36 | 
         
             
              "trainable_token_indices": null,
         
     | 
| 
         | 
|
| 25 | 
         
             
              "revision": null,
         
     | 
| 26 | 
         
             
              "target_modules": [
         
     | 
| 27 | 
         
             
                "gate_proj",
         
     | 
| 28 | 
         
            +
                "v_proj",
         
     | 
| 29 | 
         
            +
                "q_proj",
         
     | 
| 30 | 
         
             
                "down_proj",
         
     | 
| 31 | 
         
             
                "o_proj",
         
     | 
| 32 | 
         
            +
                "k_proj",
         
     | 
| 33 | 
         
            +
                "up_proj"
         
     | 
| 
         | 
|
| 34 | 
         
             
              ],
         
     | 
| 35 | 
         
             
              "task_type": "CAUSAL_LM",
         
     | 
| 36 | 
         
             
              "trainable_token_indices": null,
         
     | 
    	
        checkpoint-830/adapter_model.safetensors
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 335604696
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:f8d01a138277f59eca69e882d44422d570a51867a0e4aa786af14cc8feb79262
         
     | 
| 3 | 
         
             
            size 335604696
         
     | 
    	
        checkpoint-830/optimizer.pt
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 671365003
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:67939a1b4c40620345d19e45a7f2bd19f14e7e9469668bf8628b130b3ecc38a6
         
     | 
| 3 | 
         
             
            size 671365003
         
     | 
    	
        checkpoint-830/rng_state.pth
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 14645
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:a9856c7da9f14aa43e2ddd1bfa8ccdbb386e92bb3a630cc1dd4f810906eb72df
         
     | 
| 3 | 
         
             
            size 14645
         
     | 
    	
        checkpoint-830/scheduler.pt
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 1465
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:c9ebe35a4f401ad5492b7b53a3bf12f2d34ae11eae48d3c7b237ffa7851c70a5
         
     | 
| 3 | 
         
             
            size 1465
         
     | 
    	
        checkpoint-830/trainer_state.json
    CHANGED
    
    | 
         @@ -11,326 +11,326 @@ 
     | 
|
| 11 | 
         
             
              "log_history": [
         
     | 
| 12 | 
         
             
                {
         
     | 
| 13 | 
         
             
                  "epoch": 0.060350030175015085,
         
     | 
| 14 | 
         
            -
                  "grad_norm": 0. 
     | 
| 15 | 
         
            -
                  "learning_rate":  
     | 
| 16 | 
         
            -
                  "loss": 1. 
     | 
| 17 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 18 | 
         
            -
                  "num_tokens":  
     | 
| 19 | 
         
             
                  "step": 25
         
     | 
| 20 | 
         
             
                },
         
     | 
| 21 | 
         
             
                {
         
     | 
| 22 | 
         
             
                  "epoch": 0.12070006035003017,
         
     | 
| 23 | 
         
            -
                  "grad_norm": 0. 
     | 
| 24 | 
         
            -
                  "learning_rate": 0. 
     | 
| 25 | 
         
            -
                  "loss": 0. 
     | 
| 26 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 27 | 
         
            -
                  "num_tokens":  
     | 
| 28 | 
         
             
                  "step": 50
         
     | 
| 29 | 
         
             
                },
         
     | 
| 30 | 
         
             
                {
         
     | 
| 31 | 
         
             
                  "epoch": 0.18105009052504525,
         
     | 
| 32 | 
         
            -
                  "grad_norm": 0. 
     | 
| 33 | 
         
            -
                  "learning_rate": 0. 
     | 
| 34 | 
         
            -
                  "loss": 0. 
     | 
| 35 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 36 | 
         
            -
                  "num_tokens":  
     | 
| 37 | 
         
             
                  "step": 75
         
     | 
| 38 | 
         
             
                },
         
     | 
| 39 | 
         
             
                {
         
     | 
| 40 | 
         
             
                  "epoch": 0.24140012070006034,
         
     | 
| 41 | 
         
            -
                  "grad_norm": 0. 
     | 
| 42 | 
         
            -
                  "learning_rate": 0. 
     | 
| 43 | 
         
            -
                  "loss": 0. 
     | 
| 44 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 45 | 
         
            -
                  "num_tokens":  
     | 
| 46 | 
         
             
                  "step": 100
         
     | 
| 47 | 
         
             
                },
         
     | 
| 48 | 
         
             
                {
         
     | 
| 49 | 
         
             
                  "epoch": 0.30175015087507545,
         
     | 
| 50 | 
         
            -
                  "grad_norm": 0. 
     | 
| 51 | 
         
            -
                  "learning_rate": 0. 
     | 
| 52 | 
         
            -
                  "loss": 0. 
     | 
| 53 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 54 | 
         
            -
                  "num_tokens":  
     | 
| 55 | 
         
             
                  "step": 125
         
     | 
| 56 | 
         
             
                },
         
     | 
| 57 | 
         
             
                {
         
     | 
| 58 | 
         
             
                  "epoch": 0.3621001810500905,
         
     | 
| 59 | 
         
            -
                  "grad_norm":  
     | 
| 60 | 
         
            -
                  "learning_rate": 0. 
     | 
| 61 | 
         
            -
                  "loss": 0. 
     | 
| 62 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 63 | 
         
            -
                  "num_tokens":  
     | 
| 64 | 
         
             
                  "step": 150
         
     | 
| 65 | 
         
             
                },
         
     | 
| 66 | 
         
             
                {
         
     | 
| 67 | 
         
             
                  "epoch": 0.4224502112251056,
         
     | 
| 68 | 
         
            -
                  "grad_norm": 0. 
     | 
| 69 | 
         
            -
                  "learning_rate": 0. 
     | 
| 70 | 
         
            -
                  "loss": 0. 
     | 
| 71 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 72 | 
         
            -
                  "num_tokens":  
     | 
| 73 | 
         
             
                  "step": 175
         
     | 
| 74 | 
         
             
                },
         
     | 
| 75 | 
         
             
                {
         
     | 
| 76 | 
         
             
                  "epoch": 0.4828002414001207,
         
     | 
| 77 | 
         
            -
                  "grad_norm": 0. 
     | 
| 78 | 
         
            -
                  "learning_rate": 0. 
     | 
| 79 | 
         
            -
                  "loss": 0. 
     | 
| 80 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 81 | 
         
            -
                  "num_tokens":  
     | 
| 82 | 
         
             
                  "step": 200
         
     | 
| 83 | 
         
             
                },
         
     | 
| 84 | 
         
             
                {
         
     | 
| 85 | 
         
             
                  "epoch": 0.5431502715751357,
         
     | 
| 86 | 
         
            -
                  "grad_norm": 0. 
     | 
| 87 | 
         
            -
                  "learning_rate": 0. 
     | 
| 88 | 
         
            -
                  "loss": 0. 
     | 
| 89 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 90 | 
         
            -
                  "num_tokens":  
     | 
| 91 | 
         
             
                  "step": 225
         
     | 
| 92 | 
         
             
                },
         
     | 
| 93 | 
         
             
                {
         
     | 
| 94 | 
         
             
                  "epoch": 0.6035003017501509,
         
     | 
| 95 | 
         
            -
                  "grad_norm": 0. 
     | 
| 96 | 
         
            -
                  "learning_rate": 0. 
     | 
| 97 | 
         
            -
                  "loss": 0. 
     | 
| 98 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 99 | 
         
            -
                  "num_tokens":  
     | 
| 100 | 
         
             
                  "step": 250
         
     | 
| 101 | 
         
             
                },
         
     | 
| 102 | 
         
             
                {
         
     | 
| 103 | 
         
             
                  "epoch": 0.663850331925166,
         
     | 
| 104 | 
         
            -
                  "grad_norm": 0. 
     | 
| 105 | 
         
            -
                  "learning_rate": 0. 
     | 
| 106 | 
         
            -
                  "loss": 0. 
     | 
| 107 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 108 | 
         
            -
                  "num_tokens":  
     | 
| 109 | 
         
             
                  "step": 275
         
     | 
| 110 | 
         
             
                },
         
     | 
| 111 | 
         
             
                {
         
     | 
| 112 | 
         
             
                  "epoch": 0.724200362100181,
         
     | 
| 113 | 
         
            -
                  "grad_norm": 0. 
     | 
| 114 | 
         
            -
                  "learning_rate": 0. 
     | 
| 115 | 
         
            -
                  "loss": 0. 
     | 
| 116 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 117 | 
         
            -
                  "num_tokens":  
     | 
| 118 | 
         
             
                  "step": 300
         
     | 
| 119 | 
         
             
                },
         
     | 
| 120 | 
         
             
                {
         
     | 
| 121 | 
         
             
                  "epoch": 0.7845503922751962,
         
     | 
| 122 | 
         
            -
                  "grad_norm": 0. 
     | 
| 123 | 
         
            -
                  "learning_rate": 0. 
     | 
| 124 | 
         
            -
                  "loss": 0. 
     | 
| 125 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 126 | 
         
            -
                  "num_tokens":  
     | 
| 127 | 
         
             
                  "step": 325
         
     | 
| 128 | 
         
             
                },
         
     | 
| 129 | 
         
             
                {
         
     | 
| 130 | 
         
             
                  "epoch": 0.8449004224502112,
         
     | 
| 131 | 
         
            -
                  "grad_norm": 0. 
     | 
| 132 | 
         
            -
                  "learning_rate": 0. 
     | 
| 133 | 
         
            -
                  "loss": 0. 
     | 
| 134 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 135 | 
         
            -
                  "num_tokens":  
     | 
| 136 | 
         
             
                  "step": 350
         
     | 
| 137 | 
         
             
                },
         
     | 
| 138 | 
         
             
                {
         
     | 
| 139 | 
         
             
                  "epoch": 0.9052504526252263,
         
     | 
| 140 | 
         
            -
                  "grad_norm": 0. 
     | 
| 141 | 
         
            -
                  "learning_rate": 0. 
     | 
| 142 | 
         
            -
                  "loss": 0. 
     | 
| 143 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 144 | 
         
            -
                  "num_tokens":  
     | 
| 145 | 
         
             
                  "step": 375
         
     | 
| 146 | 
         
             
                },
         
     | 
| 147 | 
         
             
                {
         
     | 
| 148 | 
         
             
                  "epoch": 0.9656004828002414,
         
     | 
| 149 | 
         
            -
                  "grad_norm": 0. 
     | 
| 150 | 
         
            -
                  "learning_rate": 0. 
     | 
| 151 | 
         
            -
                  "loss": 0. 
     | 
| 152 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 153 | 
         
            -
                  "num_tokens":  
     | 
| 154 | 
         
             
                  "step": 400
         
     | 
| 155 | 
         
             
                },
         
     | 
| 156 | 
         
             
                {
         
     | 
| 157 | 
         
             
                  "epoch": 1.0,
         
     | 
| 158 | 
         
            -
                  "eval_loss": 0. 
     | 
| 159 | 
         
            -
                  "eval_mean_token_accuracy": 0. 
     | 
| 160 | 
         
             
                  "eval_num_tokens": 2223513.0,
         
     | 
| 161 | 
         
            -
                  "eval_runtime": 60. 
     | 
| 162 | 
         
            -
                  "eval_samples_per_second": 6. 
     | 
| 163 | 
         
            -
                  "eval_steps_per_second": 3. 
     | 
| 164 | 
         
             
                  "step": 415
         
     | 
| 165 | 
         
             
                },
         
     | 
| 166 | 
         
             
                {
         
     | 
| 167 | 
         
             
                  "epoch": 1.024140012070006,
         
     | 
| 168 | 
         
            -
                  "grad_norm": 0. 
     | 
| 169 | 
         
            -
                  "learning_rate": 0. 
     | 
| 170 | 
         
            -
                  "loss": 0. 
     | 
| 171 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 172 | 
         
            -
                  "num_tokens":  
     | 
| 173 | 
         
             
                  "step": 425
         
     | 
| 174 | 
         
             
                },
         
     | 
| 175 | 
         
             
                {
         
     | 
| 176 | 
         
             
                  "epoch": 1.0844900422450212,
         
     | 
| 177 | 
         
            -
                  "grad_norm": 0. 
     | 
| 178 | 
         
            -
                  "learning_rate": 0. 
     | 
| 179 | 
         
            -
                  "loss": 0. 
     | 
| 180 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 181 | 
         
            -
                  "num_tokens":  
     | 
| 182 | 
         
             
                  "step": 450
         
     | 
| 183 | 
         
             
                },
         
     | 
| 184 | 
         
             
                {
         
     | 
| 185 | 
         
             
                  "epoch": 1.1448400724200363,
         
     | 
| 186 | 
         
            -
                  "grad_norm": 0. 
     | 
| 187 | 
         
            -
                  "learning_rate": 0. 
     | 
| 188 | 
         
            -
                  "loss": 0. 
     | 
| 189 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 190 | 
         
            -
                  "num_tokens":  
     | 
| 191 | 
         
             
                  "step": 475
         
     | 
| 192 | 
         
             
                },
         
     | 
| 193 | 
         
             
                {
         
     | 
| 194 | 
         
             
                  "epoch": 1.2051901025950513,
         
     | 
| 195 | 
         
            -
                  "grad_norm": 0. 
     | 
| 196 | 
         
            -
                  "learning_rate": 0. 
     | 
| 197 | 
         
            -
                  "loss": 0. 
     | 
| 198 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 199 | 
         
            -
                  "num_tokens":  
     | 
| 200 | 
         
             
                  "step": 500
         
     | 
| 201 | 
         
             
                },
         
     | 
| 202 | 
         
             
                {
         
     | 
| 203 | 
         
             
                  "epoch": 1.2655401327700664,
         
     | 
| 204 | 
         
            -
                  "grad_norm": 0. 
     | 
| 205 | 
         
            -
                  "learning_rate": 0. 
     | 
| 206 | 
         
            -
                  "loss": 0. 
     | 
| 207 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 208 | 
         
            -
                  "num_tokens":  
     | 
| 209 | 
         
             
                  "step": 525
         
     | 
| 210 | 
         
             
                },
         
     | 
| 211 | 
         
             
                {
         
     | 
| 212 | 
         
             
                  "epoch": 1.3258901629450814,
         
     | 
| 213 | 
         
            -
                  "grad_norm": 0. 
     | 
| 214 | 
         
            -
                  "learning_rate": 0. 
     | 
| 215 | 
         
            -
                  "loss": 0. 
     | 
| 216 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 217 | 
         
            -
                  "num_tokens":  
     | 
| 218 | 
         
             
                  "step": 550
         
     | 
| 219 | 
         
             
                },
         
     | 
| 220 | 
         
             
                {
         
     | 
| 221 | 
         
             
                  "epoch": 1.3862401931200965,
         
     | 
| 222 | 
         
            -
                  "grad_norm": 0. 
     | 
| 223 | 
         
            -
                  "learning_rate": 0. 
     | 
| 224 | 
         
            -
                  "loss": 0. 
     | 
| 225 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 226 | 
         
            -
                  "num_tokens":  
     | 
| 227 | 
         
             
                  "step": 575
         
     | 
| 228 | 
         
             
                },
         
     | 
| 229 | 
         
             
                {
         
     | 
| 230 | 
         
             
                  "epoch": 1.4465902232951118,
         
     | 
| 231 | 
         
            -
                  "grad_norm": 0. 
     | 
| 232 | 
         
            -
                  "learning_rate": 0. 
     | 
| 233 | 
         
            -
                  "loss": 0. 
     | 
| 234 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 235 | 
         
            -
                  "num_tokens":  
     | 
| 236 | 
         
             
                  "step": 600
         
     | 
| 237 | 
         
             
                },
         
     | 
| 238 | 
         
             
                {
         
     | 
| 239 | 
         
             
                  "epoch": 1.5069402534701268,
         
     | 
| 240 | 
         
            -
                  "grad_norm": 0. 
     | 
| 241 | 
         
            -
                  "learning_rate": 0. 
     | 
| 242 | 
         
            -
                  "loss": 0. 
     | 
| 243 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 244 | 
         
            -
                  "num_tokens":  
     | 
| 245 | 
         
             
                  "step": 625
         
     | 
| 246 | 
         
             
                },
         
     | 
| 247 | 
         
             
                {
         
     | 
| 248 | 
         
             
                  "epoch": 1.567290283645142,
         
     | 
| 249 | 
         
            -
                  "grad_norm": 0. 
     | 
| 250 | 
         
            -
                  "learning_rate": 0. 
     | 
| 251 | 
         
            -
                  "loss": 0. 
     | 
| 252 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 253 | 
         
            -
                  "num_tokens":  
     | 
| 254 | 
         
             
                  "step": 650
         
     | 
| 255 | 
         
             
                },
         
     | 
| 256 | 
         
             
                {
         
     | 
| 257 | 
         
             
                  "epoch": 1.627640313820157,
         
     | 
| 258 | 
         
            -
                  "grad_norm": 0. 
     | 
| 259 | 
         
            -
                  "learning_rate": 0. 
     | 
| 260 | 
         
            -
                  "loss": 0. 
     | 
| 261 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 262 | 
         
            -
                  "num_tokens":  
     | 
| 263 | 
         
             
                  "step": 675
         
     | 
| 264 | 
         
             
                },
         
     | 
| 265 | 
         
             
                {
         
     | 
| 266 | 
         
             
                  "epoch": 1.687990343995172,
         
     | 
| 267 | 
         
            -
                  "grad_norm": 0. 
     | 
| 268 | 
         
            -
                  "learning_rate": 0. 
     | 
| 269 | 
         
            -
                  "loss": 0. 
     | 
| 270 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 271 | 
         
            -
                  "num_tokens":  
     | 
| 272 | 
         
             
                  "step": 700
         
     | 
| 273 | 
         
             
                },
         
     | 
| 274 | 
         
             
                {
         
     | 
| 275 | 
         
             
                  "epoch": 1.748340374170187,
         
     | 
| 276 | 
         
            -
                  "grad_norm": 0. 
     | 
| 277 | 
         
            -
                  "learning_rate": 0. 
     | 
| 278 | 
         
            -
                  "loss": 0. 
     | 
| 279 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 280 | 
         
            -
                  "num_tokens":  
     | 
| 281 | 
         
             
                  "step": 725
         
     | 
| 282 | 
         
             
                },
         
     | 
| 283 | 
         
             
                {
         
     | 
| 284 | 
         
             
                  "epoch": 1.8086904043452021,
         
     | 
| 285 | 
         
            -
                  "grad_norm": 0. 
     | 
| 286 | 
         
            -
                  "learning_rate": 0. 
     | 
| 287 | 
         
            -
                  "loss": 0. 
     | 
| 288 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 289 | 
         
            -
                  "num_tokens":  
     | 
| 290 | 
         
             
                  "step": 750
         
     | 
| 291 | 
         
             
                },
         
     | 
| 292 | 
         
             
                {
         
     | 
| 293 | 
         
             
                  "epoch": 1.8690404345202172,
         
     | 
| 294 | 
         
            -
                  "grad_norm": 0. 
     | 
| 295 | 
         
            -
                  "learning_rate": 0. 
     | 
| 296 | 
         
            -
                  "loss": 0. 
     | 
| 297 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 298 | 
         
            -
                  "num_tokens":  
     | 
| 299 | 
         
             
                  "step": 775
         
     | 
| 300 | 
         
             
                },
         
     | 
| 301 | 
         
             
                {
         
     | 
| 302 | 
         
             
                  "epoch": 1.9293904646952322,
         
     | 
| 303 | 
         
            -
                  "grad_norm": 0. 
     | 
| 304 | 
         
            -
                  "learning_rate": 0. 
     | 
| 305 | 
         
            -
                  "loss": 0. 
     | 
| 306 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 307 | 
         
            -
                  "num_tokens":  
     | 
| 308 | 
         
             
                  "step": 800
         
     | 
| 309 | 
         
             
                },
         
     | 
| 310 | 
         
             
                {
         
     | 
| 311 | 
         
             
                  "epoch": 1.9897404948702473,
         
     | 
| 312 | 
         
            -
                  "grad_norm": 0. 
     | 
| 313 | 
         
            -
                  "learning_rate": 0. 
     | 
| 314 | 
         
            -
                  "loss": 0. 
     | 
| 315 | 
         
            -
                  "mean_token_accuracy": 0. 
     | 
| 316 | 
         
            -
                  "num_tokens":  
     | 
| 317 | 
         
             
                  "step": 825
         
     | 
| 318 | 
         
             
                },
         
     | 
| 319 | 
         
             
                {
         
     | 
| 320 | 
         
             
                  "epoch": 2.0,
         
     | 
| 321 | 
         
            -
                  "eval_loss": 0. 
     | 
| 322 | 
         
            -
                  "eval_mean_token_accuracy": 0. 
     | 
| 323 | 
         
             
                  "eval_num_tokens": 4447026.0,
         
     | 
| 324 | 
         
            -
                  "eval_runtime": 60. 
     | 
| 325 | 
         
            -
                  "eval_samples_per_second": 6. 
     | 
| 326 | 
         
            -
                  "eval_steps_per_second": 3. 
     | 
| 327 | 
         
             
                  "step": 830
         
     | 
| 328 | 
         
             
                }
         
     | 
| 329 | 
         
             
              ],
         
     | 
| 330 | 
         
             
              "logging_steps": 25,
         
     | 
| 331 | 
         
            -
              "max_steps":  
     | 
| 332 | 
         
             
              "num_input_tokens_seen": 0,
         
     | 
| 333 | 
         
            -
              "num_train_epochs":  
     | 
| 334 | 
         
             
              "save_steps": 500,
         
     | 
| 335 | 
         
             
              "stateful_callbacks": {
         
     | 
| 336 | 
         
             
                "TrainerControl": {
         
     | 
| 
         @@ -344,7 +344,7 @@ 
     | 
|
| 344 | 
         
             
                  "attributes": {}
         
     | 
| 345 | 
         
             
                }
         
     | 
| 346 | 
         
             
              },
         
     | 
| 347 | 
         
            -
              "total_flos": 1. 
     | 
| 348 | 
         
             
              "train_batch_size": 2,
         
     | 
| 349 | 
         
             
              "trial_name": null,
         
     | 
| 350 | 
         
             
              "trial_params": null
         
     | 
| 
         | 
|
| 11 | 
         
             
              "log_history": [
         
     | 
| 12 | 
         
             
                {
         
     | 
| 13 | 
         
             
                  "epoch": 0.060350030175015085,
         
     | 
| 14 | 
         
            +
                  "grad_norm": 0.7244853377342224,
         
     | 
| 15 | 
         
            +
                  "learning_rate": 0.00011428571428571427,
         
     | 
| 16 | 
         
            +
                  "loss": 1.5091,
         
     | 
| 17 | 
         
            +
                  "mean_token_accuracy": 0.6793113535642624,
         
     | 
| 18 | 
         
            +
                  "num_tokens": 152165.0,
         
     | 
| 19 | 
         
             
                  "step": 25
         
     | 
| 20 | 
         
             
                },
         
     | 
| 21 | 
         
             
                {
         
     | 
| 22 | 
         
             
                  "epoch": 0.12070006035003017,
         
     | 
| 23 | 
         
            +
                  "grad_norm": 0.8389242887496948,
         
     | 
| 24 | 
         
            +
                  "learning_rate": 0.0002333333333333333,
         
     | 
| 25 | 
         
            +
                  "loss": 0.8436,
         
     | 
| 26 | 
         
            +
                  "mean_token_accuracy": 0.7881802421808243,
         
     | 
| 27 | 
         
            +
                  "num_tokens": 267390.0,
         
     | 
| 28 | 
         
             
                  "step": 50
         
     | 
| 29 | 
         
             
                },
         
     | 
| 30 | 
         
             
                {
         
     | 
| 31 | 
         
             
                  "epoch": 0.18105009052504525,
         
     | 
| 32 | 
         
            +
                  "grad_norm": 0.6344988942146301,
         
     | 
| 33 | 
         
            +
                  "learning_rate": 0.00029997787517981614,
         
     | 
| 34 | 
         
            +
                  "loss": 0.5527,
         
     | 
| 35 | 
         
            +
                  "mean_token_accuracy": 0.8469069242477417,
         
     | 
| 36 | 
         
            +
                  "num_tokens": 420975.0,
         
     | 
| 37 | 
         
             
                  "step": 75
         
     | 
| 38 | 
         
             
                },
         
     | 
| 39 | 
         
             
                {
         
     | 
| 40 | 
         
             
                  "epoch": 0.24140012070006034,
         
     | 
| 41 | 
         
            +
                  "grad_norm": 0.7947192192077637,
         
     | 
| 42 | 
         
            +
                  "learning_rate": 0.0002997630832860032,
         
     | 
| 43 | 
         
            +
                  "loss": 0.4522,
         
     | 
| 44 | 
         
            +
                  "mean_token_accuracy": 0.870941441655159,
         
     | 
| 45 | 
         
            +
                  "num_tokens": 538380.0,
         
     | 
| 46 | 
         
             
                  "step": 100
         
     | 
| 47 | 
         
             
                },
         
     | 
| 48 | 
         
             
                {
         
     | 
| 49 | 
         
             
                  "epoch": 0.30175015087507545,
         
     | 
| 50 | 
         
            +
                  "grad_norm": 0.43716728687286377,
         
     | 
| 51 | 
         
            +
                  "learning_rate": 0.0002993201135681549,
         
     | 
| 52 | 
         
            +
                  "loss": 0.3049,
         
     | 
| 53 | 
         
            +
                  "mean_token_accuracy": 0.9136220461130142,
         
     | 
| 54 | 
         
            +
                  "num_tokens": 690650.0,
         
     | 
| 55 | 
         
             
                  "step": 125
         
     | 
| 56 | 
         
             
                },
         
     | 
| 57 | 
         
             
                {
         
     | 
| 58 | 
         
             
                  "epoch": 0.3621001810500905,
         
     | 
| 59 | 
         
            +
                  "grad_norm": 1.09097421169281,
         
     | 
| 60 | 
         
            +
                  "learning_rate": 0.0002986496409313553,
         
     | 
| 61 | 
         
            +
                  "loss": 0.3172,
         
     | 
| 62 | 
         
            +
                  "mean_token_accuracy": 0.91127048432827,
         
     | 
| 63 | 
         
            +
                  "num_tokens": 806066.0,
         
     | 
| 64 | 
         
             
                  "step": 150
         
     | 
| 65 | 
         
             
                },
         
     | 
| 66 | 
         
             
                {
         
     | 
| 67 | 
         
             
                  "epoch": 0.4224502112251056,
         
     | 
| 68 | 
         
            +
                  "grad_norm": 0.3773705065250397,
         
     | 
| 69 | 
         
            +
                  "learning_rate": 0.0002977526869022985,
         
     | 
| 70 | 
         
            +
                  "loss": 0.2029,
         
     | 
| 71 | 
         
            +
                  "mean_token_accuracy": 0.9433162885904313,
         
     | 
| 72 | 
         
            +
                  "num_tokens": 960853.0,
         
     | 
| 73 | 
         
             
                  "step": 175
         
     | 
| 74 | 
         
             
                },
         
     | 
| 75 | 
         
             
                {
         
     | 
| 76 | 
         
             
                  "epoch": 0.4828002414001207,
         
     | 
| 77 | 
         
            +
                  "grad_norm": 0.8292771577835083,
         
     | 
| 78 | 
         
            +
                  "learning_rate": 0.0002966306180728982,
         
     | 
| 79 | 
         
            +
                  "loss": 0.2274,
         
     | 
| 80 | 
         
            +
                  "mean_token_accuracy": 0.9385988712310791,
         
     | 
| 81 | 
         
            +
                  "num_tokens": 1077726.0,
         
     | 
| 82 | 
         
             
                  "step": 200
         
     | 
| 83 | 
         
             
                },
         
     | 
| 84 | 
         
             
                {
         
     | 
| 85 | 
         
             
                  "epoch": 0.5431502715751357,
         
     | 
| 86 | 
         
            +
                  "grad_norm": 0.4765889346599579,
         
     | 
| 87 | 
         
            +
                  "learning_rate": 0.0002952851440181598,
         
     | 
| 88 | 
         
            +
                  "loss": 0.19,
         
     | 
| 89 | 
         
            +
                  "mean_token_accuracy": 0.9479016721248626,
         
     | 
| 90 | 
         
            +
                  "num_tokens": 1232263.0,
         
     | 
| 91 | 
         
             
                  "step": 225
         
     | 
| 92 | 
         
             
                },
         
     | 
| 93 | 
         
             
                {
         
     | 
| 94 | 
         
             
                  "epoch": 0.6035003017501509,
         
     | 
| 95 | 
         
            +
                  "grad_norm": 0.9254749417304993,
         
     | 
| 96 | 
         
            +
                  "learning_rate": 0.0002937183146914856,
         
     | 
| 97 | 
         
            +
                  "loss": 0.1826,
         
     | 
| 98 | 
         
            +
                  "mean_token_accuracy": 0.9498224484920502,
         
     | 
| 99 | 
         
            +
                  "num_tokens": 1349057.0,
         
     | 
| 100 | 
         
             
                  "step": 250
         
     | 
| 101 | 
         
             
                },
         
     | 
| 102 | 
         
             
                {
         
     | 
| 103 | 
         
             
                  "epoch": 0.663850331925166,
         
     | 
| 104 | 
         
            +
                  "grad_norm": 0.4938018023967743,
         
     | 
| 105 | 
         
            +
                  "learning_rate": 0.000291932517301382,
         
     | 
| 106 | 
         
            +
                  "loss": 0.1497,
         
     | 
| 107 | 
         
            +
                  "mean_token_accuracy": 0.9588899296522141,
         
     | 
| 108 | 
         
            +
                  "num_tokens": 1496867.0,
         
     | 
| 109 | 
         
             
                  "step": 275
         
     | 
| 110 | 
         
             
                },
         
     | 
| 111 | 
         
             
                {
         
     | 
| 112 | 
         
             
                  "epoch": 0.724200362100181,
         
     | 
| 113 | 
         
            +
                  "grad_norm": 0.6995358467102051,
         
     | 
| 114 | 
         
            +
                  "learning_rate": 0.00028993047267432864,
         
     | 
| 115 | 
         
            +
                  "loss": 0.1578,
         
     | 
| 116 | 
         
            +
                  "mean_token_accuracy": 0.9568761509656906,
         
     | 
| 117 | 
         
            +
                  "num_tokens": 1610727.0,
         
     | 
| 118 | 
         
             
                  "step": 300
         
     | 
| 119 | 
         
             
                },
         
     | 
| 120 | 
         
             
                {
         
     | 
| 121 | 
         
             
                  "epoch": 0.7845503922751962,
         
     | 
| 122 | 
         
            +
                  "grad_norm": 0.46799567341804504,
         
     | 
| 123 | 
         
            +
                  "learning_rate": 0.0002877152311093483,
         
     | 
| 124 | 
         
            +
                  "loss": 0.1351,
         
     | 
| 125 | 
         
            +
                  "mean_token_accuracy": 0.9633717983961105,
         
     | 
| 126 | 
         
            +
                  "num_tokens": 1762041.0,
         
     | 
| 127 | 
         
             
                  "step": 325
         
     | 
| 128 | 
         
             
                },
         
     | 
| 129 | 
         
             
                {
         
     | 
| 130 | 
         
             
                  "epoch": 0.8449004224502112,
         
     | 
| 131 | 
         
            +
                  "grad_norm": 0.6729409098625183,
         
     | 
| 132 | 
         
            +
                  "learning_rate": 0.00028529016773059656,
         
     | 
| 133 | 
         
            +
                  "loss": 0.1206,
         
     | 
| 134 | 
         
            +
                  "mean_token_accuracy": 0.9687577307224273,
         
     | 
| 135 | 
         
            +
                  "num_tokens": 1877965.0,
         
     | 
| 136 | 
         
             
                  "step": 350
         
     | 
| 137 | 
         
             
                },
         
     | 
| 138 | 
         
             
                {
         
     | 
| 139 | 
         
             
                  "epoch": 0.9052504526252263,
         
     | 
| 140 | 
         
            +
                  "grad_norm": 0.5820412635803223,
         
     | 
| 141 | 
         
            +
                  "learning_rate": 0.00028265897734504976,
         
     | 
| 142 | 
         
            +
                  "loss": 0.1183,
         
     | 
| 143 | 
         
            +
                  "mean_token_accuracy": 0.96822787463665,
         
     | 
| 144 | 
         
            +
                  "num_tokens": 2028343.0,
         
     | 
| 145 | 
         
             
                  "step": 375
         
     | 
| 146 | 
         
             
                },
         
     | 
| 147 | 
         
             
                {
         
     | 
| 148 | 
         
             
                  "epoch": 0.9656004828002414,
         
     | 
| 149 | 
         
            +
                  "grad_norm": 0.8604497909545898,
         
     | 
| 150 | 
         
            +
                  "learning_rate": 0.0002798256688131267,
         
     | 
| 151 | 
         
            +
                  "loss": 0.1159,
         
     | 
| 152 | 
         
            +
                  "mean_token_accuracy": 0.9700725018978119,
         
     | 
| 153 | 
         
            +
                  "num_tokens": 2145044.0,
         
     | 
| 154 | 
         
             
                  "step": 400
         
     | 
| 155 | 
         
             
                },
         
     | 
| 156 | 
         
             
                {
         
     | 
| 157 | 
         
             
                  "epoch": 1.0,
         
     | 
| 158 | 
         
            +
                  "eval_loss": 0.1169130727648735,
         
     | 
| 159 | 
         
            +
                  "eval_mean_token_accuracy": 0.9691641559471955,
         
     | 
| 160 | 
         
             
                  "eval_num_tokens": 2223513.0,
         
     | 
| 161 | 
         
            +
                  "eval_runtime": 60.5832,
         
     | 
| 162 | 
         
            +
                  "eval_samples_per_second": 6.091,
         
     | 
| 163 | 
         
            +
                  "eval_steps_per_second": 3.054,
         
     | 
| 164 | 
         
             
                  "step": 415
         
     | 
| 165 | 
         
             
                },
         
     | 
| 166 | 
         
             
                {
         
     | 
| 167 | 
         
             
                  "epoch": 1.024140012070006,
         
     | 
| 168 | 
         
            +
                  "grad_norm": 0.20096616446971893,
         
     | 
| 169 | 
         
            +
                  "learning_rate": 0.0002767945589408217,
         
     | 
| 170 | 
         
            +
                  "loss": 0.122,
         
     | 
| 171 | 
         
            +
                  "mean_token_accuracy": 0.9680000224064306,
         
     | 
| 172 | 
         
            +
                  "num_tokens": 2291746.0,
         
     | 
| 173 | 
         
             
                  "step": 425
         
     | 
| 174 | 
         
             
                },
         
     | 
| 175 | 
         
             
                {
         
     | 
| 176 | 
         
             
                  "epoch": 1.0844900422450212,
         
     | 
| 177 | 
         
            +
                  "grad_norm": 0.34665247797966003,
         
     | 
| 178 | 
         
            +
                  "learning_rate": 0.0002735702659026533,
         
     | 
| 179 | 
         
            +
                  "loss": 0.0836,
         
     | 
| 180 | 
         
            +
                  "mean_token_accuracy": 0.9780776232481003,
         
     | 
| 181 | 
         
            +
                  "num_tokens": 2424528.0,
         
     | 
| 182 | 
         
             
                  "step": 450
         
     | 
| 183 | 
         
             
                },
         
     | 
| 184 | 
         
             
                {
         
     | 
| 185 | 
         
             
                  "epoch": 1.1448400724200363,
         
     | 
| 186 | 
         
            +
                  "grad_norm": 0.30349963903427124,
         
     | 
| 187 | 
         
            +
                  "learning_rate": 0.0002701577022054515,
         
     | 
| 188 | 
         
            +
                  "loss": 0.1019,
         
     | 
| 189 | 
         
            +
                  "mean_token_accuracy": 0.9732917118072509,
         
     | 
| 190 | 
         
            +
                  "num_tokens": 2557091.0,
         
     | 
| 191 | 
         
             
                  "step": 475
         
     | 
| 192 | 
         
             
                },
         
     | 
| 193 | 
         
             
                {
         
     | 
| 194 | 
         
             
                  "epoch": 1.2051901025950513,
         
     | 
| 195 | 
         
            +
                  "grad_norm": 0.3892677426338196,
         
     | 
| 196 | 
         
            +
                  "learning_rate": 0.0002665620672037014,
         
     | 
| 197 | 
         
            +
                  "loss": 0.0831,
         
     | 
| 198 | 
         
            +
                  "mean_token_accuracy": 0.9782004028558731,
         
     | 
| 199 | 
         
            +
                  "num_tokens": 2691527.0,
         
     | 
| 200 | 
         
             
                  "step": 500
         
     | 
| 201 | 
         
             
                },
         
     | 
| 202 | 
         
             
                {
         
     | 
| 203 | 
         
             
                  "epoch": 1.2655401327700664,
         
     | 
| 204 | 
         
            +
                  "grad_norm": 0.29889699816703796,
         
     | 
| 205 | 
         
            +
                  "learning_rate": 0.0002627888391778493,
         
     | 
| 206 | 
         
            +
                  "loss": 0.1023,
         
     | 
| 207 | 
         
            +
                  "mean_token_accuracy": 0.9729781967401504,
         
     | 
| 208 | 
         
            +
                  "num_tokens": 2824699.0,
         
     | 
| 209 | 
         
             
                  "step": 525
         
     | 
| 210 | 
         
             
                },
         
     | 
| 211 | 
         
             
                {
         
     | 
| 212 | 
         
             
                  "epoch": 1.3258901629450814,
         
     | 
| 213 | 
         
            +
                  "grad_norm": 0.393573522567749,
         
     | 
| 214 | 
         
            +
                  "learning_rate": 0.0002588437669876384,
         
     | 
| 215 | 
         
            +
                  "loss": 0.0779,
         
     | 
| 216 | 
         
            +
                  "mean_token_accuracy": 0.9795191860198975,
         
     | 
| 217 | 
         
            +
                  "num_tokens": 2958826.0,
         
     | 
| 218 | 
         
             
                  "step": 550
         
     | 
| 219 | 
         
             
                },
         
     | 
| 220 | 
         
             
                {
         
     | 
| 221 | 
         
             
                  "epoch": 1.3862401931200965,
         
     | 
| 222 | 
         
            +
                  "grad_norm": 0.26299118995666504,
         
     | 
| 223 | 
         
            +
                  "learning_rate": 0.00025473286131319283,
         
     | 
| 224 | 
         
            +
                  "loss": 0.0988,
         
     | 
| 225 | 
         
            +
                  "mean_token_accuracy": 0.9739746767282486,
         
     | 
| 226 | 
         
            +
                  "num_tokens": 3092320.0,
         
     | 
| 227 | 
         
             
                  "step": 575
         
     | 
| 228 | 
         
             
                },
         
     | 
| 229 | 
         
             
                {
         
     | 
| 230 | 
         
             
                  "epoch": 1.4465902232951118,
         
     | 
| 231 | 
         
            +
                  "grad_norm": 0.3649594783782959,
         
     | 
| 232 | 
         
            +
                  "learning_rate": 0.0002504623854971937,
         
     | 
| 233 | 
         
            +
                  "loss": 0.0729,
         
     | 
| 234 | 
         
            +
                  "mean_token_accuracy": 0.9814109367132187,
         
     | 
| 235 | 
         
            +
                  "num_tokens": 3227452.0,
         
     | 
| 236 | 
         
             
                  "step": 600
         
     | 
| 237 | 
         
             
                },
         
     | 
| 238 | 
         
             
                {
         
     | 
| 239 | 
         
             
                  "epoch": 1.5069402534701268,
         
     | 
| 240 | 
         
            +
                  "grad_norm": 0.28632357716560364,
         
     | 
| 241 | 
         
            +
                  "learning_rate": 0.00024603884600210097,
         
     | 
| 242 | 
         
            +
                  "loss": 0.0957,
         
     | 
| 243 | 
         
            +
                  "mean_token_accuracy": 0.9748889011144638,
         
     | 
| 244 | 
         
            +
                  "num_tokens": 3361210.0,
         
     | 
| 245 | 
         
             
                  "step": 625
         
     | 
| 246 | 
         
             
                },
         
     | 
| 247 | 
         
             
                {
         
     | 
| 248 | 
         
             
                  "epoch": 1.567290283645142,
         
     | 
| 249 | 
         
            +
                  "grad_norm": 0.25492990016937256,
         
     | 
| 250 | 
         
            +
                  "learning_rate": 0.00024146898249695974,
         
     | 
| 251 | 
         
            +
                  "loss": 0.075,
         
     | 
| 252 | 
         
            +
                  "mean_token_accuracy": 0.9806595808267593,
         
     | 
| 253 | 
         
            +
                  "num_tokens": 3497177.0,
         
     | 
| 254 | 
         
             
                  "step": 650
         
     | 
| 255 | 
         
             
                },
         
     | 
| 256 | 
         
             
                {
         
     | 
| 257 | 
         
             
                  "epoch": 1.627640313820157,
         
     | 
| 258 | 
         
            +
                  "grad_norm": 0.37043872475624084,
         
     | 
| 259 | 
         
            +
                  "learning_rate": 0.00023675975758889506,
         
     | 
| 260 | 
         
            +
                  "loss": 0.0918,
         
     | 
| 261 | 
         
            +
                  "mean_token_accuracy": 0.9762868732213974,
         
     | 
| 262 | 
         
            +
                  "num_tokens": 3630834.0,
         
     | 
| 263 | 
         
             
                  "step": 675
         
     | 
| 264 | 
         
             
                },
         
     | 
| 265 | 
         
             
                {
         
     | 
| 266 | 
         
             
                  "epoch": 1.687990343995172,
         
     | 
| 267 | 
         
            +
                  "grad_norm": 0.26372411847114563,
         
     | 
| 268 | 
         
            +
                  "learning_rate": 0.00023191834621493968,
         
     | 
| 269 | 
         
            +
                  "loss": 0.0674,
         
     | 
| 270 | 
         
            +
                  "mean_token_accuracy": 0.9826526433229447,
         
     | 
| 271 | 
         
            +
                  "num_tokens": 3766598.0,
         
     | 
| 272 | 
         
             
                  "step": 700
         
     | 
| 273 | 
         
             
                },
         
     | 
| 274 | 
         
             
                {
         
     | 
| 275 | 
         
             
                  "epoch": 1.748340374170187,
         
     | 
| 276 | 
         
            +
                  "grad_norm": 0.2400335669517517,
         
     | 
| 277 | 
         
            +
                  "learning_rate": 0.00022695212471035816,
         
     | 
| 278 | 
         
            +
                  "loss": 0.0807,
         
     | 
| 279 | 
         
            +
                  "mean_token_accuracy": 0.9793906199932099,
         
     | 
| 280 | 
         
            +
                  "num_tokens": 3899644.0,
         
     | 
| 281 | 
         
             
                  "step": 725
         
     | 
| 282 | 
         
             
                },
         
     | 
| 283 | 
         
             
                {
         
     | 
| 284 | 
         
             
                  "epoch": 1.8086904043452021,
         
     | 
| 285 | 
         
            +
                  "grad_norm": 0.19833268225193024,
         
     | 
| 286 | 
         
            +
                  "learning_rate": 0.0002218686595701219,
         
     | 
| 287 | 
         
            +
                  "loss": 0.0655,
         
     | 
| 288 | 
         
            +
                  "mean_token_accuracy": 0.9832920217514038,
         
     | 
| 289 | 
         
            +
                  "num_tokens": 4036037.0,
         
     | 
| 290 | 
         
             
                  "step": 750
         
     | 
| 291 | 
         
             
                },
         
     | 
| 292 | 
         
             
                {
         
     | 
| 293 | 
         
             
                  "epoch": 1.8690404345202172,
         
     | 
| 294 | 
         
            +
                  "grad_norm": 0.17969554662704468,
         
     | 
| 295 | 
         
            +
                  "learning_rate": 0.0002166756959206587,
         
     | 
| 296 | 
         
            +
                  "loss": 0.0831,
         
     | 
| 297 | 
         
            +
                  "mean_token_accuracy": 0.9791438663005829,
         
     | 
| 298 | 
         
            +
                  "num_tokens": 4168035.0,
         
     | 
| 299 | 
         
             
                  "step": 775
         
     | 
| 300 | 
         
             
                },
         
     | 
| 301 | 
         
             
                {
         
     | 
| 302 | 
         
             
                  "epoch": 1.9293904646952322,
         
     | 
| 303 | 
         
            +
                  "grad_norm": 0.3069966733455658,
         
     | 
| 304 | 
         
            +
                  "learning_rate": 0.00021138114571944054,
         
     | 
| 305 | 
         
            +
                  "loss": 0.0624,
         
     | 
| 306 | 
         
            +
                  "mean_token_accuracy": 0.9839604765176773,
         
     | 
| 307 | 
         
            +
                  "num_tokens": 4302324.0,
         
     | 
| 308 | 
         
             
                  "step": 800
         
     | 
| 309 | 
         
             
                },
         
     | 
| 310 | 
         
             
                {
         
     | 
| 311 | 
         
             
                  "epoch": 1.9897404948702473,
         
     | 
| 312 | 
         
            +
                  "grad_norm": 0.26080530881881714,
         
     | 
| 313 | 
         
            +
                  "learning_rate": 0.000205993075700389,
         
     | 
| 314 | 
         
            +
                  "loss": 0.0728,
         
     | 
| 315 | 
         
            +
                  "mean_token_accuracy": 0.9816776049137116,
         
     | 
| 316 | 
         
            +
                  "num_tokens": 4428521.0,
         
     | 
| 317 | 
         
             
                  "step": 825
         
     | 
| 318 | 
         
             
                },
         
     | 
| 319 | 
         
             
                {
         
     | 
| 320 | 
         
             
                  "epoch": 2.0,
         
     | 
| 321 | 
         
            +
                  "eval_loss": 0.07739538699388504,
         
     | 
| 322 | 
         
            +
                  "eval_mean_token_accuracy": 0.9806474750106399,
         
     | 
| 323 | 
         
             
                  "eval_num_tokens": 4447026.0,
         
     | 
| 324 | 
         
            +
                  "eval_runtime": 60.6735,
         
     | 
| 325 | 
         
            +
                  "eval_samples_per_second": 6.082,
         
     | 
| 326 | 
         
            +
                  "eval_steps_per_second": 3.049,
         
     | 
| 327 | 
         
             
                  "step": 830
         
     | 
| 328 | 
         
             
                }
         
     | 
| 329 | 
         
             
              ],
         
     | 
| 330 | 
         
             
              "logging_steps": 25,
         
     | 
| 331 | 
         
            +
              "max_steps": 2075,
         
     | 
| 332 | 
         
             
              "num_input_tokens_seen": 0,
         
     | 
| 333 | 
         
            +
              "num_train_epochs": 5,
         
     | 
| 334 | 
         
             
              "save_steps": 500,
         
     | 
| 335 | 
         
             
              "stateful_callbacks": {
         
     | 
| 336 | 
         
             
                "TrainerControl": {
         
     | 
| 
         | 
|
| 344 | 
         
             
                  "attributes": {}
         
     | 
| 345 | 
         
             
                }
         
     | 
| 346 | 
         
             
              },
         
     | 
| 347 | 
         
            +
              "total_flos": 1.9232524993779302e+17,
         
     | 
| 348 | 
         
             
              "train_batch_size": 2,
         
     | 
| 349 | 
         
             
              "trial_name": null,
         
     | 
| 350 | 
         
             
              "trial_params": null
         
     | 
    	
        checkpoint-830/training_args.bin
    CHANGED
    
    | 
         @@ -1,3 +1,3 @@ 
     | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            -
            oid sha256: 
     | 
| 3 | 
         
             
            size 6033
         
     | 
| 
         | 
|
| 1 | 
         
             
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:51ece4ed6b1462de05ca804e04b783f884883c31cae5c545b5f19f6192d34a62
         
     | 
| 3 | 
         
             
            size 6033
         
     | 
    	
        runs/Aug06_01-20-31_pan/events.out.tfevents.1754457633.pan.717279.0
    ADDED
    
    | 
         @@ -0,0 +1,3 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:003644eb00207d06ce8534292f943fb3b729b61b343617a64fdb4b319c927ccc
         
     | 
| 3 | 
         
            +
            size 35474
         
     | 
    	
        runs/Aug06_01-20-31_pan/events.out.tfevents.1754464590.pan.717279.1
    ADDED
    
    | 
         @@ -0,0 +1,3 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:e082e332d3b6af0c62c0dee5fa387dfefca7b066e14a0e1b6645603c077ed32a
         
     | 
| 3 | 
         
            +
            size 478
         
     | 
    	
        runs/Aug06_10-56-17_pan/events.out.tfevents.1754492237.pan.744812.0
    ADDED
    
    | 
         @@ -0,0 +1,3 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:f4de8c46818b9a72a780a412de993d94c89bbdc1d0f0648ba30f316c39ca6c68
         
     | 
| 3 | 
         
            +
            size 473
         
     |