pszemraj committed on
Commit
d85a90d
·
verified ·
1 Parent(s): 2e4381e

Upload folder using huggingface_hub

Browse files
checkpoints/.hydra/config.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ mode: pt
2
+ device: gpu
3
+ precision: bf16
4
+ eval_only: false
5
+ predict_only: false
6
+ seed: 80085
7
+ model:
8
+ klass: local_t5
9
+ name: pszemraj/tFINE-base-65kBPE-FLAN
10
+ overwrite:
11
+ dropout_rate: 0.0
12
+ add_config:
13
+ is_bf16: false
14
+ checkpoint_path: ''
15
+ random_init: true
16
+ compile: true
17
+ tokenizer:
18
+ name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5
19
+ data:
20
+ input_length: 1024
21
+ mlm_probability: 0.15
22
+ mean_noise_span_length: 3.0
23
+ num_workers: 8
24
+ optim:
25
+ name: adamwscale
26
+ base_lr: 0.008
27
+ batch_size: 120
28
+ total_steps: 80000
29
+ epochs: -1
30
+ warmup_steps: 10000
31
+ lr_scheduler: cosine
32
+ weight_decay: 0.0001
33
+ grad_clip: 1.0
34
+ grad_acc: 24
35
+ final_cosine: 1.0e-05
36
+ eval:
37
+ every_steps: 100000
38
+ steps: 500
39
+ checkpoint:
40
+ every_steps: 5000
41
+ logging:
42
+ neptune: false
43
+ neptune_creds:
44
+ project: null
45
+ api_token: null
46
+ tags: ''
47
+ every_steps: 50
48
+ grad_l2: true
49
+ weights_l2: true
checkpoints/.hydra/hydra.yaml ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ./logs/${now:%Y-%m-%d}/${now:%H-%M-%S}-${logging.neptune_creds.tags}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task: []
115
+ job:
116
+ name: main
117
+ chdir: true
118
+ override_dirname: ''
119
+ id: ???
120
+ num: ???
121
+ config_name: default
122
+ env_set: {}
123
+ env_copy: []
124
+ config:
125
+ override_dirname:
126
+ kv_sep: '='
127
+ item_sep: ','
128
+ exclude_keys: []
129
+ runtime:
130
+ version: 1.3.2
131
+ version_base: '1.1'
132
+ cwd: /workspace/nanoT5
133
+ config_sources:
134
+ - path: hydra.conf
135
+ schema: pkg
136
+ provider: hydra
137
+ - path: /workspace/nanoT5/nanoT5/configs
138
+ schema: file
139
+ provider: main
140
+ - path: ''
141
+ schema: structured
142
+ provider: schema
143
+ output_dir: /workspace/nanoT5/logs/2024-08-09/08-30-29-
144
+ choices:
145
+ local_env: default
146
+ task: pt
147
+ hydra/env: default
148
+ hydra/callbacks: null
149
+ hydra/job_logging: default
150
+ hydra/hydra_logging: default
151
+ hydra/hydra_help: default
152
+ hydra/help: default
153
+ hydra/sweeper: basic
154
+ hydra/launcher: basic
155
+ hydra/output: default
156
+ verbose: false
checkpoints/.hydra/overrides.yaml ADDED
@@ -0,0 +1 @@
 
 
1
+ []
checkpoints/checkpoint-pt-10000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f49fbb3e36013bc83be014866e4f85ff9bb2334b4c0c1154d411c64e5324b19
3
+ size 1202681712
checkpoints/checkpoint-pt-10000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:634ae87ad9ec14553a807f970f4e595e3fef7b62fd4afaddf671a76426ff94ed
3
+ size 14344
checkpoints/checkpoint-pt-5000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60b2bfff7bec1e5e4cb66aa287758e4728355ecefceca151a67ec45441547613
3
+ size 1202681712
checkpoints/checkpoint-pt-5000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:634ae87ad9ec14553a807f970f4e595e3fef7b62fd4afaddf671a76426ff94ed
3
+ size 14344
checkpoints/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "pszemraj/tFINE-base-65kBPE-FLAN",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 2560,
8
+ "d_kv": 64,
9
+ "d_model": 768,
10
+ "decoder_start_token_id": 3,
11
+ "dense_act_fn": "silu",
12
+ "dropout_rate": 0.0,
13
+ "eos_token_id": 2,
14
+ "feed_forward_proj": "gated-silu",
15
+ "initializer_factor": 1.0,
16
+ "is_bf16": true,
17
+ "is_encoder_decoder": false,
18
+ "is_gated_act": true,
19
+ "layer_norm_epsilon": 1e-06,
20
+ "model_type": "t5",
21
+ "num_decoder_layers": 12,
22
+ "num_heads": 12,
23
+ "num_layers": 12,
24
+ "output_past": true,
25
+ "pad_token_id": 3,
26
+ "relative_attention_max_distance": 128,
27
+ "relative_attention_num_buckets": 32,
28
+ "tie_word_embeddings": false,
29
+ "torch_dtype": "float32",
30
+ "transformers_version": "4.44.0",
31
+ "use_cache": true,
32
+ "vocab_size": 48256
33
+ }
checkpoints/main.log ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2024-08-09 08:30:30,106][Main][INFO] - Distributed environment: NO
2
+ Num processes: 1
3
+ Process index: 0
4
+ Local process index: 0
5
+ Device: cuda
6
+
7
+ Mixed precision type: bf16
8
+
9
+ [2024-08-09 08:30:30,106][Main][INFO] - Working directory is /workspace/nanoT5/logs/2024-08-09/08-30-29-
10
+ [2024-08-09 08:38:01,730][Main][INFO] - [train] Step 50 out of 80000 | Loss --> 60.113 | Grad_l2 --> 186.709 | Weights_l2 --> 8624.587 | Lr --> 0.004 | Seconds_per_step --> 8.363 |
11
+ [2024-08-09 08:42:09,928][Main][INFO] - [train] Step 100 out of 80000 | Loss --> 22.120 | Grad_l2 --> 47.074 | Weights_l2 --> 8624.166 | Lr --> 0.004 | Seconds_per_step --> 4.964 |
12
+ [2024-08-09 08:46:13,808][Main][INFO] - [train] Step 150 out of 80000 | Loss --> 12.856 | Grad_l2 --> 28.865 | Weights_l2 --> 8623.587 | Lr --> 0.004 | Seconds_per_step --> 4.878 |
13
+ [2024-08-09 08:50:08,941][Main][INFO] - [train] Step 200 out of 80000 | Loss --> 10.357 | Grad_l2 --> 30.528 | Weights_l2 --> 8623.073 | Lr --> 0.004 | Seconds_per_step --> 4.703 |
14
+ [2024-08-09 08:54:06,924][Main][INFO] - [train] Step 250 out of 80000 | Loss --> 8.792 | Grad_l2 --> 17.202 | Weights_l2 --> 8622.533 | Lr --> 0.004 | Seconds_per_step --> 4.760 |
15
+ [2024-08-09 08:58:12,688][Main][INFO] - [train] Step 300 out of 80000 | Loss --> 7.720 | Grad_l2 --> 12.189 | Weights_l2 --> 8622.034 | Lr --> 0.004 | Seconds_per_step --> 4.915 |
16
+ [2024-08-09 09:02:09,434][Main][INFO] - [train] Step 350 out of 80000 | Loss --> 7.276 | Grad_l2 --> 10.214 | Weights_l2 --> 8621.544 | Lr --> 0.004 | Seconds_per_step --> 4.735 |
17
+ [2024-08-09 09:06:02,511][Main][INFO] - [train] Step 400 out of 80000 | Loss --> 7.054 | Grad_l2 --> 10.111 | Weights_l2 --> 8621.091 | Lr --> 0.004 | Seconds_per_step --> 4.662 |
18
+ [2024-08-09 09:10:08,058][Main][INFO] - [train] Step 450 out of 80000 | Loss --> 6.941 | Grad_l2 --> 9.960 | Weights_l2 --> 8620.672 | Lr --> 0.004 | Seconds_per_step --> 4.911 |
19
+ [2024-08-09 09:14:09,465][Main][INFO] - [train] Step 500 out of 80000 | Loss --> 6.777 | Grad_l2 --> 9.558 | Weights_l2 --> 8620.252 | Lr --> 0.004 | Seconds_per_step --> 4.828 |
20
+ [2024-08-09 09:17:58,397][Main][INFO] - [train] Step 550 out of 80000 | Loss --> 6.730 | Grad_l2 --> 9.024 | Weights_l2 --> 8619.864 | Lr --> 0.004 | Seconds_per_step --> 4.579 |
21
+ [2024-08-09 09:21:50,846][Main][INFO] - [train] Step 600 out of 80000 | Loss --> 6.626 | Grad_l2 --> 7.926 | Weights_l2 --> 8619.457 | Lr --> 0.004 | Seconds_per_step --> 4.649 |
22
+ [2024-08-09 09:25:58,874][Main][INFO] - [train] Step 650 out of 80000 | Loss --> 6.504 | Grad_l2 --> 6.422 | Weights_l2 --> 8619.040 | Lr --> 0.004 | Seconds_per_step --> 4.961 |
23
+ [2024-08-09 09:29:54,562][Main][INFO] - [train] Step 700 out of 80000 | Loss --> 6.425 | Grad_l2 --> 6.909 | Weights_l2 --> 8618.645 | Lr --> 0.004 | Seconds_per_step --> 4.714 |
24
+ [2024-08-09 09:33:50,054][Main][INFO] - [train] Step 750 out of 80000 | Loss --> 6.413 | Grad_l2 --> 6.699 | Weights_l2 --> 8618.254 | Lr --> 0.004 | Seconds_per_step --> 4.710 |
25
+ [2024-08-09 09:37:48,669][Main][INFO] - [train] Step 800 out of 80000 | Loss --> 6.339 | Grad_l2 --> 4.883 | Weights_l2 --> 8617.828 | Lr --> 0.004 | Seconds_per_step --> 4.772 |
26
+ [2024-08-09 09:41:53,765][Main][INFO] - [train] Step 850 out of 80000 | Loss --> 6.305 | Grad_l2 --> 5.402 | Weights_l2 --> 8617.423 | Lr --> 0.004 | Seconds_per_step --> 4.902 |
27
+ [2024-08-09 09:45:52,215][Main][INFO] - [train] Step 900 out of 80000 | Loss --> 6.254 | Grad_l2 --> 5.631 | Weights_l2 --> 8617.040 | Lr --> 0.004 | Seconds_per_step --> 4.769 |
28
+ [2024-08-09 09:49:47,148][Main][INFO] - [train] Step 950 out of 80000 | Loss --> 6.232 | Grad_l2 --> 5.005 | Weights_l2 --> 8616.646 | Lr --> 0.004 | Seconds_per_step --> 4.699 |
29
+ [2024-08-09 09:53:46,382][Main][INFO] - [train] Step 1000 out of 80000 | Loss --> 6.170 | Grad_l2 --> 5.456 | Weights_l2 --> 8616.274 | Lr --> 0.004 | Seconds_per_step --> 4.785 |
30
+ [2024-08-09 09:57:42,782][Main][INFO] - [train] Step 1050 out of 80000 | Loss --> 6.163 | Grad_l2 --> 3.954 | Weights_l2 --> 8615.859 | Lr --> 0.004 | Seconds_per_step --> 4.728 |
31
+ [2024-08-09 10:01:39,784][Main][INFO] - [train] Step 1100 out of 80000 | Loss --> 6.153 | Grad_l2 --> 4.661 | Weights_l2 --> 8615.485 | Lr --> 0.004 | Seconds_per_step --> 4.740 |
32
+ [2024-08-09 10:05:37,074][Main][INFO] - [train] Step 1150 out of 80000 | Loss --> 6.120 | Grad_l2 --> 4.405 | Weights_l2 --> 8615.110 | Lr --> 0.004 | Seconds_per_step --> 4.746 |
33
+ [2024-08-09 10:09:42,375][Main][INFO] - [train] Step 1200 out of 80000 | Loss --> 6.095 | Grad_l2 --> 4.862 | Weights_l2 --> 8614.756 | Lr --> 0.004 | Seconds_per_step --> 4.906 |
34
+ [2024-08-09 10:13:44,826][Main][INFO] - [train] Step 1250 out of 80000 | Loss --> 6.065 | Grad_l2 --> 3.995 | Weights_l2 --> 8614.382 | Lr --> 0.004 | Seconds_per_step --> 4.849 |
35
+ [2024-08-09 10:17:45,169][Main][INFO] - [train] Step 1300 out of 80000 | Loss --> 5.987 | Grad_l2 --> 4.501 | Weights_l2 --> 8614.025 | Lr --> 0.005 | Seconds_per_step --> 4.807 |
36
+ [2024-08-09 10:21:46,890][Main][INFO] - [train] Step 1350 out of 80000 | Loss --> 6.011 | Grad_l2 --> 4.330 | Weights_l2 --> 8613.671 | Lr --> 0.005 | Seconds_per_step --> 4.834 |
37
+ [2024-08-09 10:25:46,445][Main][INFO] - [train] Step 1400 out of 80000 | Loss --> 5.968 | Grad_l2 --> 4.033 | Weights_l2 --> 8613.308 | Lr --> 0.005 | Seconds_per_step --> 4.791 |
38
+ [2024-08-09 10:29:35,135][Main][INFO] - [train] Step 1450 out of 80000 | Loss --> 5.965 | Grad_l2 --> 3.817 | Weights_l2 --> 8612.959 | Lr --> 0.005 | Seconds_per_step --> 4.574 |
39
+ [2024-08-09 10:33:33,627][Main][INFO] - [train] Step 1500 out of 80000 | Loss --> 5.926 | Grad_l2 --> 3.525 | Weights_l2 --> 8612.605 | Lr --> 0.005 | Seconds_per_step --> 4.770 |
40
+ [2024-08-09 10:37:31,600][Main][INFO] - [train] Step 1550 out of 80000 | Loss --> 5.908 | Grad_l2 --> 3.178 | Weights_l2 --> 8612.265 | Lr --> 0.005 | Seconds_per_step --> 4.759 |
41
+ [2024-08-09 10:41:26,179][Main][INFO] - [train] Step 1600 out of 80000 | Loss --> 5.878 | Grad_l2 --> 3.430 | Weights_l2 --> 8611.930 | Lr --> 0.005 | Seconds_per_step --> 4.692 |
42
+ [2024-08-09 10:45:17,990][Main][INFO] - [train] Step 1650 out of 80000 | Loss --> 5.864 | Grad_l2 --> 3.399 | Weights_l2 --> 8611.598 | Lr --> 0.005 | Seconds_per_step --> 4.636 |
43
+ [2024-08-09 10:49:16,915][Main][INFO] - [train] Step 1700 out of 80000 | Loss --> 5.845 | Grad_l2 --> 3.266 | Weights_l2 --> 8611.279 | Lr --> 0.005 | Seconds_per_step --> 4.778 |
44
+ [2024-08-09 10:53:22,739][Main][INFO] - [train] Step 1750 out of 80000 | Loss --> 5.815 | Grad_l2 --> 3.539 | Weights_l2 --> 8610.973 | Lr --> 0.005 | Seconds_per_step --> 4.916 |
45
+ [2024-08-09 10:57:15,819][Main][INFO] - [train] Step 1800 out of 80000 | Loss --> 5.813 | Grad_l2 --> 3.014 | Weights_l2 --> 8610.660 | Lr --> 0.005 | Seconds_per_step --> 4.662 |
46
+ [2024-08-09 11:01:07,812][Main][INFO] - [train] Step 1850 out of 80000 | Loss --> 5.781 | Grad_l2 --> 3.157 | Weights_l2 --> 8610.357 | Lr --> 0.005 | Seconds_per_step --> 4.640 |
47
+ [2024-08-09 11:05:06,130][Main][INFO] - [train] Step 1900 out of 80000 | Loss --> 5.781 | Grad_l2 --> 2.876 | Weights_l2 --> 8610.069 | Lr --> 0.005 | Seconds_per_step --> 4.766 |
48
+ [2024-08-09 11:09:10,053][Main][INFO] - [train] Step 1950 out of 80000 | Loss --> 5.727 | Grad_l2 --> 3.171 | Weights_l2 --> 8609.783 | Lr --> 0.005 | Seconds_per_step --> 4.878 |
49
+ [2024-08-09 11:13:04,823][Main][INFO] - [train] Step 2000 out of 80000 | Loss --> 5.701 | Grad_l2 --> 3.384 | Weights_l2 --> 8609.494 | Lr --> 0.005 | Seconds_per_step --> 4.695 |
50
+ [2024-08-09 11:16:58,015][Main][INFO] - [train] Step 2050 out of 80000 | Loss --> 5.706 | Grad_l2 --> 2.739 | Weights_l2 --> 8609.191 | Lr --> 0.005 | Seconds_per_step --> 4.664 |
51
+ [2024-08-09 11:21:09,220][Main][INFO] - [train] Step 2100 out of 80000 | Loss --> 5.697 | Grad_l2 --> 2.753 | Weights_l2 --> 8608.924 | Lr --> 0.005 | Seconds_per_step --> 5.024 |
52
+ [2024-08-09 11:24:59,988][Main][INFO] - [train] Step 2150 out of 80000 | Loss --> 5.679 | Grad_l2 --> 2.713 | Weights_l2 --> 8608.657 | Lr --> 0.005 | Seconds_per_step --> 4.615 |
53
+ [2024-08-09 11:28:50,211][Main][INFO] - [train] Step 2200 out of 80000 | Loss --> 5.659 | Grad_l2 --> 2.789 | Weights_l2 --> 8608.401 | Lr --> 0.005 | Seconds_per_step --> 4.604 |
54
+ [2024-08-09 11:32:47,428][Main][INFO] - [train] Step 2250 out of 80000 | Loss --> 5.643 | Grad_l2 --> 3.085 | Weights_l2 --> 8608.150 | Lr --> 0.005 | Seconds_per_step --> 4.744 |
55
+ [2024-08-09 11:36:52,444][Main][INFO] - [train] Step 2300 out of 80000 | Loss --> 5.606 | Grad_l2 --> 3.170 | Weights_l2 --> 8607.880 | Lr --> 0.005 | Seconds_per_step --> 4.900 |
56
+ [2024-08-09 11:40:40,829][Main][INFO] - [train] Step 2350 out of 80000 | Loss --> 5.585 | Grad_l2 --> 2.834 | Weights_l2 --> 8607.632 | Lr --> 0.005 | Seconds_per_step --> 4.568 |
57
+ [2024-08-09 11:44:35,220][Main][INFO] - [train] Step 2400 out of 80000 | Loss --> 5.595 | Grad_l2 --> 2.603 | Weights_l2 --> 8607.391 | Lr --> 0.005 | Seconds_per_step --> 4.688 |
58
+ [2024-08-09 11:47:52,825][Main][INFO] - [train] Step 2450 out of 80000 | Loss --> 5.571 | Grad_l2 --> 2.616 | Weights_l2 --> 8607.146 | Lr --> 0.005 | Seconds_per_step --> 3.952 |
59
+ [2024-08-09 11:50:42,712][Main][INFO] - [train] Step 2500 out of 80000 | Loss --> 5.588 | Grad_l2 --> 2.392 | Weights_l2 --> 8606.913 | Lr --> 0.005 | Seconds_per_step --> 3.398 |
60
+ [2024-08-09 11:54:19,840][Main][INFO] - [train] Step 2550 out of 80000 | Loss --> 5.598 | Grad_l2 --> 3.058 | Weights_l2 --> 8606.708 | Lr --> 0.005 | Seconds_per_step --> 4.343 |
61
+ [2024-08-09 11:58:07,896][Main][INFO] - [train] Step 2600 out of 80000 | Loss --> 5.554 | Grad_l2 --> 2.508 | Weights_l2 --> 8606.498 | Lr --> 0.005 | Seconds_per_step --> 4.561 |
62
+ [2024-08-09 12:02:07,989][Main][INFO] - [train] Step 2650 out of 80000 | Loss --> 5.536 | Grad_l2 --> 2.317 | Weights_l2 --> 8606.300 | Lr --> 0.005 | Seconds_per_step --> 4.802 |
63
+ [2024-08-09 12:06:22,355][Main][INFO] - [train] Step 2700 out of 80000 | Loss --> 5.533 | Grad_l2 --> 2.347 | Weights_l2 --> 8606.121 | Lr --> 0.005 | Seconds_per_step --> 5.087 |
64
+ [2024-08-09 12:10:05,296][Main][INFO] - [train] Step 2750 out of 80000 | Loss --> 5.502 | Grad_l2 --> 2.522 | Weights_l2 --> 8605.932 | Lr --> 0.005 | Seconds_per_step --> 4.459 |
65
+ [2024-08-09 12:13:56,942][Main][INFO] - [train] Step 2800 out of 80000 | Loss --> 5.484 | Grad_l2 --> 2.503 | Weights_l2 --> 8605.729 | Lr --> 0.005 | Seconds_per_step --> 4.633 |
66
+ [2024-08-09 12:17:56,310][Main][INFO] - [train] Step 2850 out of 80000 | Loss --> 5.471 | Grad_l2 --> 2.559 | Weights_l2 --> 8605.524 | Lr --> 0.005 | Seconds_per_step --> 4.787 |
67
+ [2024-08-09 12:21:50,249][Main][INFO] - [train] Step 2900 out of 80000 | Loss --> 5.463 | Grad_l2 --> 2.446 | Weights_l2 --> 8605.344 | Lr --> 0.005 | Seconds_per_step --> 4.679 |
68
+ [2024-08-09 12:25:43,300][Main][INFO] - [train] Step 2950 out of 80000 | Loss --> 5.481 | Grad_l2 --> 2.152 | Weights_l2 --> 8605.182 | Lr --> 0.005 | Seconds_per_step --> 4.661 |
69
+ [2024-08-09 12:29:34,779][Main][INFO] - [train] Step 3000 out of 80000 | Loss --> 5.444 | Grad_l2 --> 2.267 | Weights_l2 --> 8605.025 | Lr --> 0.005 | Seconds_per_step --> 4.630 |
70
+ [2024-08-09 12:33:43,889][Main][INFO] - [train] Step 3050 out of 80000 | Loss --> 5.445 | Grad_l2 --> 2.029 | Weights_l2 --> 8604.870 | Lr --> 0.005 | Seconds_per_step --> 4.982 |
71
+ [2024-08-09 12:37:33,552][Main][INFO] - [train] Step 3100 out of 80000 | Loss --> 5.439 | Grad_l2 --> 2.249 | Weights_l2 --> 8604.734 | Lr --> 0.005 | Seconds_per_step --> 4.593 |
72
+ [2024-08-09 12:41:33,458][Main][INFO] - [train] Step 3150 out of 80000 | Loss --> 5.390 | Grad_l2 --> 2.281 | Weights_l2 --> 8604.574 | Lr --> 0.005 | Seconds_per_step --> 4.798 |
73
+ [2024-08-09 12:45:28,169][Main][INFO] - [train] Step 3200 out of 80000 | Loss --> 5.395 | Grad_l2 --> 2.124 | Weights_l2 --> 8604.424 | Lr --> 0.005 | Seconds_per_step --> 4.694 |
74
+ [2024-08-09 12:49:31,716][Main][INFO] - [train] Step 3250 out of 80000 | Loss --> 5.381 | Grad_l2 --> 2.379 | Weights_l2 --> 8604.286 | Lr --> 0.005 | Seconds_per_step --> 4.871 |
75
+ [2024-08-09 12:53:26,686][Main][INFO] - [train] Step 3300 out of 80000 | Loss --> 5.365 | Grad_l2 --> 2.335 | Weights_l2 --> 8604.130 | Lr --> 0.005 | Seconds_per_step --> 4.699 |
76
+ [2024-08-09 12:57:18,564][Main][INFO] - [train] Step 3350 out of 80000 | Loss --> 5.365 | Grad_l2 --> 2.185 | Weights_l2 --> 8603.989 | Lr --> 0.005 | Seconds_per_step --> 4.638 |
77
+ [2024-08-09 13:01:23,837][Main][INFO] - [train] Step 3400 out of 80000 | Loss --> 5.347 | Grad_l2 --> 2.330 | Weights_l2 --> 8603.845 | Lr --> 0.005 | Seconds_per_step --> 4.905 |
78
+ [2024-08-09 13:05:16,575][Main][INFO] - [train] Step 3450 out of 80000 | Loss --> 5.349 | Grad_l2 --> 1.951 | Weights_l2 --> 8603.727 | Lr --> 0.005 | Seconds_per_step --> 4.655 |
79
+ [2024-08-09 13:08:27,542][Main][INFO] - [train] Step 3500 out of 80000 | Loss --> 5.356 | Grad_l2 --> 1.986 | Weights_l2 --> 8603.662 | Lr --> 0.005 | Seconds_per_step --> 3.819 |
80
+ [2024-08-09 13:12:30,541][Main][INFO] - [train] Step 3550 out of 80000 | Loss --> 5.312 | Grad_l2 --> 2.396 | Weights_l2 --> 8603.545 | Lr --> 0.005 | Seconds_per_step --> 4.860 |
81
+ [2024-08-09 13:16:49,213][Main][INFO] - [train] Step 3600 out of 80000 | Loss --> 5.299 | Grad_l2 --> 2.230 | Weights_l2 --> 8603.411 | Lr --> 0.005 | Seconds_per_step --> 5.173 |
82
+ [2024-08-09 13:20:53,058][Main][INFO] - [train] Step 3650 out of 80000 | Loss --> 5.307 | Grad_l2 --> 2.386 | Weights_l2 --> 8603.284 | Lr --> 0.005 | Seconds_per_step --> 4.877 |
83
+ [2024-08-09 13:24:44,487][Main][INFO] - [train] Step 3700 out of 80000 | Loss --> 5.293 | Grad_l2 --> 2.071 | Weights_l2 --> 8603.169 | Lr --> 0.005 | Seconds_per_step --> 4.629 |
84
+ [2024-08-09 13:28:47,607][Main][INFO] - [train] Step 3750 out of 80000 | Loss --> 5.298 | Grad_l2 --> 2.199 | Weights_l2 --> 8603.065 | Lr --> 0.005 | Seconds_per_step --> 4.862 |
85
+ [2024-08-09 13:32:52,512][Main][INFO] - [train] Step 3800 out of 80000 | Loss --> 5.277 | Grad_l2 --> 2.091 | Weights_l2 --> 8602.962 | Lr --> 0.006 | Seconds_per_step --> 4.898 |
86
+ [2024-08-09 13:36:42,719][Main][INFO] - [train] Step 3850 out of 80000 | Loss --> 5.284 | Grad_l2 --> 2.042 | Weights_l2 --> 8602.881 | Lr --> 0.006 | Seconds_per_step --> 4.604 |
87
+ [2024-08-09 13:40:34,318][Main][INFO] - [train] Step 3900 out of 80000 | Loss --> 5.245 | Grad_l2 --> 2.240 | Weights_l2 --> 8602.781 | Lr --> 0.006 | Seconds_per_step --> 4.632 |
88
+ [2024-08-09 13:44:45,754][Main][INFO] - [train] Step 3950 out of 80000 | Loss --> 5.245 | Grad_l2 --> 1.955 | Weights_l2 --> 8602.686 | Lr --> 0.006 | Seconds_per_step --> 5.029 |
89
+ [2024-08-09 13:48:39,099][Main][INFO] - [train] Step 4000 out of 80000 | Loss --> 5.257 | Grad_l2 --> 2.011 | Weights_l2 --> 8602.644 | Lr --> 0.006 | Seconds_per_step --> 4.667 |
90
+ [2024-08-09 13:52:31,353][Main][INFO] - [train] Step 4050 out of 80000 | Loss --> 5.239 | Grad_l2 --> 1.838 | Weights_l2 --> 8602.573 | Lr --> 0.006 | Seconds_per_step --> 4.645 |
91
+ [2024-08-09 13:56:29,186][Main][INFO] - [train] Step 4100 out of 80000 | Loss --> 5.238 | Grad_l2 --> 1.935 | Weights_l2 --> 8602.540 | Lr --> 0.006 | Seconds_per_step --> 4.757 |
92
+ [2024-08-09 14:00:27,682][Main][INFO] - [train] Step 4150 out of 80000 | Loss --> 5.211 | Grad_l2 --> 2.014 | Weights_l2 --> 8602.468 | Lr --> 0.006 | Seconds_per_step --> 4.770 |
93
+ [2024-08-09 14:04:26,879][Main][INFO] - [train] Step 4200 out of 80000 | Loss --> 5.202 | Grad_l2 --> 2.106 | Weights_l2 --> 8602.418 | Lr --> 0.006 | Seconds_per_step --> 4.784 |
94
+ [2024-08-09 14:08:26,097][Main][INFO] - [train] Step 4250 out of 80000 | Loss --> 5.194 | Grad_l2 --> 1.876 | Weights_l2 --> 8602.330 | Lr --> 0.006 | Seconds_per_step --> 4.784 |
95
+ [2024-08-09 14:12:43,883][Main][INFO] - [train] Step 4300 out of 80000 | Loss --> 5.216 | Grad_l2 --> 1.692 | Weights_l2 --> 8602.339 | Lr --> 0.006 | Seconds_per_step --> 5.156 |
96
+ [2024-08-09 14:16:59,892][Main][INFO] - [train] Step 4350 out of 80000 | Loss --> 5.195 | Grad_l2 --> 1.824 | Weights_l2 --> 8602.342 | Lr --> 0.006 | Seconds_per_step --> 5.120 |
97
+ [2024-08-09 14:20:57,072][Main][INFO] - [train] Step 4400 out of 80000 | Loss --> 5.193 | Grad_l2 --> 1.640 | Weights_l2 --> 8602.351 | Lr --> 0.006 | Seconds_per_step --> 4.744 |
98
+ [2024-08-09 14:25:01,683][Main][INFO] - [train] Step 4450 out of 80000 | Loss --> 5.186 | Grad_l2 --> 1.790 | Weights_l2 --> 8602.369 | Lr --> 0.006 | Seconds_per_step --> 4.892 |
99
+ [2024-08-09 14:29:08,638][Main][INFO] - [train] Step 4500 out of 80000 | Loss --> 5.162 | Grad_l2 --> 1.890 | Weights_l2 --> 8602.364 | Lr --> 0.006 | Seconds_per_step --> 4.939 |
100
+ [2024-08-09 14:32:58,390][Main][INFO] - [train] Step 4550 out of 80000 | Loss --> 5.136 | Grad_l2 --> 1.776 | Weights_l2 --> 8602.345 | Lr --> 0.006 | Seconds_per_step --> 4.595 |
101
+ [2024-08-09 14:37:00,248][Main][INFO] - [train] Step 4600 out of 80000 | Loss --> 5.135 | Grad_l2 --> 1.661 | Weights_l2 --> 8602.366 | Lr --> 0.006 | Seconds_per_step --> 4.837 |
102
+ [2024-08-09 14:41:11,560][Main][INFO] - [train] Step 4650 out of 80000 | Loss --> 5.139 | Grad_l2 --> 1.623 | Weights_l2 --> 8602.434 | Lr --> 0.006 | Seconds_per_step --> 5.026 |
103
+ [2024-08-09 14:45:14,951][Main][INFO] - [train] Step 4700 out of 80000 | Loss --> 5.090 | Grad_l2 --> 1.703 | Weights_l2 --> 8602.491 | Lr --> 0.006 | Seconds_per_step --> 4.868 |
104
+ [2024-08-09 14:49:09,655][Main][INFO] - [train] Step 4750 out of 80000 | Loss --> 5.056 | Grad_l2 --> 1.918 | Weights_l2 --> 8602.542 | Lr --> 0.006 | Seconds_per_step --> 4.694 |
105
+ [2024-08-09 14:53:11,228][Main][INFO] - [train] Step 4800 out of 80000 | Loss --> 5.018 | Grad_l2 --> 1.805 | Weights_l2 --> 8602.552 | Lr --> 0.006 | Seconds_per_step --> 4.831 |
106
+ [2024-08-09 14:57:15,004][Main][INFO] - [train] Step 4850 out of 80000 | Loss --> 5.016 | Grad_l2 --> 1.660 | Weights_l2 --> 8602.639 | Lr --> 0.006 | Seconds_per_step --> 4.876 |
107
+ [2024-08-09 15:01:09,698][Main][INFO] - [train] Step 4900 out of 80000 | Loss --> 4.994 | Grad_l2 --> 1.595 | Weights_l2 --> 8602.806 | Lr --> 0.006 | Seconds_per_step --> 4.694 |
108
+ [2024-08-09 15:04:01,695][Main][INFO] - [train] Step 4950 out of 80000 | Loss --> 4.946 | Grad_l2 --> 1.783 | Weights_l2 --> 8602.949 | Lr --> 0.006 | Seconds_per_step --> 3.440 |
109
+ [2024-08-09 15:07:39,946][Main][INFO] - [train] Step 5000 out of 80000 | Loss --> 4.722 | Grad_l2 --> 1.590 | Weights_l2 --> 8603.165 | Lr --> 0.006 | Seconds_per_step --> 4.365 |
110
+ [2024-08-09 15:07:39,947][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-5000
111
+ [2024-08-09 15:07:39,951][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
112
+ [2024-08-09 15:07:46,022][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-5000/model.safetensors
113
+ [2024-08-09 15:07:49,438][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-5000/optimizer.bin
114
+ [2024-08-09 15:07:49,439][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-5000/scheduler.bin
115
+ [2024-08-09 15:07:49,439][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-5000/sampler.bin
116
+ [2024-08-09 15:07:49,439][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-5000/sampler_1.bin
117
+ [2024-08-09 15:07:49,440][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-5000/random_states_0.pkl
118
+ [2024-08-09 15:11:55,741][Main][INFO] - [train] Step 5050 out of 80000 | Loss --> 4.582 | Grad_l2 --> 1.679 | Weights_l2 --> 8603.473 | Lr --> 0.006 | Seconds_per_step --> 5.116 |
119
+ [2024-08-09 15:15:46,314][Main][INFO] - [train] Step 5100 out of 80000 | Loss --> 4.472 | Grad_l2 --> 1.636 | Weights_l2 --> 8603.746 | Lr --> 0.006 | Seconds_per_step --> 4.611 |
120
+ [2024-08-09 15:19:45,374][Main][INFO] - [train] Step 5150 out of 80000 | Loss --> 4.370 | Grad_l2 --> 1.523 | Weights_l2 --> 8604.092 | Lr --> 0.006 | Seconds_per_step --> 4.781 |
121
+ [2024-08-09 15:23:51,223][Main][INFO] - [train] Step 5200 out of 80000 | Loss --> 4.267 | Grad_l2 --> 1.542 | Weights_l2 --> 8604.440 | Lr --> 0.006 | Seconds_per_step --> 4.917 |
122
+ [2024-08-09 15:27:51,655][Main][INFO] - [train] Step 5250 out of 80000 | Loss --> 4.191 | Grad_l2 --> 1.477 | Weights_l2 --> 8604.872 | Lr --> 0.006 | Seconds_per_step --> 4.809 |
123
+ [2024-08-09 15:31:44,251][Main][INFO] - [train] Step 5300 out of 80000 | Loss --> 4.128 | Grad_l2 --> 1.490 | Weights_l2 --> 8605.306 | Lr --> 0.006 | Seconds_per_step --> 4.652 |
124
+ [2024-08-09 15:35:40,470][Main][INFO] - [train] Step 5350 out of 80000 | Loss --> 4.067 | Grad_l2 --> 1.397 | Weights_l2 --> 8605.776 | Lr --> 0.006 | Seconds_per_step --> 4.724 |
125
+ [2024-08-09 15:39:48,973][Main][INFO] - [train] Step 5400 out of 80000 | Loss --> 4.015 | Grad_l2 --> 1.239 | Weights_l2 --> 8606.428 | Lr --> 0.006 | Seconds_per_step --> 4.970 |
126
+ [2024-08-09 15:43:39,070][Main][INFO] - [train] Step 5450 out of 80000 | Loss --> 3.968 | Grad_l2 --> 1.219 | Weights_l2 --> 8607.147 | Lr --> 0.006 | Seconds_per_step --> 4.602 |
127
+ [2024-08-09 15:47:34,049][Main][INFO] - [train] Step 5500 out of 80000 | Loss --> 3.903 | Grad_l2 --> 1.203 | Weights_l2 --> 8607.924 | Lr --> 0.006 | Seconds_per_step --> 4.700 |
128
+ [2024-08-09 15:51:38,499][Main][INFO] - [train] Step 5550 out of 80000 | Loss --> 3.855 | Grad_l2 --> 1.167 | Weights_l2 --> 8608.720 | Lr --> 0.006 | Seconds_per_step --> 4.889 |
129
+ [2024-08-09 15:55:46,120][Main][INFO] - [train] Step 5600 out of 80000 | Loss --> 3.815 | Grad_l2 --> 1.111 | Weights_l2 --> 8609.615 | Lr --> 0.006 | Seconds_per_step --> 4.952 |
130
+ [2024-08-09 15:59:40,828][Main][INFO] - [train] Step 5650 out of 80000 | Loss --> 3.768 | Grad_l2 --> 1.066 | Weights_l2 --> 8610.530 | Lr --> 0.006 | Seconds_per_step --> 4.694 |
131
+ [2024-08-09 16:03:38,938][Main][INFO] - [train] Step 5700 out of 80000 | Loss --> 3.711 | Grad_l2 --> 1.048 | Weights_l2 --> 8611.436 | Lr --> 0.006 | Seconds_per_step --> 4.762 |
132
+ [2024-08-09 16:07:49,871][Main][INFO] - [train] Step 5750 out of 80000 | Loss --> 3.675 | Grad_l2 --> 0.998 | Weights_l2 --> 8612.404 | Lr --> 0.006 | Seconds_per_step --> 5.019 |
133
+ [2024-08-09 16:11:53,420][Main][INFO] - [train] Step 5800 out of 80000 | Loss --> 3.625 | Grad_l2 --> 0.993 | Weights_l2 --> 8613.329 | Lr --> 0.006 | Seconds_per_step --> 4.871 |
134
+ [2024-08-09 16:15:50,534][Main][INFO] - [train] Step 5850 out of 80000 | Loss --> 3.580 | Grad_l2 --> 0.952 | Weights_l2 --> 8614.289 | Lr --> 0.006 | Seconds_per_step --> 4.742 |
135
+ [2024-08-09 16:19:45,983][Main][INFO] - [train] Step 5900 out of 80000 | Loss --> 3.545 | Grad_l2 --> 1.014 | Weights_l2 --> 8615.197 | Lr --> 0.006 | Seconds_per_step --> 4.709 |
136
+ [2024-08-09 16:23:51,342][Main][INFO] - [train] Step 5950 out of 80000 | Loss --> 3.522 | Grad_l2 --> 0.927 | Weights_l2 --> 8616.137 | Lr --> 0.006 | Seconds_per_step --> 4.907 |
137
+ [2024-08-09 16:27:42,121][Main][INFO] - [train] Step 6000 out of 80000 | Loss --> 3.483 | Grad_l2 --> 0.926 | Weights_l2 --> 8617.066 | Lr --> 0.006 | Seconds_per_step --> 4.616 |
138
+ [2024-08-09 16:31:41,278][Main][INFO] - [train] Step 6050 out of 80000 | Loss --> 3.455 | Grad_l2 --> 0.886 | Weights_l2 --> 8617.977 | Lr --> 0.006 | Seconds_per_step --> 4.783 |
139
+ [2024-08-09 16:35:47,786][Main][INFO] - [train] Step 6100 out of 80000 | Loss --> 3.428 | Grad_l2 --> 0.956 | Weights_l2 --> 8618.840 | Lr --> 0.006 | Seconds_per_step --> 4.930 |
140
+ [2024-08-09 16:39:45,096][Main][INFO] - [train] Step 6150 out of 80000 | Loss --> 3.399 | Grad_l2 --> 0.832 | Weights_l2 --> 8619.684 | Lr --> 0.006 | Seconds_per_step --> 4.746 |
141
+ [2024-08-09 16:43:41,554][Main][INFO] - [train] Step 6200 out of 80000 | Loss --> 3.377 | Grad_l2 --> 0.868 | Weights_l2 --> 8620.530 | Lr --> 0.006 | Seconds_per_step --> 4.729 |
142
+ [2024-08-09 16:47:45,442][Main][INFO] - [train] Step 6250 out of 80000 | Loss --> 3.363 | Grad_l2 --> 0.850 | Weights_l2 --> 8621.325 | Lr --> 0.006 | Seconds_per_step --> 4.878 |
143
+ [2024-08-09 16:51:50,312][Main][INFO] - [train] Step 6300 out of 80000 | Loss --> 3.332 | Grad_l2 --> 0.840 | Weights_l2 --> 8622.117 | Lr --> 0.007 | Seconds_per_step --> 4.897 |
144
+ [2024-08-09 16:55:47,619][Main][INFO] - [train] Step 6350 out of 80000 | Loss --> 3.311 | Grad_l2 --> 0.875 | Weights_l2 --> 8622.932 | Lr --> 0.007 | Seconds_per_step --> 4.746 |
145
+ [2024-08-09 16:59:44,744][Main][INFO] - [train] Step 6400 out of 80000 | Loss --> 3.289 | Grad_l2 --> 0.808 | Weights_l2 --> 8623.729 | Lr --> 0.007 | Seconds_per_step --> 4.742 |
146
+ [2024-08-09 17:03:47,092][Main][INFO] - [train] Step 6450 out of 80000 | Loss --> 3.279 | Grad_l2 --> 0.782 | Weights_l2 --> 8624.498 | Lr --> 0.007 | Seconds_per_step --> 4.847 |
147
+ [2024-08-09 17:07:51,580][Main][INFO] - [train] Step 6500 out of 80000 | Loss --> 3.250 | Grad_l2 --> 0.812 | Weights_l2 --> 8625.266 | Lr --> 0.007 | Seconds_per_step --> 4.890 |
148
+ [2024-08-09 17:11:44,444][Main][INFO] - [train] Step 6550 out of 80000 | Loss --> 3.248 | Grad_l2 --> 0.806 | Weights_l2 --> 8626.024 | Lr --> 0.007 | Seconds_per_step --> 4.657 |
149
+ [2024-08-09 17:15:43,498][Main][INFO] - [train] Step 6600 out of 80000 | Loss --> 3.216 | Grad_l2 --> 0.765 | Weights_l2 --> 8626.794 | Lr --> 0.007 | Seconds_per_step --> 4.781 |
150
+ [2024-08-09 17:19:50,311][Main][INFO] - [train] Step 6650 out of 80000 | Loss --> 3.209 | Grad_l2 --> 0.793 | Weights_l2 --> 8627.521 | Lr --> 0.007 | Seconds_per_step --> 4.936 |
151
+ [2024-08-09 17:23:54,093][Main][INFO] - [train] Step 6700 out of 80000 | Loss --> 3.200 | Grad_l2 --> 0.788 | Weights_l2 --> 8628.294 | Lr --> 0.007 | Seconds_per_step --> 4.876 |
152
+ [2024-08-09 17:27:47,402][Main][INFO] - [train] Step 6750 out of 80000 | Loss --> 3.176 | Grad_l2 --> 0.762 | Weights_l2 --> 8629.053 | Lr --> 0.007 | Seconds_per_step --> 4.666 |
153
+ [2024-08-09 17:31:49,523][Main][INFO] - [train] Step 6800 out of 80000 | Loss --> 3.170 | Grad_l2 --> 0.778 | Weights_l2 --> 8629.825 | Lr --> 0.007 | Seconds_per_step --> 4.842 |
154
+ [2024-08-09 17:35:52,826][Main][INFO] - [train] Step 6850 out of 80000 | Loss --> 3.159 | Grad_l2 --> 0.775 | Weights_l2 --> 8630.568 | Lr --> 0.007 | Seconds_per_step --> 4.866 |
155
+ [2024-08-09 17:39:46,125][Main][INFO] - [train] Step 6900 out of 80000 | Loss --> 3.158 | Grad_l2 --> 0.757 | Weights_l2 --> 8631.325 | Lr --> 0.007 | Seconds_per_step --> 4.666 |
156
+ [2024-08-09 17:43:39,817][Main][INFO] - [train] Step 6950 out of 80000 | Loss --> 3.138 | Grad_l2 --> 0.766 | Weights_l2 --> 8632.055 | Lr --> 0.007 | Seconds_per_step --> 4.674 |
157
+ [2024-08-09 17:47:44,929][Main][INFO] - [train] Step 7000 out of 80000 | Loss --> 3.123 | Grad_l2 --> 0.759 | Weights_l2 --> 8632.805 | Lr --> 0.007 | Seconds_per_step --> 4.902 |
158
+ [2024-08-09 17:51:43,866][Main][INFO] - [train] Step 7050 out of 80000 | Loss --> 3.118 | Grad_l2 --> 0.752 | Weights_l2 --> 8633.540 | Lr --> 0.007 | Seconds_per_step --> 4.779 |
159
+ [2024-08-09 17:55:42,820][Main][INFO] - [train] Step 7100 out of 80000 | Loss --> 3.103 | Grad_l2 --> 0.757 | Weights_l2 --> 8634.285 | Lr --> 0.007 | Seconds_per_step --> 4.779 |
160
+ [2024-08-09 17:59:44,322][Main][INFO] - [train] Step 7150 out of 80000 | Loss --> 3.083 | Grad_l2 --> 0.755 | Weights_l2 --> 8635.030 | Lr --> 0.007 | Seconds_per_step --> 4.830 |
161
+ [2024-08-09 18:03:44,919][Main][INFO] - [train] Step 7200 out of 80000 | Loss --> 3.073 | Grad_l2 --> 0.735 | Weights_l2 --> 8635.760 | Lr --> 0.007 | Seconds_per_step --> 4.812 |
162
+ [2024-08-09 18:07:37,774][Main][INFO] - [train] Step 7250 out of 80000 | Loss --> 3.055 | Grad_l2 --> 0.718 | Weights_l2 --> 8636.493 | Lr --> 0.007 | Seconds_per_step --> 4.657 |
163
+ [2024-08-09 18:11:34,198][Main][INFO] - [train] Step 7300 out of 80000 | Loss --> 3.051 | Grad_l2 --> 0.721 | Weights_l2 --> 8637.245 | Lr --> 0.007 | Seconds_per_step --> 4.728 |
164
+ [2024-08-09 18:15:38,927][Main][INFO] - [train] Step 7350 out of 80000 | Loss --> 3.041 | Grad_l2 --> 0.762 | Weights_l2 --> 8637.991 | Lr --> 0.007 | Seconds_per_step --> 4.895 |
165
+ [2024-08-09 18:19:42,181][Main][INFO] - [train] Step 7400 out of 80000 | Loss --> 3.031 | Grad_l2 --> 0.720 | Weights_l2 --> 8638.728 | Lr --> 0.007 | Seconds_per_step --> 4.865 |
166
+ [2024-08-09 18:23:37,911][Main][INFO] - [train] Step 7450 out of 80000 | Loss --> 3.033 | Grad_l2 --> 0.718 | Weights_l2 --> 8639.471 | Lr --> 0.007 | Seconds_per_step --> 4.715 |
167
+ [2024-08-09 18:27:38,146][Main][INFO] - [train] Step 7500 out of 80000 | Loss --> 3.020 | Grad_l2 --> 0.729 | Weights_l2 --> 8640.206 | Lr --> 0.007 | Seconds_per_step --> 4.805 |
168
+ [2024-08-09 18:31:39,590][Main][INFO] - [train] Step 7550 out of 80000 | Loss --> 3.004 | Grad_l2 --> 0.734 | Weights_l2 --> 8640.967 | Lr --> 0.007 | Seconds_per_step --> 4.829 |
169
+ [2024-08-09 18:35:32,805][Main][INFO] - [train] Step 7600 out of 80000 | Loss --> 2.986 | Grad_l2 --> 0.714 | Weights_l2 --> 8641.711 | Lr --> 0.007 | Seconds_per_step --> 4.664 |
170
+ [2024-08-09 18:39:28,080][Main][INFO] - [train] Step 7650 out of 80000 | Loss --> 2.994 | Grad_l2 --> 0.743 | Weights_l2 --> 8642.483 | Lr --> 0.007 | Seconds_per_step --> 4.705 |
171
+ [2024-08-09 18:43:37,815][Main][INFO] - [train] Step 7700 out of 80000 | Loss --> 2.980 | Grad_l2 --> 0.699 | Weights_l2 --> 8643.242 | Lr --> 0.007 | Seconds_per_step --> 4.995 |
172
+ [2024-08-09 18:47:42,799][Main][INFO] - [train] Step 7750 out of 80000 | Loss --> 2.976 | Grad_l2 --> 0.725 | Weights_l2 --> 8643.993 | Lr --> 0.007 | Seconds_per_step --> 4.900 |
173
+ [2024-08-09 18:51:34,464][Main][INFO] - [train] Step 7800 out of 80000 | Loss --> 2.963 | Grad_l2 --> 0.699 | Weights_l2 --> 8644.781 | Lr --> 0.007 | Seconds_per_step --> 4.633 |
174
+ [2024-08-09 18:55:32,534][Main][INFO] - [train] Step 7850 out of 80000 | Loss --> 2.954 | Grad_l2 --> 0.706 | Weights_l2 --> 8645.547 | Lr --> 0.007 | Seconds_per_step --> 4.761 |
175
+ [2024-08-09 18:59:39,507][Main][INFO] - [train] Step 7900 out of 80000 | Loss --> 2.947 | Grad_l2 --> 0.689 | Weights_l2 --> 8646.333 | Lr --> 0.007 | Seconds_per_step --> 4.939 |
176
+ [2024-08-09 19:03:32,747][Main][INFO] - [train] Step 7950 out of 80000 | Loss --> 2.935 | Grad_l2 --> 0.701 | Weights_l2 --> 8647.099 | Lr --> 0.007 | Seconds_per_step --> 4.665 |
177
+ [2024-08-09 19:07:42,994][Main][INFO] - [train] Step 8000 out of 80000 | Loss --> 2.940 | Grad_l2 --> 0.709 | Weights_l2 --> 8647.889 | Lr --> 0.007 | Seconds_per_step --> 5.005 |
178
+ [2024-08-09 19:11:49,930][Main][INFO] - [train] Step 8050 out of 80000 | Loss --> 2.919 | Grad_l2 --> 0.699 | Weights_l2 --> 8648.663 | Lr --> 0.007 | Seconds_per_step --> 4.939 |
179
+ [2024-08-09 19:16:03,022][Main][INFO] - [train] Step 8100 out of 80000 | Loss --> 2.916 | Grad_l2 --> 0.690 | Weights_l2 --> 8649.453 | Lr --> 0.007 | Seconds_per_step --> 5.062 |
180
+ [2024-08-09 19:20:05,203][Main][INFO] - [train] Step 8150 out of 80000 | Loss --> 2.914 | Grad_l2 --> 0.712 | Weights_l2 --> 8650.238 | Lr --> 0.007 | Seconds_per_step --> 4.844 |
181
+ [2024-08-09 19:23:57,007][Main][INFO] - [train] Step 8200 out of 80000 | Loss --> 2.903 | Grad_l2 --> 0.727 | Weights_l2 --> 8651.038 | Lr --> 0.007 | Seconds_per_step --> 4.636 |
182
+ [2024-08-09 19:28:02,052][Main][INFO] - [train] Step 8250 out of 80000 | Loss --> 2.896 | Grad_l2 --> 0.691 | Weights_l2 --> 8651.842 | Lr --> 0.007 | Seconds_per_step --> 4.901 |
183
+ [2024-08-09 19:32:01,708][Main][INFO] - [train] Step 8300 out of 80000 | Loss --> 2.889 | Grad_l2 --> 0.703 | Weights_l2 --> 8652.661 | Lr --> 0.007 | Seconds_per_step --> 4.793 |
184
+ [2024-08-09 19:35:54,542][Main][INFO] - [train] Step 8350 out of 80000 | Loss --> 2.882 | Grad_l2 --> 0.672 | Weights_l2 --> 8653.459 | Lr --> 0.007 | Seconds_per_step --> 4.657 |
185
+ [2024-08-09 19:39:53,565][Main][INFO] - [train] Step 8400 out of 80000 | Loss --> 2.861 | Grad_l2 --> 0.676 | Weights_l2 --> 8654.299 | Lr --> 0.007 | Seconds_per_step --> 4.780 |
186
+ [2024-08-09 19:43:54,929][Main][INFO] - [train] Step 8450 out of 80000 | Loss --> 2.870 | Grad_l2 --> 0.680 | Weights_l2 --> 8655.106 | Lr --> 0.007 | Seconds_per_step --> 4.827 |
187
+ [2024-08-09 19:47:46,390][Main][INFO] - [train] Step 8500 out of 80000 | Loss --> 2.857 | Grad_l2 --> 0.673 | Weights_l2 --> 8655.929 | Lr --> 0.007 | Seconds_per_step --> 4.629 |
188
+ [2024-08-09 19:51:41,774][Main][INFO] - [train] Step 8550 out of 80000 | Loss --> 2.847 | Grad_l2 --> 0.674 | Weights_l2 --> 8656.760 | Lr --> 0.007 | Seconds_per_step --> 4.708 |
189
+ [2024-08-09 19:55:50,508][Main][INFO] - [train] Step 8600 out of 80000 | Loss --> 2.838 | Grad_l2 --> 0.679 | Weights_l2 --> 8657.613 | Lr --> 0.007 | Seconds_per_step --> 4.975 |
190
+ [2024-08-09 19:59:55,899][Main][INFO] - [train] Step 8650 out of 80000 | Loss --> 2.847 | Grad_l2 --> 0.668 | Weights_l2 --> 8658.480 | Lr --> 0.007 | Seconds_per_step --> 4.908 |
191
+ [2024-08-09 20:03:46,940][Main][INFO] - [train] Step 8700 out of 80000 | Loss --> 2.834 | Grad_l2 --> 0.689 | Weights_l2 --> 8659.322 | Lr --> 0.007 | Seconds_per_step --> 4.621 |
192
+ [2024-08-09 20:07:40,599][Main][INFO] - [train] Step 8750 out of 80000 | Loss --> 2.814 | Grad_l2 --> 0.665 | Weights_l2 --> 8660.208 | Lr --> 0.007 | Seconds_per_step --> 4.673 |
193
+ [2024-08-09 20:11:45,521][Main][INFO] - [train] Step 8800 out of 80000 | Loss --> 2.817 | Grad_l2 --> 0.645 | Weights_l2 --> 8661.057 | Lr --> 0.008 | Seconds_per_step --> 4.898 |
194
+ [2024-08-09 20:15:34,178][Main][INFO] - [train] Step 8850 out of 80000 | Loss --> 2.807 | Grad_l2 --> 0.662 | Weights_l2 --> 8661.931 | Lr --> 0.008 | Seconds_per_step --> 4.573 |
195
+ [2024-08-09 20:19:09,957][Main][INFO] - [train] Step 8900 out of 80000 | Loss --> 2.806 | Grad_l2 --> 0.671 | Weights_l2 --> 8662.810 | Lr --> 0.008 | Seconds_per_step --> 4.316 |
196
+ [2024-08-09 20:22:37,497][Main][INFO] - [train] Step 8950 out of 80000 | Loss --> 2.799 | Grad_l2 --> 0.656 | Weights_l2 --> 8663.699 | Lr --> 0.008 | Seconds_per_step --> 4.151 |
197
+ [2024-08-09 20:26:01,302][Main][INFO] - [train] Step 9000 out of 80000 | Loss --> 2.796 | Grad_l2 --> 0.657 | Weights_l2 --> 8664.591 | Lr --> 0.008 | Seconds_per_step --> 4.076 |
198
+ [2024-08-09 20:29:28,057][Main][INFO] - [train] Step 9050 out of 80000 | Loss --> 2.787 | Grad_l2 --> 0.650 | Weights_l2 --> 8665.480 | Lr --> 0.008 | Seconds_per_step --> 4.135 |
199
+ [2024-08-09 20:32:55,736][Main][INFO] - [train] Step 9100 out of 80000 | Loss --> 2.771 | Grad_l2 --> 0.668 | Weights_l2 --> 8666.372 | Lr --> 0.008 | Seconds_per_step --> 4.154 |
200
+ [2024-08-09 20:36:26,470][Main][INFO] - [train] Step 9150 out of 80000 | Loss --> 2.762 | Grad_l2 --> 0.630 | Weights_l2 --> 8667.256 | Lr --> 0.008 | Seconds_per_step --> 4.215 |
201
+ [2024-08-09 20:40:02,302][Main][INFO] - [train] Step 9200 out of 80000 | Loss --> 2.764 | Grad_l2 --> 0.668 | Weights_l2 --> 8668.181 | Lr --> 0.008 | Seconds_per_step --> 4.317 |
202
+ [2024-08-09 20:43:38,319][Main][INFO] - [train] Step 9250 out of 80000 | Loss --> 2.760 | Grad_l2 --> 0.658 | Weights_l2 --> 8669.118 | Lr --> 0.008 | Seconds_per_step --> 4.320 |
203
+ [2024-08-09 20:47:12,593][Main][INFO] - [train] Step 9300 out of 80000 | Loss --> 2.754 | Grad_l2 --> 0.631 | Weights_l2 --> 8670.046 | Lr --> 0.008 | Seconds_per_step --> 4.285 |
204
+ [2024-08-09 20:50:50,547][Main][INFO] - [train] Step 9350 out of 80000 | Loss --> 2.748 | Grad_l2 --> 0.659 | Weights_l2 --> 8670.961 | Lr --> 0.008 | Seconds_per_step --> 4.359 |
205
+ [2024-08-09 20:54:27,164][Main][INFO] - [train] Step 9400 out of 80000 | Loss --> 2.745 | Grad_l2 --> 0.645 | Weights_l2 --> 8671.908 | Lr --> 0.008 | Seconds_per_step --> 4.332 |
206
+ [2024-08-09 20:57:57,318][Main][INFO] - [train] Step 9450 out of 80000 | Loss --> 2.734 | Grad_l2 --> 0.651 | Weights_l2 --> 8672.837 | Lr --> 0.008 | Seconds_per_step --> 4.203 |
207
+ [2024-08-09 21:01:27,114][Main][INFO] - [train] Step 9500 out of 80000 | Loss --> 2.724 | Grad_l2 --> 0.651 | Weights_l2 --> 8673.783 | Lr --> 0.008 | Seconds_per_step --> 4.196 |
208
+ [2024-08-09 21:05:01,540][Main][INFO] - [train] Step 9550 out of 80000 | Loss --> 2.723 | Grad_l2 --> 0.635 | Weights_l2 --> 8674.757 | Lr --> 0.008 | Seconds_per_step --> 4.289 |
209
+ [2024-08-09 21:08:31,178][Main][INFO] - [train] Step 9600 out of 80000 | Loss --> 2.707 | Grad_l2 --> 0.633 | Weights_l2 --> 8675.741 | Lr --> 0.008 | Seconds_per_step --> 4.193 |
210
+ [2024-08-09 21:12:04,549][Main][INFO] - [train] Step 9650 out of 80000 | Loss --> 2.705 | Grad_l2 --> 0.662 | Weights_l2 --> 8676.698 | Lr --> 0.008 | Seconds_per_step --> 4.267 |
211
+ [2024-08-09 21:15:31,359][Main][INFO] - [train] Step 9700 out of 80000 | Loss --> 2.701 | Grad_l2 --> 0.620 | Weights_l2 --> 8677.665 | Lr --> 0.008 | Seconds_per_step --> 4.136 |
212
+ [2024-08-09 21:19:05,681][Main][INFO] - [train] Step 9750 out of 80000 | Loss --> 2.696 | Grad_l2 --> 0.635 | Weights_l2 --> 8678.669 | Lr --> 0.008 | Seconds_per_step --> 4.286 |
213
+ [2024-08-09 21:22:39,126][Main][INFO] - [train] Step 9800 out of 80000 | Loss --> 2.698 | Grad_l2 --> 0.652 | Weights_l2 --> 8679.660 | Lr --> 0.008 | Seconds_per_step --> 4.269 |
214
+ [2024-08-09 21:26:12,926][Main][INFO] - [train] Step 9850 out of 80000 | Loss --> 2.691 | Grad_l2 --> 0.629 | Weights_l2 --> 8680.657 | Lr --> 0.008 | Seconds_per_step --> 4.276 |
215
+ [2024-08-09 21:29:43,650][Main][INFO] - [train] Step 9900 out of 80000 | Loss --> 2.683 | Grad_l2 --> 0.639 | Weights_l2 --> 8681.671 | Lr --> 0.008 | Seconds_per_step --> 4.214 |
216
+ [2024-08-09 21:33:15,612][Main][INFO] - [train] Step 9950 out of 80000 | Loss --> 2.678 | Grad_l2 --> 0.624 | Weights_l2 --> 8682.710 | Lr --> 0.008 | Seconds_per_step --> 4.239 |
217
+ [2024-08-09 21:36:48,784][Main][INFO] - [train] Step 10000 out of 80000 | Loss --> 2.683 | Grad_l2 --> 0.631 | Weights_l2 --> 8683.746 | Lr --> 0.008 | Seconds_per_step --> 4.263 |
218
+ [2024-08-09 21:36:48,785][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-10000
219
+ [2024-08-09 21:36:48,789][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
220
+ [2024-08-09 21:36:50,921][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-10000/model.safetensors
221
+ [2024-08-09 21:36:54,146][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-10000/optimizer.bin
222
+ [2024-08-09 21:36:54,146][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-10000/scheduler.bin
223
+ [2024-08-09 21:36:54,146][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-10000/sampler.bin
224
+ [2024-08-09 21:36:54,147][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-10000/sampler_1.bin
225
+ [2024-08-09 21:36:54,147][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-10000/random_states_0.pkl
226
+ [2024-08-09 21:40:24,314][Main][INFO] - [train] Step 10050 out of 80000 | Loss --> 2.672 | Grad_l2 --> 0.620 | Weights_l2 --> 8684.763 | Lr --> 0.008 | Seconds_per_step --> 4.311 |
227
+ [2024-08-09 21:43:54,934][Main][INFO] - [train] Step 10100 out of 80000 | Loss --> 2.668 | Grad_l2 --> 0.630 | Weights_l2 --> 8685.788 | Lr --> 0.008 | Seconds_per_step --> 4.212 |
228
+ [2024-08-09 21:47:26,893][Main][INFO] - [train] Step 10150 out of 80000 | Loss --> 2.664 | Grad_l2 --> 0.622 | Weights_l2 --> 8686.819 | Lr --> 0.008 | Seconds_per_step --> 4.239 |
229
+ [2024-08-09 21:50:55,047][Main][INFO] - [train] Step 10200 out of 80000 | Loss --> 2.647 | Grad_l2 --> 0.609 | Weights_l2 --> 8687.859 | Lr --> 0.008 | Seconds_per_step --> 4.163 |
230
+ [2024-08-09 21:54:22,462][Main][INFO] - [train] Step 10250 out of 80000 | Loss --> 2.655 | Grad_l2 --> 0.613 | Weights_l2 --> 8688.883 | Lr --> 0.008 | Seconds_per_step --> 4.148 |
231
+ [2024-08-09 21:57:52,835][Main][INFO] - [train] Step 10300 out of 80000 | Loss --> 2.637 | Grad_l2 --> 0.623 | Weights_l2 --> 8689.917 | Lr --> 0.008 | Seconds_per_step --> 4.207 |
232
+ [2024-08-09 22:01:30,833][Main][INFO] - [train] Step 10350 out of 80000 | Loss --> 2.650 | Grad_l2 --> 0.636 | Weights_l2 --> 8690.965 | Lr --> 0.008 | Seconds_per_step --> 4.360 |
233
+ [2024-08-09 22:04:59,449][Main][INFO] - [train] Step 10400 out of 80000 | Loss --> 2.630 | Grad_l2 --> 0.619 | Weights_l2 --> 8691.976 | Lr --> 0.008 | Seconds_per_step --> 4.172 |
234
+ [2024-08-09 22:08:29,303][Main][INFO] - [train] Step 10450 out of 80000 | Loss --> 2.617 | Grad_l2 --> 0.615 | Weights_l2 --> 8693.000 | Lr --> 0.008 | Seconds_per_step --> 4.197 |
235
+ [2024-08-09 22:12:03,306][Main][INFO] - [train] Step 10500 out of 80000 | Loss --> 2.627 | Grad_l2 --> 0.615 | Weights_l2 --> 8694.037 | Lr --> 0.008 | Seconds_per_step --> 4.280 |
236
+ [2024-08-09 22:15:37,789][Main][INFO] - [train] Step 10550 out of 80000 | Loss --> 2.612 | Grad_l2 --> 0.594 | Weights_l2 --> 8695.071 | Lr --> 0.008 | Seconds_per_step --> 4.290 |
237
+ [2024-08-09 22:19:13,830][Main][INFO] - [train] Step 10600 out of 80000 | Loss --> 2.599 | Grad_l2 --> 0.608 | Weights_l2 --> 8696.095 | Lr --> 0.008 | Seconds_per_step --> 4.321 |
238
+ [2024-08-09 22:22:47,537][Main][INFO] - [train] Step 10650 out of 80000 | Loss --> 2.598 | Grad_l2 --> 0.619 | Weights_l2 --> 8697.144 | Lr --> 0.008 | Seconds_per_step --> 4.274 |
239
+ [2024-08-09 22:26:27,089][Main][INFO] - [train] Step 10700 out of 80000 | Loss --> 2.602 | Grad_l2 --> 0.627 | Weights_l2 --> 8698.176 | Lr --> 0.008 | Seconds_per_step --> 4.391 |
240
+ [2024-08-09 22:30:08,291][Main][INFO] - [train] Step 10750 out of 80000 | Loss --> 2.598 | Grad_l2 --> 0.603 | Weights_l2 --> 8699.195 | Lr --> 0.008 | Seconds_per_step --> 4.424 |
241
+ [2024-08-09 22:33:50,515][Main][INFO] - [train] Step 10800 out of 80000 | Loss --> 2.600 | Grad_l2 --> 0.615 | Weights_l2 --> 8700.255 | Lr --> 0.008 | Seconds_per_step --> 4.444 |
242
+ [2024-08-09 22:37:23,733][Main][INFO] - [train] Step 10850 out of 80000 | Loss --> 2.588 | Grad_l2 --> 0.604 | Weights_l2 --> 8701.311 | Lr --> 0.008 | Seconds_per_step --> 4.264 |
243
+ [2024-08-09 22:40:59,607][Main][INFO] - [train] Step 10900 out of 80000 | Loss --> 2.585 | Grad_l2 --> 0.605 | Weights_l2 --> 8702.327 | Lr --> 0.008 | Seconds_per_step --> 4.317 |
244
+ [2024-08-09 22:44:33,158][Main][INFO] - [train] Step 10950 out of 80000 | Loss --> 2.581 | Grad_l2 --> 0.595 | Weights_l2 --> 8703.360 | Lr --> 0.008 | Seconds_per_step --> 4.271 |
245
+ [2024-08-09 22:48:11,589][Main][INFO] - [train] Step 11000 out of 80000 | Loss --> 2.580 | Grad_l2 --> 0.601 | Weights_l2 --> 8704.410 | Lr --> 0.008 | Seconds_per_step --> 4.369 |
246
+ [2024-08-09 22:51:45,840][Main][INFO] - [train] Step 11050 out of 80000 | Loss --> 2.578 | Grad_l2 --> 0.587 | Weights_l2 --> 8705.448 | Lr --> 0.008 | Seconds_per_step --> 4.285 |